# Texte

## Initialisierung

In [1]:
import pandas as pd
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
# Fetch the NLTK resources this notebook relies on. Packages that are already
# installed are reported as up-to-date and not downloaded again.
nltk.download('punkt')      # sentence/word tokenizer models (older NLTK releases)
# Newer NLTK releases make word_tokenize load 'punkt_tab' instead of 'punkt';
# without it the tokenisation cell fails with a LookupError on a fresh setup.
nltk.download('punkt_tab')
nltk.download('wordnet')    # dictionary used by WordNetLemmatizer
nltk.download('stopwords')  # stop word lists used during normalisation

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\volkm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\volkm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\volkm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Sample text. Sources:
#   https://en.wikiquote.org/wiki/Ask_a_Ninja
#   https://www.youtube.com/watch?v=kO_u-knoehM
# Adjacent string literals are concatenated by the parser, so this is one
# single string with one sentence per source line.
text = (
    "Can ninjas catch colds? "
    "I guess the better question would be: can colds catch ninjas? "
    "Nope. "
    "We're faster than germs. "
    "So then yes, ninjas can catch colds. "
    "The only disease that a ninja is susceptible to is Saturday Night Fever. "
    "There are two main categories of ninjas skills: Deadly and possibly deadly. "
    "You don't hire ninjas for everyone you need to kill... "
    "that's what Italians are for!"
)
text

"Can ninjas catch colds? I guess the better question would be: can colds catch ninjas? Nope. We're faster than germs. So then yes, ninjas can catch colds. The only disease that a ninja is susceptible to is Saturday Night Fever. There are two main categories of ninjas skills: Deadly and possibly deadly. You don't hire ninjas for everyone you need to kill... that's what Italians are for!"

## Tokenisierung

In [4]:
# Split the text into word and punctuation tokens and keep them in a
# single-column frame that the following cells extend column by column.
df = pd.DataFrame(word_tokenize(text), columns=["token"])
df

Unnamed: 0,token
0,Can
1,ninjas
2,catch
3,colds
4,?
...,...
77,what
78,Italians
79,are
80,for


## Normalisierung

In [5]:
# Stop word sets that were already loaded, keyed by language name. Reading the
# corpus once avoids re-loading it for every single token.
_STOPWORD_CACHE = {}


def _stopword_set(language):
    """Return the NLTK stop word list for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def Normalisierung(token, stop_words=None):
    """Normalise a single token.

    The token is stripped of punctuation and lower-cased. Stop words are
    replaced by the empty string so that the result keeps one entry per
    input token.

    Parameters
    ----------
    token : str
        The token to normalise.
    stop_words : container of str, optional
        Words to drop; anything supporting ``in`` works. When omitted, the
        English stop word list from NLTK is used (loaded once, then cached).

    Returns
    -------
    str
        The normalised token, or ``""`` for stop words and tokens that
        consist of punctuation only.
    """
    if stop_words is None:
        stop_words = _stopword_set('english')

    # Check the unstripped form first: stop word lists contain entries with
    # apostrophes (e.g. "don't") that can never match once punctuation is gone.
    if token.lower() in stop_words:
        return ""

    # Remove punctuation (everything that is neither word character nor
    # whitespace), then lower-case.
    cleaned = re.sub(r'[^\w\s]', '', token).lower()

    # Remove stop words.
    if cleaned in stop_words:
        return ""

    return cleaned

# Normalise every token; stop words and pure punctuation become "".
df["normalisierung"] = [Normalisierung(raw_token) for raw_token in df["token"]]
df

Unnamed: 0,token,normalisierung
0,Can,
1,ninjas,ninjas
2,catch,catch
3,colds,colds
4,?,
...,...,...
77,what,
78,Italians,italians
79,are,
80,for,


## Stemming

In [6]:
# Reduce each normalised token to its stem with the Porter stemmer.
stemmer = PorterStemmer()
df["stemming"] = [stemmer.stem(word) for word in df["normalisierung"]]
df

Unnamed: 0,token,normalisierung,stemming
0,Can,,
1,ninjas,ninjas,ninja
2,catch,catch,catch
3,colds,colds,cold
4,?,,
...,...,...,...
77,what,,
78,Italians,italians,italian
79,are,,
80,for,,


## Lemmatisierung

In [7]:
# Map each normalised token to its WordNet lemma (lemmatizer defaults are used).
lemmatizer = WordNetLemmatizer()
df["lemmatisierung"] = df["normalisierung"].map(lemmatizer.lemmatize)
df

Unnamed: 0,token,normalisierung,stemming,lemmatisierung
0,Can,,,
1,ninjas,ninjas,ninja,ninja
2,catch,catch,catch,catch
3,colds,colds,cold,cold
4,?,,,
...,...,...,...,...
77,what,,,
78,Italians,italians,italian,italian
79,are,,,
80,for,,,


### Unterschied zwischen Stemming und Lemmatisierung

In [8]:
# Show only the rows where the stem and the lemma disagree.
df.loc[df["stemming"].ne(df["lemmatisierung"])]


Unnamed: 0,token,normalisierung,stemming,lemmatisierung
28,yes,yes,ye,yes
37,disease,disease,diseas,disease
42,susceptible,susceptible,suscept,susceptible
53,categories,categories,categori,category
58,Deadly,deadly,deadli,deadly
60,possibly,possibly,possibl,possibly
61,deadly,deadly,deadli,deadly
69,everyone,everyone,everyon,everyone
