# Bag of Words

## Initialisierung

In [17]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [11]:
# Fetch the NLTK data packages for this notebook. The recorded output below
# reports each package as already up-to-date on the author's machine.
nltk.download('punkt')      # tokenizer data — presumably what word_tokenize relies on
nltk.download('wordnet')    # NOTE(review): no visible cell uses WordNet (stemming is done with PorterStemmer) — confirm it is still needed
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords in prepareText

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\volkm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\volkm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\volkm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def prepareText(text):
    """Normalise a raw text into a space-separated string of stemmed tokens.

    The text is tokenised with NLTK's ``word_tokenize``. Every token then has
    its punctuation characters removed and is lower-cased. English stop words
    and tokens that end up empty (pure punctuation such as "?" or ".") are
    dropped; the remaining tokens are reduced with the Porter stemmer.

    Parameters
    ----------
    text : str
        The raw input text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" if nothing remains).
    """
    stemmer = PorterStemmer()
    # Build the stop-word set once per call: the corpus is no longer re-read
    # for every token, and membership tests on a set are constant-time.
    stop_words = set(stopwords.words('english'))

    tokens = []
    for token in word_tokenize(text):
        # Strip punctuation, then lower-case.
        token = re.sub(r'[^\w\s]', '', token).lower()
        # Punctuation-only tokens are empty at this point; appending them
        # would only inject extra spaces into the joined result.
        if not token or token in stop_words:
            continue
        tokens.append(stemmer.stem(token))

    return " ".join(tokens)

In [13]:
# Toy corpus: five short sentences, one per row, in a single "text" column.
df = pd.DataFrame(
    data=[
        "Can ninjas catch colds? I guess the better question would be: can colds catch ninjas?",
        "Nope. We're faster than germs. So then yes, ninjas can catch colds.",
        "The only disease that a ninja is susceptible to is Saturday Night Fever.",
        "There are two main categories of ninjas skills: Deadly and possibly deadly.",
        "You don't hire ninjas for everyone you need to kill... that's what Italians are for!",
    ],
    columns=["text"],
)
df


Unnamed: 0,text
0,Can ninjas catch colds? I guess the better que...
1,"Nope. We're faster than germs. So then yes, ni..."
2,The only disease that a ninja is susceptible t...
3,There are two main categories of ninjas skills...
4,You don't hire ninjas for everyone you need to...


In [14]:
# Store the preprocessed form of every sentence next to the raw text;
# the "prepared" column is vectorised in a later section.
df["prepared"] = df["text"].apply(prepareText)
df

Unnamed: 0,text,prepared
0,Can ninjas catch colds? I guess the better que...,ninja catch cold guess better question would ...
1,"Nope. We're faster than germs. So then yes, ni...",nope faster germ ye ninja catch cold
2,The only disease that a ninja is susceptible t...,diseas ninja suscept saturday night fever
3,There are two main categories of ninjas skills...,two main categori ninja skill deadli possibl ...
4,You don't hire ninjas for everyone you need to...,nt hire ninja everyon need kill italian


## Bag of Words für die Textspalte

In [20]:
# Fit a bag-of-words model on the raw sentences. No stop-word removal or
# stemming is involved here, which is why the vocabulary below still contains
# words like "the" and both "ninja" and "ninjas".
count = CountVectorizer()
bow = count.fit_transform(df["text"])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the plain-list display.
count.get_feature_names_out().tolist()


['and',
 'are',
 'be',
 'better',
 'can',
 'catch',
 'categories',
 'colds',
 'deadly',
 'disease',
 'don',
 'everyone',
 'faster',
 'fever',
 'for',
 'germs',
 'guess',
 'hire',
 'is',
 'italians',
 'kill',
 'main',
 'need',
 'night',
 'ninja',
 'ninjas',
 'nope',
 'of',
 'only',
 'possibly',
 'question',
 're',
 'saturday',
 'skills',
 'so',
 'susceptible',
 'than',
 'that',
 'the',
 'then',
 'there',
 'to',
 'two',
 'we',
 'what',
 'would',
 'yes',
 'you']

In [19]:
# Show the document-term matrix as a table: one row per sentence, one column
# per vocabulary word, with the raw sentence prepended for reference.
# NOTE: `count` and `bow` are re-assigned in the "prepared" section further
# down, so this cell must run right after the raw-text vectorizer cell.
# toarray() is the documented way to densify the sparse result (the `.A`
# shorthand is not available in all SciPy versions); get_feature_names_out()
# replaces get_feature_names(), which was removed in scikit-learn 1.2.
df2 = pd.DataFrame(bow.toarray(), columns=count.get_feature_names_out())
df2.insert(0, "text", df.text)
df2


Unnamed: 0,text,and,are,be,better,can,catch,categories,colds,deadly,...,the,then,there,to,two,we,what,would,yes,you
0,Can ninjas catch colds? I guess the better que...,0,0,1,1,2,2,0,2,0,...,1,0,0,0,0,0,0,1,0,0
1,"Nope. We're faster than germs. So then yes, ni...",0,0,0,0,1,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
2,The only disease that a ninja is susceptible t...,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
3,There are two main categories of ninjas skills...,1,1,0,0,0,0,1,0,2,...,0,0,1,0,1,0,0,0,0,0
4,You don't hire ninjas for everyone you need to...,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,2


## Bag of Words für die Prepared-Spalte

In [21]:
# Fit a second bag-of-words model, this time on the preprocessed sentences.
# This re-assigns `count` and `bow` from the raw-text section.
count = CountVectorizer()
bow = count.fit_transform(df["prepared"])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the plain-list display.
count.get_feature_names_out().tolist()


['better',
 'catch',
 'categori',
 'cold',
 'deadli',
 'diseas',
 'everyon',
 'faster',
 'fever',
 'germ',
 'guess',
 'hire',
 'italian',
 'kill',
 'main',
 'need',
 'night',
 'ninja',
 'nope',
 'nt',
 'possibl',
 'question',
 'saturday',
 'skill',
 'suscept',
 'two',
 'would',
 'ye']

In [22]:
# Document-term matrix for the preprocessed sentences: one row per sentence,
# one column per stemmed vocabulary entry, with the raw sentence prepended.
# toarray() is the documented way to densify the sparse result (the `.A`
# shorthand is not available in all SciPy versions); get_feature_names_out()
# replaces get_feature_names(), which was removed in scikit-learn 1.2.
df3 = pd.DataFrame(bow.toarray(), columns=count.get_feature_names_out())
df3.insert(0, "text", df.text)
df3


Unnamed: 0,text,better,catch,categori,cold,deadli,diseas,everyon,faster,fever,...,nope,nt,possibl,question,saturday,skill,suscept,two,would,ye
0,Can ninjas catch colds? I guess the better que...,1,2,0,2,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,"Nope. We're faster than germs. So then yes, ni...",0,1,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2,The only disease that a ninja is susceptible t...,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,1,0,0,0
3,There are two main categories of ninjas skills...,0,0,1,0,2,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
4,You don't hire ninjas for everyone you need to...,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
