In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [11]:
# Load the mushroom dataset; the path is relative to the notebook's working directory.
mushroom = pd.read_csv("mushrooms.csv")

In [12]:
mushroom.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [13]:
le = LabelEncoder()

In [14]:
# Integer-encode every column. The single `le` instance is re-fitted on each column
# in turn, so afterwards it only holds the mapping of the last column processed and
# cannot be used to decode the other columns (including the `type` target).
updated= mushroom.apply(le.fit_transform)

In [15]:
updated.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [16]:
# Separate the encoded target column from the remaining feature columns.
y = updated["type"]
X = updated.drop(columns=["type"])

In [17]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
# (The stray "..." REPL continuation prompt was removed: it only ran because IPython
# strips pasted prompts, and is a syntax error when the cell is exported as plain Python.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [18]:
class NB:
    """Categorical naive Bayes classifier for pandas DataFrames.

    ``fit`` estimates, from plain frequency counts, the class priors and the
    per-class probability of every feature value.  Prediction multiplies the
    prior by the matching conditional probabilities and picks the class with
    the largest product.

    Attributes set by ``fit``:
        model: ``{class: {column: {value: P(value | class)}}}``.
        prior: ``{class: P(class)}``.

    No smoothing is applied, so a value never observed together with a class
    contributes probability 0 for that class.
    """

    def fit(self, X, y):
        """Estimate priors and conditional probabilities from training data.

        Args:
            X: DataFrame of categorical features, one column per feature.
            y: Class labels aligned with the rows of ``X``.
        """
        model = {}
        prior = {}
        n_samples = len(X)

        # np.unique yields the classes in sorted order, which keeps the class
        # order (and therefore tie-breaking in predict_point) deterministic.
        for klass in np.unique(y):
            selected = X[y == klass]
            n_selected = len(selected)
            prior[klass] = n_selected / n_samples
            model[klass] = {}
            for column in X.columns:
                # One counting pass per column instead of one scan per value.
                counts = selected[column].value_counts().to_dict()
                # Every value seen anywhere in the training column gets an entry,
                # with probability 0 when it never occurs in this class.
                model[klass][column] = {
                    value: counts.get(value, 0) / n_selected
                    for value in X[column].unique()
                }

        # Assigned after the loop so the attributes exist even when y is empty.
        self.model = model
        self.prior = prior

    def predict_point(self, point):
        """Return the most probable class label for a single sample.

        Args:
            point: Mapping or Series indexed by the feature column names.

        Returns:
            The class label (a key of ``self.model``) with the highest
            unnormalised posterior.  Ties go to the class stored first.

        Raises:
            ValueError: If the model holds no classes (fitted on empty data).
        """
        model = self.model
        prior = self.prior
        if not model:
            raise ValueError("NB model has no classes; call fit with non-empty data first")

        classes = list(model)
        scores = []
        for klass in classes:
            p = prior[klass]
            for column, value_probs in model[klass].items():
                # A value never seen during fit has no entry; treat it as
                # probability 0 instead of raising KeyError.
                p *= value_probs.get(point[column], 0.0)
            scores.append(p)

        # Map the winning position back to its label: positions only coincide
        # with labels when the classes happen to be exactly 0..k-1.
        return classes[int(np.argmax(scores))]

    def predict(self, X):
        """Predict a class label for every row of ``X``; returns a NumPy array."""
        return np.array([self.predict_point(row) for _, row in X.iterrows()])

    def score(self, X, y):
        """Return the fraction of rows in ``X`` whose prediction equals ``y``."""
        matches = self.predict(X) == np.array(y)
        return np.count_nonzero(matches) / len(y)
            

In [19]:
m = NB()

In [20]:
m.fit(X_train , y_train)

In [22]:
m.predict(X_test)

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [23]:
m.score(X_test,y_test)

0.9973890339425587

# NLTK

In [49]:
from nltk.tokenize import sent_tokenize , word_tokenize

In [50]:
line = " apple is is is a fruit."

In [51]:
sent_tokenize(line)

[' apple is is is a fruit.']

In [52]:
word_tokenize(line)

['apple', 'is', 'is', 'is', 'a', 'fruit', '.']

In [60]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [61]:
ps = PorterStemmer()

In [62]:
ps.stem("going")

'go'

In [63]:
ps.stem("happiness")

'happi'

In [64]:
ps.stem("happy")

'happi'

In [65]:
ps.stem("better")

'better'

In [66]:
lem = WordNetLemmatizer()

In [68]:
lem.lemmatize("better", 'a')

'good'

In [69]:
from nltk.corpus import stopwords

In [73]:
# English stop-word list from NLTK's `stopwords` corpus.
# NOTE(review): the corpus must already be available locally (e.g. via
# nltk.download("stopwords")); no download step is visible in this notebook.
st = stopwords.words("english")

In [75]:
from string import punctuation

In [76]:
# Turn the `string.punctuation` characters into a set of single-character strings.
punc = set(punctuation)

In [77]:
punc

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~'}

In [80]:
# Read the whole speech into memory; the context manager closes the file handle
# even if reading or decoding fails part-way through.
with open("../speech.txt", "r", encoding="utf-8") as speech_file:
    text = speech_file.read()