In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

In [29]:
# Load the three datasets used in this notebook: iris comes from seaborn's
# bundled datasets, mushroom and census are read from local CSV files.
iris = sns.load_dataset(name='iris')
mushroom, census = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/mushroom.csv', './data/census.csv')
)

In [30]:
# Preview the first five census rows.
census.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Preview the first five iris rows.
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Hold out 15% of iris for testing; the split is seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    iris.drop(columns='species'),
    iris['species'],
    test_size=0.15,
    random_state=10,
)

In [5]:
# Sanity-check the split sizes: (test shape, train shape).
tuple(part.shape for part in (x_test, x_train))

((23, 4), (127, 4))

In [139]:
# Reference model: AdaBoost with default hyper-parameters on the iris split.
# random_state is pinned because the underlying decision trees permute the
# features at each split, so ties between equally good splits could otherwise
# be broken differently from run to run.
ada = AdaBoostClassifier(random_state=10)
ada.fit(x_train,y_train)
ada_preds = ada.predict(x_test)
print('ada can do: {:.1%}'.format(accuracy_score(y_test,ada_preds)))
print('---------')
confusion_matrix(y_test,ada_preds)

ada can do: 95.7%
---------


array([[8, 0, 0],
       [0, 9, 0],
       [0, 1, 5]])

In [158]:
class NaiveBayes(object):
    """Naive Bayes classifier for pandas data mixing numeric and categorical columns.

    Columns named in ``cat_list`` are modelled with Laplace-smoothed categorical
    likelihoods; every other column is modelled with a per-class Gaussian.
    All quantities are stored and combined in log space.

    Parameters
    ----------
    cat_list : list-like of column labels
        Columns of ``x`` to treat as categorical. Only membership tests
        (``label in cat_list``) are performed on it.
    lap : float, default 1e-7
        Additive (Laplace) smoothing constant used for the class priors and
        the categorical likelihoods.

    Attributes
    ----------
    stats : dict
        ``stats['tgt']['p_<label>']``          -> log prior of each class
        ``stats['num'][label][column]``        -> ``{'mean': ..., 'std': ...}``
        ``stats['cat'][label][column][level]`` -> log likelihood of ``level``
    """

    # Lower bound applied to a class standard deviation inside cal_density so
    # that constant (std == 0) or single-sample (std is NaN) features do not
    # produce a division by zero or NaN scores.
    _MIN_STD = 1e-9

    def __init__(self,cat_list,lap=0.0000001):
        super(NaiveBayes,self).__init__()
        self.stats = {'tgt':{},'num':{},'cat':{}}
        self.lap = lap
        self.cat_list = cat_list
        # (class label, column) -> log likelihood used for a category that
        # was never observed during fit.
        self._unseen = {}

    def fit(self,x,y):
        """Estimate priors and per-class feature statistics.

        Parameters
        ----------
        x : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Class labels; its index must align with ``x``'s index.

        Returns
        -------
        self
        """
        # Start from a clean slate so refitting does not keep stale entries.
        self.stats = {'tgt':{},'num':{},'cat':{}}
        self._unseen = {}

        classes = y.unique()
        tgt_size = len(classes)
        n_total = len(y)
        cat_cols = [col for col in x.columns if col in self.cat_list]
        num_cols = [col for col in x.columns if col not in self.cat_list]
        # Levels are taken over the whole training column so that every class
        # gets an entry (possibly with a zero count) for every observed level.
        levels = {col: x[col].unique() for col in cat_cols}

        # With lap == 0 a zero count legitimately yields log(0) == -inf;
        # silence the corresponding numpy warning.
        with np.errstate(divide='ignore'):
            for v in classes:
                in_class = (y == v)
                n_class = int(in_class.sum())
                self.stats['tgt']['p_{}'.format(v)] = np.log(
                    (n_class + self.lap) / (n_total + self.lap * tgt_size))

                self.stats['num'][v] = {}
                for col in num_cols:
                    values = x.loc[in_class, col]
                    self.stats['num'][v][col] = {'mean': values.mean(),
                                                 'std': values.std()}

                self.stats['cat'][v] = {}
                for col in cat_cols:
                    # One pass over the class rows instead of one scan per level.
                    counts = x.loc[in_class, col].value_counts().to_dict()
                    # Smoothing must add `lap` once per *level of this feature*
                    # for the likelihoods to sum to one.
                    denom = n_class + self.lap * len(levels[col])
                    self.stats['cat'][v][col] = {
                        level: np.log((counts.get(level, 0) + self.lap) / denom)
                        for level in levels[col]
                    }
                    self._unseen[(v, col)] = np.log(self.lap / denom)
        return self

    def cal_density(self,x,std,mu):
        """Return the log of the Gaussian density N(x; mu, std**2).

        The value is computed directly in log space; taking ``log(exp(...))``
        underflows to ``-inf`` for points far from the mean. ``std`` values
        that are not greater than ``_MIN_STD`` (including NaN) are replaced
        by ``_MIN_STD``.
        """
        std = np.where(np.asarray(std, dtype=float) > self._MIN_STD,
                       std, self._MIN_STD)
        return -0.5*np.log(2*np.pi) - np.log(std) - ((x-mu)**2)/(2*(std**2))

    def _class_scores(self,x,verbose=False):
        """Compute the unnormalised log posterior of every class for one row.

        Parameters
        ----------
        x : pandas.Series
            One observation, indexed by column label. Labels that were not
            seen during fit are ignored.
        verbose : bool
            When true, print each categorical term and the per-class
            log-likelihood sum (before the prior is added).

        Returns
        -------
        (labels, scores) : two parallel lists.
        """
        labels = []
        scores = []
        for v in self.stats['num'].keys():
            num_stats = self.stats['num'][v]
            cat_stats = self.stats['cat'][v]
            # Log-likelihoods are summed, so the neutral start value is 0.
            score = 0.0
            for vx in x.index:
                if vx in num_stats:
                    score = score + self.cal_density(
                        x[vx], num_stats[vx]['std'], num_stats[vx]['mean'])
                elif vx in cat_stats:
                    # Fall back to the smoothed zero-count likelihood for a
                    # level that never appeared in the training data.
                    term = cat_stats[vx].get(x[vx], self._unseen[(v, vx)])
                    score = score + term
                    if verbose:
                        print(vx, term)
            if verbose:
                print('------', score)
            labels.append(v)
            scores.append(score + self.stats['tgt']['p_{}'.format(v)])
        return labels, scores

    def predict_instance(self,x):
        """Return the most probable class label for a single row ``x``."""
        labels, results = self._class_scores(x)
        return labels[np.argmax(results)]

    def predict_instance_checker(self,x):
        """Debug variant of ``predict_instance``.

        Prints the categorical terms and per-class log-likelihood sums, and
        returns ``(predicted label, list of per-class log posterior scores)``.
        """
        labels, results = self._class_scores(x, verbose=True)
        return labels[np.argmax(results)],results

    def predict(self,x):
        """Predict a class label for every row of the DataFrame ``x``."""
        return x.apply(self.predict_instance,axis=1)

In [157]:
#%%time
# Custom Naive Bayes on iris: no categorical columns, so every feature is
# modelled as a per-class Gaussian.
nb = NaiveBayes(cat_list=[])
nb.fit(x_train,y_train)
nb_preds = nb.predict(x_test)
print('nb can do: {:.1%}'.format(accuracy_score(y_test,nb_preds)))
# The reference figure is the training prior of the first class label stored
# by fit; it is formatted as a percentage like the accuracy above.
print('while, by guessing randomly we would get something like: {:.1%}'.format(
      np.exp(nb.stats['tgt'][list(nb.stats['tgt'].keys())[0]])))
print('---------')
confusion_matrix(y_test,nb_preds)

nb can do: 100.0%
while, by guessing randomly we would get something like: 0.33070866142352284
---------


array([[8, 0, 0],
       [0, 9, 0],
       [0, 0, 6]])

In [131]:
# scikit-learn's Gaussian Naive Bayes on the same iris split, for comparison.
snb = GaussianNB().fit(x_train, y_train)
snb_preds = snb.predict(x_test)
print('snb can do: {:.1%}'.format(accuracy_score(y_true=y_test, y_pred=snb_preds)))
print('---------')
confusion_matrix(y_true=y_test, y_pred=snb_preds)

snb can do: 100.0%
---------


array([[8, 0, 0],
       [0, 9, 0],
       [0, 0, 6]])

## Mushroom dataset

In [63]:
# Work on a separate copy so the raw mushroom frame stays untouched.
mushroom_ = mushroom.copy(deep=True)

In [64]:
# Integer-encode every column of the raw frame (features and target alike).
# The same encoder object is re-fitted for each column, so after the loop
# `lbl` only remembers the classes of the last column processed.
lbl = LabelEncoder()
for col in list(mushroom_):
    mushroom_[col] = lbl.fit(mushroom[col]).transform(mushroom[col])

In [65]:
# Hold out 15% of the encoded mushroom data for testing (seeded split).
mx_train, mx_test, my_train, my_test = train_test_split(
    mushroom_.drop(columns='target'),
    mushroom_['target'],
    test_size=0.15,
    random_state=10,
)

In [133]:
# scikit-learn's Gaussian Naive Bayes on the label-encoded mushroom features.
snb = GaussianNB().fit(mx_train, my_train)
snb_preds = snb.predict(mx_test)
print('snb can do: {:.1%}'.format(accuracy_score(y_true=my_test, y_pred=snb_preds)))
print('---------')
confusion_matrix(y_true=my_test, y_pred=snb_preds)

snb can do: 93.1%
---------


array([[600,  42],
       [ 42, 535]])

In [140]:
# Reference model: AdaBoost with default hyper-parameters on the mushroom split.
# random_state is pinned because the underlying decision trees permute the
# features at each split, so ties between equally good splits could otherwise
# be broken differently from run to run.
ada = AdaBoostClassifier(random_state=10)
ada.fit(mx_train,my_train)
ada_preds = ada.predict(mx_test)
print('ada can do: {:.1%}'.format(accuracy_score(my_test,ada_preds)))
print('---------')
confusion_matrix(my_test,ada_preds)

ada can do: 100.0%
---------


array([[642,   0],
       [  0, 577]])

In [153]:
# Every mushroom feature was label-encoded above, so all feature columns are
# treated as categorical. The list is taken from the training frame itself
# rather than from a positional slice of the raw frame, which would silently
# misclassify a column if the target were not the first one.
nb = NaiveBayes(cat_list=list(mx_train.columns),lap=0.0000001)
nb.fit(mx_train,my_train)
nb_preds = nb.predict(mx_test)
print('nb can do: {:.1%}'.format(accuracy_score(my_test,nb_preds)))
# Reference figure: training prior of the first class label stored by fit.
print('while, by guessing randomly we would get something like: {:.1%}'.format(
      np.exp(nb.stats['tgt'][list(nb.stats['tgt'].keys())[0]])))
print('-------------')
confusion_matrix(my_test,nb_preds)

nb can do: 99.8%
while, by guessing randomly we would get something like: 51.6%
-------------


array([[640,   2],
       [  0, 577]])

## Census dataset

In [73]:
# Preview the first five census rows before preprocessing.
census.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [74]:
# Census columns that are label-encoded below and passed to NaiveBayes as
# categorical features.
cat_list = ['workclass','education','marital-status','occupation','relationship',
           'race','sex','native-country']
# Census columns that are log1p-transformed below; NaiveBayes treats any column
# not in cat_list as numeric.
num_list = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']

In [75]:
# Build the modelling frame from a copy so the raw census frame stays intact.
census_ = census.copy()
# Numeric columns are log1p-transformed.
for col in num_list:
    census_[col] = np.log1p(census[col])
# Categorical columns are integer-encoded. A fresh encoder is created for each
# column so this cell does not depend on an encoder object created in an
# earlier section of the notebook.
for col in cat_list:
    census_[col] = LabelEncoder().fit_transform(census[col])
# Hold out 15% for testing (seeded split).
cx_train,cx_test,cy_train,cy_test = train_test_split(census_.drop('target',axis=1),census_.target,
                                                    random_state=100,test_size=0.15)

In [154]:
# Custom Naive Bayes on census: columns in cat_list are categorical, the
# remaining feature columns are modelled as per-class Gaussians.
nb = NaiveBayes(cat_list=cat_list)
nb.fit(cx_train, cy_train)
nb_preds = nb.predict(cx_test)
print('nb can do: {:.1%}'.format(accuracy_score(y_true=cy_test, y_pred=nb_preds)))
# Reference figure: training prior of the first class label stored by fit.
print('while, by guessing randomly we would get something like: {:.1%}'.format(
    np.exp(nb.stats['tgt'][next(iter(nb.stats['tgt']))])))
print('--------')
print(confusion_matrix(y_true=cy_test, y_pred=nb_preds))

nb can do: 81.9%
while, by guessing randomly we would get something like: 75.9%
--------
[[3181  545]
 [ 339  820]]


In [155]:
# scikit-learn's Gaussian Naive Bayes on the preprocessed census features.
snb = GaussianNB().fit(cx_train, cy_train)
snb_preds = snb.predict(cx_test)
print('snb can do: {:.1%}'.format(accuracy_score(y_true=cy_test, y_pred=snb_preds)))
print('--------')
print(confusion_matrix(y_true=cy_test, y_pred=snb_preds))

snb can do: 79.6%
--------
[[3086  640]
 [ 356  803]]


In [156]:
# Reference model: AdaBoost with default hyper-parameters on the census split.
# random_state is pinned because the underlying decision trees permute the
# features at each split, so ties between equally good splits could otherwise
# be broken differently from run to run.
ada = AdaBoostClassifier(random_state=100)
ada.fit(cx_train,cy_train)
ada_preds = ada.predict(cx_test)
print('ada can do: {:.1%}'.format(accuracy_score(cy_test,ada_preds)))
print('---------')
confusion_matrix(cy_test,ada_preds)

ada can do: 86.0%
---------


array([[3498,  228],
       [ 457,  702]])