In [201]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score 
from sklearn.naive_bayes import MultinomialNB
from __future__ import division
from collections import defaultdict
from math import log
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [144]:
class Bayes:
    
    def __init__(self):
        self._classes = None
        self._freq = None
    
    def fit(self, samples):
        classes, freq = defaultdict(lambda:0), defaultdict(lambda:0)
        for feats, label in samples:
            classes[label] += 1         
            for feat in feats:
                freq[label, feat] += 1          

        for label, feat in freq:                
            freq[label, feat] /= classes[label]
        for c in classes:                       
            classes[c] /= len(samples)

        self._classes = classes
        self._freq = freq
    
    def predict(self, feats): 
        return min(self._classes.keys(),              
            key = lambda cl: -log(self._classes[cl]) + \
                sum(-log(self._freq.get((cl,feat), 10**(-7))) for feat in feats))
    
    def cross_val(self, values,n=5):
        lol = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
        val = lol(values,round(len(values)/n))
        accuracy, f1  = [], []
        for i in range(n):
            
            train_val = []
            for j in range(n):
                if j!=i:
                    train_val += val[j]
            self.fit(train_val)
            output = []
            check = [] 
            for l in range(len(val[i])):
                output.append(self.predict(val[i][l][0]))
                check.append(val[i][l][1])   
            output, check = np.array(output), np.array(check)   

            f1.append(f1_score(check,output, average='macro'))
            accuracy.append(accuracy_score(check,output))
        print('Accurace_score: {}'.format(accuracy))
        print('f1_score: {}'.format(f1))


In [202]:
df=pd.read_table('SMSSpamCollection',header=None)
df.rename(columns = {0: 'label', 1: 'message'}, inplace = True)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [204]:
samples = []
for i in range(df.shape[0]):
    samples.append((re.findall(r"[\w']+",  df['message'][i]),df['label'][i]))

In [209]:
A = Bayes()
A.cross_val(samples)

Accurace_score: [0.981149012567325, 0.9883303411131059, 0.9856373429084381, 0.9757630161579892, 0.9847396768402155]
f1_score: [0.9634680521135405, 0.9740031270924323, 0.9664982557440154, 0.9492757655939329, 0.966204532354458]


In [221]:
train = df.loc[np.array([i for i in range(df.shape[0])])]
train.reset_index(inplace=True)
train.drop(['index'],axis=1,inplace=True)

clf = MultinomialNB()
clf=Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

cross_val_score(clf,train['message'],train['label'], cv=5)

array([0.96143498, 0.95515695, 0.95780969, 0.9551167 , 0.96050269])