In [1]:
from collections import Counter
from math import log
import numpy as np

## Создал кастомный класс 

In [2]:
class naive_bais:
    clasFreq = dict()
    wordProbs = {}
    classes = []
    
    # Counts words in each class and returns dictionary [class:Counter for words in class]
    def count(self,X,Y):
        wordDict = dict()
        for clas in self.classes:
            wordCounter = Counter()
            for i in range(len(X)):
                if Y[i] == clas:
                    text = X[i].lower().split()
                    wordCounter.update(text)
            wordDict.update({clas:dict(wordCounter)})
        return wordDict

    def fit(self,X, Y):
        # classes
        classes = Counter()
        classes.update(Y)
        self.classes = list(classes)
        clasFreq = classes.most_common()
        for i in range(len(clasFreq)):
            clasFreq[i] = (clasFreq[i][0], float(clasFreq[i][1]) / len(Y))
        clasFreq = dict(clasFreq)

        
        words = self.count(X,Y)

        # word frequency in classes
        wordProbs = {}
        self.unique_words = len(words.get("ham",[])) + len(words.get("spam",[]))
        for clas in self.classes:
            totalWords = sum(words[clas].values())
            wordProb = {}
            for word in words[clas]:
                wordProb.update([(word,log( ( words[clas][word]+1 )/ (self.unique_words + totalWords) ))])
            wordProbs.update({clas: wordProb})
        self.clasFreq = clasFreq
        self.wordProbs = wordProbs
        self.words =words

    def predict1(self, text):
        # splitting
        temp = text.split()
        bestClass = self.classes[0]
        bestScore = -999999999.
        for clas in self.classes:
            score = log(self.clasFreq[clas])
            for word in temp:
                if word in self.wordProbs[clas]:
                    prob = self.wordProbs[clas][word]
                else:
                    prob = -log( self.unique_words + sum(self.words[clas].values()))
                score += prob
            if score > bestScore:
                bestClass = clas
                bestScore = score
        return bestClass

    def predict(self,arr):
        ans = []
        for text in arr:
            ans.append(self.predict1(text))
        return ans
    def get_params(self, deep=True): # Сделано только что бы cross_val_score работал
        return dict()

In [3]:
# parsing
file = open("SMSSpamCollection", encoding="utf8")
file = file.read()
file = file.translate({ord(c): None for c in '1234567890.,-:!?"\'/«»„“();'})
lines = file.split('\n')
data = []
for i in range(len(lines)):
    data.append(lines[i].split('\t'))
data.pop()
X = []
Y = []
for row in data:
    Y.append(row[0])
    X.append(row[1])

In [4]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = \
    train_test_split(X,Y,test_size=0.33)


## Протестировал на работоспособность

In [5]:
clf = naive_bais()
clf.fit(x_train,y_train)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,clf.predict(x_test)))

0.9141304347826087


## Для сравнения запустил аналог из Sklearn 

In [6]:
from sklearn import naive_bayes
skclf = naive_bayes.MultinomialNB()

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X)

# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

x_trainT,x_testT,y_trainT,y_testT = \
    train_test_split(X_train_counts,Y,test_size=0.33)
skclf.fit(x_trainT,y_trainT)
print(accuracy_score(y_testT,skclf.predict(x_testT)))

0.9695652173913043


## Сравним результаты классификаторов на кросс валидации с 5 делениями по точности

In [8]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X,Y,scoring = 'accuracy',cv=5)
print("My NB\nAccuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
scores = cross_val_score(skclf,X_train_counts,Y,scoring = 'accuracy',cv=5)
print("Sklearn NB\nAccuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


My NB
Accuracy: 0.92 (+/- 0.01)
Sklearn NB
Accuracy: 0.97 (+/- 0.00)


## Так как выборка несбалансированна (14:86) сравним результаты классификаторов на кросс валидации с 5 делениями по f1 score которая более показательна
#### Выбрал F1 а не roc-auc ,так как он более внимателен к ошибкам нежели к попаданиям

In [9]:
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import label_binarize
import numpy as np
encoded_column_vector = label_binarize(Y, classes=['ham','spam']) # ham will be 0 and spam will be 1
Yb = np.ravel(encoded_column_vector)

In [10]:
scores = cross_val_score(clf, X,Yb,scoring = 'f1',cv=5)
print("My NB\nF1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

My NB
F1: 0.57 (+/- 0.02)
Wall time: 3.47 s


In [11]:
scores = cross_val_score(skclf,X_train_counts,Yb,scoring = 'f1',cv=5)
print("Sklearn NB\nF1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Sklearn NB
F1: 0.90 (+/- 0.01)
Wall time: 34 ms


# Вывод:
#### До Sklearn-а далеко, но рабочий алгоритм написан собственными руками