In [1]:
import pandas as pd

filename = './data/SMSSpamCollection'

In [2]:
df = pd.read_csv(
    filename,
    sep='\t',
    encoding='utf8',
    header=None,
    names=['class', 'sms_text']
)

df.head(3)

Unnamed: 0,class,sms_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [3]:
from collections import defaultdict
import string
from math import log

class NaiveBayes:
    def __init__(self):
        self.classes = defaultdict(lambda:0) 
        self.freq = defaultdict(lambda:0)

    def fit(self, data: list, target: list):
        """

        :param data: массив документов, каждый документ - объект типа str
        :param target: массив меток объектов
        :return:
        """
        num_objects = len(data)
        string_all_words = df['sms_text'].sum()

        tt = str.maketrans(dict.fromkeys(string.punctuation))
        all_words_without_punctuation = string_all_words.translate(tt)
        
        words = ' '.join([
            word.lower()
            for word in all_words_without_punctuation.split(sep=' ')
        ])

        all_words = ' '.join(set(words.split(sep=' ')))
        list_words = all_words.split(sep = ' ')
        
        words_all = ' '.join(list_words)
        
        #classes, freq = defaultdict(lambda:0), defaultdict(lambda:0)

        for label in target:
            self.classes[label] += 1
        
        for word in words_all.split(sep = ' '):
            j = 0
            iter_item = iter(data)
            for label in target:
                #sms = data['sms_text'][j]
                sms = next(iter_item)
                if sms.find(word) != -1:
                    self.freq[label, word] += 1
                j += 1

        for label in self.classes:  # normalize features frequencies
            for word in words_all.split(sep = ' '):
                self.freq[label, word] /= self.classes[label]
        for c in self.classes:                       # normalize classes frequencies
            self.classes[c] /= num_objects

        return self.classes, self.freq

    def predict(self, data: list):
        """

        :param data: массив документов, для каждого из которых нужно предсказать метку
        :return:
        """
        class_data = [0 for x in range(len(data))]
        i = 0
        
        for string in data:
            max_prob = 0
            max_cl = 0
            
            for cl in self.classes:
                p_current_class = log(self.classes[cl])
                iter_item = iter(self.classes)
                spam = next(iter_item)
                not_spam = next(iter_item)
                for word in string.split(sep = ' '):
                    #word = w.lower()
                    p_current_class = p_current_class+log(max(self.freq[cl, word],10**(-7)))
                if ((max_prob == 0) or (p_current_class > max_prob)):
                    max_prob = p_current_class
                    max_cl = cl
            class_data[i] = max_cl
            i += 1
        return class_data

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
class_trn, class_tst, sms_trn, sms_tst = train_test_split(df['class'], df['sms_text'], test_size=0.2)

In [5]:
sms_classifier = NaiveBayes()

In [6]:
prob_class,prob_word = sms_classifier.fit(sms_trn, class_trn)

In [7]:
class_test = sms_classifier.predict(sms_tst)

In [8]:
# Проверка: процент сообщений, для которых было правильно определено, являются ли они спамом 
s = 0
iter_item = iter(class_tst)
for cl in class_test:
    cl_tst = next(iter_item)
    if (cl == cl_tst):
        s += 1
        
print(f'Точность предсказания: {s/len(class_test):.5f}')

Точность предсказания: 0.93812
