In [78]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from sklearn.metrics import accuracy_score, classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vital\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [79]:
# https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
data = pd.read_csv('spam.csv', encoding="ISO-8859-1")[['v1', 'v2']]
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [80]:
 # 0 - ham, 1 - spam
data['class_code'] = data['v1'].apply(lambda x: int(x == 'spam'))
bad_message = data.query('class_code == 1')['v2'].to_list()
good_message = data.query('class_code == 0')['v2'].to_list()

In [81]:
def clean_input_text(input_text):
    input_text = input_text.lower()
    bad_words = ['www', 'http', '@']
    for b_w in bad_words:
        input_text = " ".join([word for word in input_text.split() if word.find(b_w) == -1])
    input_text = re.sub(r'[^\w\s]', '', input_text)
    stop_words = stopwords.words('english')
    input_text = " ".join([word for word in input_text.split() if word not in (stop_words)])
    return input_text

In [82]:
# Чистим сообщения от ссылок, знаков препинаний, стоп-слов
data['v2'] = data['v2'].apply(clean_input_text)
data

Unnamed: 0,v1,v2,class_code
0,ham,go jurong point crazy available bugis n great ...,0
1,ham,ok lar joking wif u oni,0
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,1
3,ham,u dun say early hor u c already say,0
4,ham,nah dont think goes usf lives around though,0
...,...,...,...
5567,spam,2nd time tried 2 contact u u å750 pound prize ...,1
5568,ham,ì_ b going esplanade fr home,0
5569,ham,pity mood soany suggestions,0
5570,ham,guy bitching acted like id interested buying s...,0


In [83]:
X_train, X_test, y_train, y_test = train_test_split(data['v2'].values, data['class_code'].values, test_size=0.20, stratify=data['class_code'], random_state=42)

In [84]:
def create_dataset(ham, spam):
    dataset = []
    word_len = 4
    for text in ham:
        text_split = text.split(sep=' ')
        temp = []
        for word in text_split:
            if len(word) >= word_len:
                temp += [word]
        dataset.append([temp, 0])

    for text in spam:
        text_split = text.split(sep=' ')
        temp = []
        for word in text_split:
            if len(word) >= word_len:
                temp += [word]
        dataset.append([temp, 1])
    return dataset

In [85]:
ham_train = X_train[y_train == 0]
spam_train = X_train[y_train == 1]
ham_test = X_test[y_test == 0]
spam_test = X_test[y_test == 1]
train_dataset = create_dataset(ham_train, spam_train)
test_dataset = create_dataset(ham_test, spam_test)

In [86]:
class NaiveBayesClassifier:

    def __init__(self, alpha=1, dataset={}):
        self.dataset = dataset
        self.alpha = alpha
        self.total = set()
        self.classes = {}
        self.total_in_class = {}
        self.frequency = {}

    def fit(self):
        for items in self.dataset:
            if items[1] not in self.classes:
                self.classes[items[1]] = 0
                self.total_in_class[items[1]] = 0
            self.classes[items[1]] += 1
            for item in items[0]:
                if (item, items[1]) not in self.frequency:
                    self.frequency[(item, items[1])] = 0
                self.frequency[(item, items[1])] += 1
                self.total_in_class[items[1]] += 1
                self.total.add(item)

        for items in self.frequency:
            self.frequency[(items[0], items[1])] = (self.alpha + self.frequency[(items[0], items[1])])/(self.alpha*len(self.total) + self.total_in_class[items[1]])
        for cls in self.classes:
            self.classes[cls] /= len(self.dataset)
        return self

    def predict(self, items):
        result = max(self.classes.keys(), key = lambda cls: np.log10(self.classes[cls]) + sum(np.log10(self.frequency.get((feature, cls), self.alpha/(self.alpha*len(self.total)+self.total_in_class[cls]))) for feature in items))
        return result

In [87]:
bayesModel = NaiveBayesClassifier(alpha=1, dataset=train_dataset)
bayesModel.fit()

<__main__.NaiveBayesClassifier at 0x25e311b84d0>

In [88]:
train_predicts = []
for x_train in X_train:
  train_predicts.append(bayesModel.predict(x_train.split()))

In [89]:
test_predicts = []
for x_test in X_test:
  test_predicts.append(bayesModel.predict(x_test.split()))

In [90]:
print(f"Training accuracy: {accuracy_score(y_train, train_predicts)}")
print(f"Test accuracy: {accuracy_score(y_test, test_predicts)}")
print(classification_report(y_train, train_predicts))
print(classification_report(y_test, test_predicts))

Training accuracy: 0.9611846533542742
Test accuracy: 0.915695067264574
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      3859
           1       0.79      0.97      0.87       598

    accuracy                           0.96      4457
   macro avg       0.89      0.97      0.92      4457
weighted avg       0.97      0.96      0.96      4457

              precision    recall  f1-score   support

           0       0.99      0.91      0.95       966
           1       0.62      0.97      0.75       149

    accuracy                           0.92      1115
   macro avg       0.81      0.94      0.85      1115
weighted avg       0.94      0.92      0.92      1115

