##

# Окутин Денис ИУ9-11М Задание 8

In [1]:
from sklearn.datasets import fetch_20newsgroups
import re
import numpy as np

## Скачивание датасета

In [2]:
# Restrict the corpus to three newsgroups.
categories = ['comp.graphics', 'sci.med', 'alt.atheism']

# Both splits are fetched with identical settings; only the subset name differs.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset, categories=categories, shuffle=True, random_state=42)
    for subset in ('train', 'test')
)

# Raw document strings and their integer class labels for each split.
X_train_raw = newsgroups_train.data
y_train = newsgroups_train.target
X_test_raw = newsgroups_test.data
y_test = newsgroups_test.target

## Обработка

In [3]:
def preprocessing(text):
    """Turn a raw document into a list of lowercase, letters-only tokens.

    The text is lowercased, every character that is neither ``a``-``z`` nor
    whitespace is deleted (so ``"don't"`` becomes ``"dont"``), and the result
    is split on whitespace.

    Args:
        text: The document as a string.

    Returns:
        A list of token strings; empty if nothing survives the cleanup.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return letters_only.split()

# Collect every distinct token that occurs in the training documents.
word_set = set()
for text in X_train_raw:
    tokens = preprocessing(text)
    word_set.update(tokens)

# Sort the words so that the word -> column assignment is the same on every run.
# Iterating a set of strings directly depends on per-process hash randomization,
# which would make the feature matrix columns change between kernel restarts.
vocab = sorted(word_set)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 29401


## Кодирование

In [4]:
import numpy as np

def one_hot(text, vocab):
    """Encode a tokenized document as a binary bag-of-words vector.

    Position ``i`` of the result is 1 if the ``i``-th vocabulary word occurs
    at least once in ``text`` and 0 otherwise; tokens that are not in the
    vocabulary are ignored.

    Args:
        text: Iterable of tokens.
        vocab: Either a sequence of words (a word's column is its first
            position in the sequence) or a prebuilt ``{word: column}`` dict
            whose columns lie in ``range(len(vocab))``. Passing the dict
            avoids rebuilding the lookup table on every call.

    Returns:
        A float ``numpy`` array of length ``len(vocab)``.
    """
    if isinstance(vocab, dict):
        index_of = vocab
    else:
        # Hash lookup instead of a linear `in` + `.index()` scan per token.
        # setdefault keeps the first position of a repeated word, which is
        # what list.index() would return.
        index_of = {}
        for position, word in enumerate(vocab):
            index_of.setdefault(word, position)

    res = np.zeros(len(vocab))
    for word in text:
        position = index_of.get(word)
        if position is not None:
            res[position] = 1
    return res

# Encode each split against the training vocabulary: one binary row per document.
X_train = np.array([one_hot(preprocessing(doc), vocab) for doc in X_train_raw])
X_test = np.array([one_hot(preprocessing(doc), vocab) for doc in X_test_raw])

## Метрики

In [5]:
def my_accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both arguments may be NumPy arrays or plain sequences; they are converted
    to arrays first so that the comparison is always element-wise (comparing
    two Python lists with ``==`` would yield a single bool).

    Args:
        y_true: True labels.
        y_pred: Predicted labels, same shape as ``y_true``.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when there are no samples.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        # Nothing to score; avoid a division by zero.
        return 0.0
    return np.count_nonzero(y_true == y_pred) / y_true.size

def my_precision_score(y_true, y_pred, average='macro'):
    """Compute per-class precision, optionally macro-averaged.

    Precision of class ``c`` is ``tp / (tp + fp)``: the share of samples
    predicted as ``c`` that really are ``c``. A class that is never predicted
    gets precision 0. Only classes that occur in ``y_true`` are scored.

    Args:
        y_true: True labels (array or plain sequence).
        y_pred: Predicted labels, same shape as ``y_true``.
        average: ``'macro'`` returns the unweighted mean over classes; any
            other value returns the list of per-class precisions, ordered
            like ``np.unique(y_true)``.

    Returns:
        A float for ``average='macro'`` (``0.0`` if there are no samples),
        otherwise a list of floats.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Normalize to arrays so `== c` is element-wise for plain lists as well.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    precisions = []
    for c in np.unique(y_true):
        predicted_c = y_pred == c
        tp = np.count_nonzero(predicted_c & (y_true == c))
        predicted_total = np.count_nonzero(predicted_c)  # tp + fp
        precisions.append(tp / predicted_total if predicted_total > 0 else 0.0)

    if average == 'macro':
        return float(np.mean(precisions)) if precisions else 0.0
    return precisions

def my_recall_score(y_true, y_pred, average='macro'):
    """Compute per-class recall, optionally macro-averaged.

    Recall of class ``c`` is ``tp / (tp + fn)``: the share of samples that
    really are ``c`` and were predicted as ``c``. Only classes that occur in
    ``y_true`` are scored.

    Args:
        y_true: True labels (array or plain sequence).
        y_pred: Predicted labels, same shape as ``y_true``.
        average: ``'macro'`` returns the unweighted mean over classes; any
            other value returns the list of per-class recalls, ordered like
            ``np.unique(y_true)``.

    Returns:
        A float for ``average='macro'`` (``0.0`` if there are no samples),
        otherwise a list of floats.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Normalize to arrays so `== c` is element-wise for plain lists as well.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    recalls = []
    for c in np.unique(y_true):
        actual_c = y_true == c
        tp = np.count_nonzero(actual_c & (y_pred == c))
        actual_total = np.count_nonzero(actual_c)  # tp + fn
        recalls.append(tp / actual_total if actual_total > 0 else 0.0)

    if average == 'macro':
        return float(np.mean(recalls)) if recalls else 0.0
    return recalls

def my_f1_score(y_true, y_pred, average='macro'):
    """Compute per-class F1, optionally macro-averaged.

    F1 of a class is the harmonic mean of its precision and recall,
    ``2 * p * r / (p + r)``, taken from ``my_precision_score`` and
    ``my_recall_score`` with ``average=None``.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        average: ``'macro'`` returns the unweighted mean over classes; any
            other value returns the list of per-class F1 scores.

    Returns:
        A float for ``average='macro'``, otherwise a list of floats.
    """
    precisions = my_precision_score(y_true, y_pred, average=None)
    recalls = my_recall_score(y_true, y_pred, average=None)

    f1_scores = []
    for p, r in zip(precisions, recalls):
        if (p + r) > 0:
            f1_scores.append(2 * p * r / (p + r))
        else:
            # A class with no true positives has p == r == 0. It scores 0 and
            # still takes part in the average; it must not zero out the
            # result for every other class.
            f1_scores.append(0.0)

    if average == 'macro':
        return np.mean(f1_scores)

    return f1_scores

In [6]:
# Cast the 0/1 float indicator matrices to integers for the scikit-learn model.
X_train_counts = X_train.astype(int)
X_test_counts = X_test.astype(int)

# Imported under an alias: a later cell defines its own `MultinomialNB` class,
# and re-running this cell afterwards would otherwise grid-search that class
# instead of the scikit-learn estimator.
from sklearn.naive_bayes import MultinomialNB as SklearnMultinomialNB
from sklearn.model_selection import GridSearchCV

# Pick the smoothing strength by 5-fold cross-validation on macro F1.
parameters = {'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]}
nb = SklearnMultinomialNB()
grid_search = GridSearchCV(nb, parameters, cv=5, scoring='f1_macro')
grid_search.fit(X_train_counts, y_train)

best_alpha = grid_search.best_params_['alpha']
best_nb = grid_search.best_estimator_

# Evaluate the selected model on the held-out test split with the hand-written metrics.
y_pred_sklearn = best_nb.predict(X_test_counts)

print("Sklearn MultinomialNB Results:")
print(f"Best Alpha: {best_alpha}")
print(f"Accuracy: {my_accuracy_score(y_test, y_pred_sklearn)}")
print(f"Precision: {my_precision_score(y_test, y_pred_sklearn)}")
print(f"Recall: {my_recall_score(y_test, y_pred_sklearn)}")
print(f"F1-Score: {my_f1_score(y_test, y_pred_sklearn)}")

Sklearn MultinomialNB Results:
Best Alpha: 0.1
Accuracy: 0.9538043478260869
Precision: 0.95307794162756
Recall: 0.9552029021712559
F1-Score: 0.9538134321759247


In [8]:
from collections import defaultdict
import math

class MultinomialNB:
    """Multinomial Naive Bayes text classifier with additive smoothing.

    Attribute names (``V``, ``prior``, ``condprob``, ``C``) follow the usual
    textbook train/apply pseudocode for this model.

    Attributes:
        alpha: Additive smoothing constant; 1.0 is Laplace (add-one) smoothing.
        tokenizer: Callable mapping a document to a list of tokens, or ``None``
            to use the notebook-level ``preprocessing`` function.
        V: Vocabulary (list of distinct training tokens, in first-seen order).
        prior: ``{class: P(class)}``.
        condprob: ``{term: {class: P(term | class)}}``.
        C: Array of distinct class labels seen during training.
    """

    def __init__(self, alpha=1.0, tokenizer=None):
        """Create an untrained classifier.

        Args:
            alpha: Smoothing constant, must be positive.
            tokenizer: Optional tokenization callable; defaults to the
                notebook's ``preprocessing``.

        Raises:
            ValueError: If ``alpha`` is not positive.
        """
        if alpha <= 0:
            raise ValueError("alpha must be positive")
        self.alpha = alpha
        self.tokenizer = tokenizer
        self.V = []
        self.prior = {}
        self.condprob = defaultdict(dict)
        self.C = []

    def _tokenize(self, doc):
        """Tokenize one document with the configured tokenizer."""
        # The fallback is resolved at call time, so the notebook-level
        # `preprocessing` is only needed when no tokenizer was supplied.
        tokenize = self.tokenizer if self.tokenizer is not None else preprocessing
        return tokenize(doc)

    def train(self, D, C):
        """Estimate class priors and smoothed term likelihoods.

        The vocabulary is extracted from ``D`` itself, and any state from a
        previous call is discarded.

        Args:
            D: Sequence of raw documents.
            C: Sequence of class labels, one per document.

        Returns:
            The tuple ``(V, prior, condprob)``.

        Raises:
            ValueError: If ``D`` is empty or ``D`` and ``C`` differ in length.
        """
        labels = np.asarray(C)
        N = len(D)
        if N == 0:
            raise ValueError("cannot train on an empty document collection")
        if len(labels) != N:
            raise ValueError("D and C must have the same length")

        self.C = np.unique(labels)
        self.prior = {}
        self.condprob = defaultdict(dict)

        # One pass over the corpus: count term occurrences per class and record
        # the vocabulary, instead of rescanning the class text for every term.
        term_counts = {c: defaultdict(int) for c in self.C}
        vocabulary = {}  # dict used as an insertion-ordered set
        for doc, cls in zip(D, labels):
            counts = term_counts[cls]
            for t in self._tokenize(doc):
                counts[t] += 1
                vocabulary.setdefault(t, None)
        self.V = list(vocabulary)

        for c in self.C:
            self.prior[c] = np.sum(labels == c) / N

            counts = term_counts[c]
            denominator = sum(counts.values()) + self.alpha * len(self.V)
            for t in self.V:
                self.condprob[t][c] = (counts.get(t, 0) + self.alpha) / denominator

        return self.V, self.prior, self.condprob

    def apply(self, d):
        """Return the most probable class for one raw document.

        Tokens that were not seen during training are ignored. Scores are sums
        of log-probabilities; on a tie the class that comes first in
        ``self.C`` wins.

        Raises:
            ValueError: If the classifier has not been trained.
        """
        if len(self.C) == 0:
            raise ValueError("classifier has not been trained")

        # Membership test on the dict keys; does not create defaultdict entries.
        W = [t for t in self._tokenize(d) if t in self.condprob]
        score = {}

        for c in self.C:
            score[c] = math.log(self.prior[c])

            for t in W:
                score[c] += math.log(self.condprob[t][c])

        return max(score.items(), key=lambda x: x[1])[0]

    def predict(self, D):
        """Return the list of predicted classes for an iterable of raw documents."""
        return [self.apply(d) for d in D]

In [10]:
# Train the hand-written classifier on the raw training texts (it tokenizes them itself).
nb = MultinomialNB()
V, prior, condprob = nb.train(X_train_raw, y_train)

# Predict a label for every raw test document; the result is a Python list.
y_pred = nb.predict(X_test_raw)

# Report the hand-written metrics on the test split.
print("My MultinomialNB Results:")
print(f"Accuracy: {my_accuracy_score(y_test, y_pred)}")
print(f"Precision: {my_precision_score(y_test, y_pred)}")
print(f"Recall: {my_recall_score(y_test, y_pred)}")
print(f"F1-Score: {my_f1_score(y_test, y_pred)}")

My MultinomialNB Results:
Best Alpha: 0.1
Accuracy: 0.9483695652173914
Precision: 0.947269415135727
Recall: 0.9503252839836476
F1-Score: 0.9482125917923584
