In [1]:
from naive_bayes import NaiveBayes
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB

categories = ('alt.atheism', 'sci.space', 'talk.religion.misc', 'comp.graphics')

newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories)

newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories)

x_train = newsgroups_train.data
x_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

pattern = re.compile("^\s*$")

empty_ind = np.array([i for i, s in enumerate(x_test) if re.match(pattern, s) is not None])
x_test = np.delete(x_test, empty_ind)
y_test = np.delete(y_test, empty_ind)
empty_ind = np.array([i for i, s in enumerate(x_train) if re.match(pattern, s) is not None])
x_train = np.delete(x_train, empty_ind)
y_train = np.delete(y_train, empty_ind)

y_test = [newsgroups_test.target_names[ind] for ind in y_test]
y_train = [newsgroups_train.target_names[ind] for ind in y_train]

count_vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
texts = np.concatenate((x_train, x_test))

count_vectorizer.fit(texts)
tfidf_vectorizer.fit(texts)

xcv_train = count_vectorizer.transform(x_train)
xcv_test = count_vectorizer.transform(x_test)

xTfidf_train = tfidf_vectorizer.transform(x_train)
xTfidf_test = tfidf_vectorizer.transform(x_test)

nb = NaiveBayes()

nb.fit(xcv_train, y_train)
pred = nb.predict(xcv_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

nb = NaiveBayes()

nb.fit(xTfidf_train, y_train)
pred = nb.predict(xTfidf_test)

print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

clf = MultinomialNB(alpha=4)
clf.fit(xcv_train, y_train)

y_pred = clf.predict(xcv_test)
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

clf = MultinomialNB(alpha=4)
clf.fit(xTfidf_train, y_train)  

y_pred = clf.predict(xTfidf_test)
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

                    precision    recall  f1-score   support

       alt.atheism       0.62      0.74      0.67       311
     comp.graphics       0.91      0.90      0.90       384
         sci.space       0.77      0.90      0.83       378
talk.religion.misc       0.72      0.37      0.49       245

          accuracy                           0.76      1318
         macro avg       0.76      0.73      0.72      1318
      weighted avg       0.77      0.76      0.75      1318

[[229   9  41  32]
 [  9 344  31   0]
 [ 16  17 342   3]
 [117   8  29  91]]
                    precision    recall  f1-score   support

       alt.atheism       0.53      0.85      0.65       311
     comp.graphics       0.88      0.92      0.90       384
         sci.space       0.88      0.81      0.84       378
talk.religion.misc       0.74      0.18      0.29       245

          accuracy                           0.74      1318
         macro avg       0.76      0.69      0.67      1318
      weighted avg