In [150]:
# Read the corpus stored under the "BBC" directory, decoding each file as latin1.
from sklearn import datasets
train = datasets.load_files(container_path="BBC", encoding="latin1")

In [151]:
from sklearn import model_selection

# Fixed seed so the 80/20 split, and every metric derived from it, can be
# reproduced on a fresh kernel. With random_state=None the documents are
# reshuffled differently on every run.
RANDOM_SEED = 42
trainingData, testingData, trainingTarget, testingTarget = model_selection.train_test_split(
    train.data,
    train.target,
    train_size=0.8,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [171]:
#MultinomialNB default values, try 1
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Raw text -> token counts -> tf-idf weights -> multinomial naive Bayes.
default_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=default_steps)
# fit() returns the fitted pipeline, so training and prediction can be chained.
predicted = text_clf.fit(trainingData, trainingTarget).predict(testingData)

In [172]:
#(b)
from sklearn.metrics import confusion_matrix
# Compare the held-out labels with the pipeline's predictions.
conf_matrix = confusion_matrix(testingTarget, predicted)
print(conf_matrix)

[[ 99   0   2   0   0]
 [  1  57   5   1   3]
 [  2   0  79   1   0]
 [  0   0   0 104   0]
 [  1   0   1   4  85]]


In [155]:
#(c) and (d)
from sklearn.metrics import classification_report
# Display names for the five classes.
# NOTE(review): assumed to follow the integer label order in train.target --
# confirm against train.target_names.
target_names = ['business', 'entertainment', 'politics', 'sports', 'tech']
report = classification_report(testingTarget, predicted, target_names=target_names)
print(report)

               precision    recall  f1-score   support

     business       0.96      0.98      0.97       101
entertainment       1.00      0.85      0.92        67
     politics       0.91      0.96      0.93        82
       sports       0.95      1.00      0.97       104
         tech       0.97      0.93      0.95        91

     accuracy                           0.95       445
    macro avg       0.96      0.95      0.95       445
 weighted avg       0.95      0.95      0.95       445



In [156]:
#(d)
from sklearn.metrics import accuracy_score
# Accuracy of the current predictions on the held-out split.
accuracy = accuracy_score(testingTarget, predicted)
print(accuracy)

0.952808988764045


In [182]:
#(h)
import numpy as np

# Use a fresh vectorizer for corpus statistics. Calling fit_transform on
# text_clf["vect"] would refit the vectorizer inside the already-trained
# pipeline on the full corpus, so its vocabulary would no longer match the
# features the classifier was trained on.
corpus_counts = CountVectorizer().fit_transform(train.data)

# x holds one total per vocabulary word, summed over every document. The sum
# is taken on the sparse matrix directly, so the document-term matrix is never
# expanded to a dense array.
x = np.asarray(corpus_counts.sum(axis=0)).ravel()

# Total number of word tokens in the corpus. Named total_tokens rather than
# `sum` so the builtin sum() is not shadowed for the rest of the session.
total_tokens = int(x.sum())
total_tokens

836357

In [183]:
#(f)
# Number of entries in x, the array of per-word totals computed in the previous cell.
x.shape[0]

29421

In [184]:
#(j)
# Keep only the entries of x that are exactly 1, then count them.
y = list(filter(lambda word_total: word_total == 1, x))
len(y)

10005

In [167]:
#MultinomialNB default values, try 2
# Same default configuration as try 1, rebuilt and refit from scratch.
vectorizer_step = ('vect', CountVectorizer())
tfidf_step = ('tfidf', TfidfTransformer())
classifier_step = ('clf', MultinomialNB())
text_clf = Pipeline([vectorizer_step, tfidf_step, classifier_step])
predicted = text_clf.fit(trainingData, trainingTarget).predict(testingData)

In [168]:
#(b)
# Confusion matrix for the second default-parameter run.
conf_matrix = confusion_matrix(testingTarget, predicted)
print(conf_matrix)

[[ 99   0   2   0   0]
 [  1  57   5   1   3]
 [  2   0  79   1   0]
 [  0   0   0 104   0]
 [  1   0   1   4  85]]


In [169]:
#(c) and (d)
# NOTE(review): label order assumed to match the integer labels in
# testingTarget -- confirm against train.target_names.
target_names = ['business', 'entertainment', 'politics', 'sports', 'tech']
report = classification_report(testingTarget, predicted, target_names=target_names)
print(report)

               precision    recall  f1-score   support

     business       0.96      0.98      0.97       101
entertainment       1.00      0.85      0.92        67
     politics       0.91      0.96      0.93        82
       sports       0.95      1.00      0.97       104
         tech       0.97      0.93      0.95        91

     accuracy                           0.95       445
    macro avg       0.96      0.95      0.95       445
 weighted avg       0.95      0.95      0.95       445



In [204]:
#MultinomialNB smoothing value of 0.0001
# Same pipeline as the default runs; only the classifier's alpha changes.
low_alpha_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.0001)),
]
text_clf = Pipeline(steps=low_alpha_steps)
predicted = text_clf.fit(trainingData, trainingTarget).predict(testingData)

In [187]:
#(b)
# Confusion matrix for the alpha=0.0001 run.
conf_matrix = confusion_matrix(testingTarget, predicted)
print(conf_matrix)

[[ 95   0   2   0   4]
 [  1  61   1   0   4]
 [  0   0  82   0   0]
 [  0   0   0 104   0]
 [  0   1   1   0  89]]


In [188]:
#(c) and (d)
# NOTE(review): label order assumed to match the integer labels in
# testingTarget -- confirm against train.target_names.
target_names = ['business', 'entertainment', 'politics', 'sports', 'tech']
report = classification_report(testingTarget, predicted, target_names=target_names)
print(report)

               precision    recall  f1-score   support

     business       0.99      0.94      0.96       101
entertainment       0.98      0.91      0.95        67
     politics       0.95      1.00      0.98        82
       sports       1.00      1.00      1.00       104
         tech       0.92      0.98      0.95        91

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445



In [189]:
#(d)
# Accuracy of the alpha=0.0001 predictions on the held-out split.
accuracy = accuracy_score(testingTarget, predicted)
print(accuracy)

0.9685393258426966


In [206]:
#MultinomialNB smoothing value of 0.9
# Same pipeline as the default runs; only the classifier's alpha changes.
high_alpha_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.9)),
]
text_clf = Pipeline(steps=high_alpha_steps)
predicted = text_clf.fit(trainingData, trainingTarget).predict(testingData)

In [198]:
#(b)
# Confusion matrix for the alpha=0.9 run.
conf_matrix = confusion_matrix(testingTarget, predicted)
print(conf_matrix)

[[ 99   0   2   0   0]
 [  1  58   4   1   3]
 [  0   0  81   1   0]
 [  0   0   0 104   0]
 [  1   0   1   4  85]]


In [199]:
#(c) and (d)
# NOTE(review): label order assumed to match the integer labels in
# testingTarget -- confirm against train.target_names.
target_names = ['business', 'entertainment', 'politics', 'sports', 'tech']
report = classification_report(testingTarget, predicted, target_names=target_names)
print(report)

               precision    recall  f1-score   support

     business       0.98      0.98      0.98       101
entertainment       1.00      0.87      0.93        67
     politics       0.92      0.99      0.95        82
       sports       0.95      1.00      0.97       104
         tech       0.97      0.93      0.95        91

     accuracy                           0.96       445
    macro avg       0.96      0.95      0.96       445
 weighted avg       0.96      0.96      0.96       445



In [200]:
#(d)
# Accuracy of the alpha=0.9 predictions on the held-out split.
accuracy = accuracy_score(testingTarget, predicted)
print(accuracy)

0.9595505617977528
