# Required Libraries

In [4]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from pprint import pprint
import numpy as np

# Dataset
Load the dataset and print the classes

In [5]:
# Load the train and test splits of the 20 Newsgroups corpus with the same
# options: fixed shuffle seed, and headers, footers and quoted replies removed
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=split,
        shuffle=True,
        random_state=42,
        remove=('headers', 'footers', 'quotes'),
    )
    for split in ('train', 'test')
)

# Class names, in the order of the integer labels used by `target`
classes = list(newsgroups_train.target_names)
pprint(classes)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


Let's look at the first example. It is written in the form of a letter.

In [6]:
# Peek at the corpus: number of training files, then the first document
# framed by separator lines, and finally its numeric label with the class name
separator = "----------------------------------------"

print("SHAPE ", newsgroups_train.filenames.shape)
print(separator)
print(newsgroups_train.data[0])
print(separator)

first_label = newsgroups_train.target[0]
print(first_label, "=>", classes[first_label])

SHAPE  (11314,)
----------------------------------------
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.
----------------------------------------
7 => rec.autos


# Preprocessing

Preprocess every example: remove stopwords and compute the Tf-idf vector representation.

In [7]:
# Targets are the integer class labels shipped with each split
y_train, y_test = newsgroups_train.target, newsgroups_test.target

# TF-IDF features with English stop words dropped. The vectorizer is fitted on
# the training split only; the test split is transformed with that same fit.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

Let's see what's inside after the transformation

In [17]:
# Sparse TF-IDF row of the first training document
first_doc_vector = X_train[0]
print(first_doc_vector)

# Matrix dimensions, followed by the document count and the vocabulary size
n_documents = len(newsgroups_train.data)
vocabulary_size = len(vectorizer.vocabulary_)
print(X_train.shape)
print(n_documents, vocabulary_size)

  (0, 59071)	0.10043853867312116
  (0, 57250)	0.1063473585616558
  (0, 41874)	0.224548896412017
  (0, 49800)	0.11869932893481257
  (0, 46690)	0.12504220873599214
  (0, 73174)	0.16142029533900565
  (0, 99608)	0.09418459052541318
  (0, 84050)	0.16329311028814825
  (0, 37208)	0.1434127293323407
  (0, 62594)	0.13037295035007848
  (0, 87913)	0.25808578247347563
  (0, 54493)	0.06961997844491917
  (0, 23430)	0.12937103288512333
  (0, 77676)	0.12197186951739486
  (0, 81450)	0.1461308934288897
  (0, 24583)	0.19644480500804062
  (0, 16806)	0.1407774554706102
  (0, 83208)	0.11339406589538423
  (0, 76269)	0.08978258481915573
  (0, 34742)	0.17300821242559045
  (0, 24108)	0.24723134514216435
  (0, 25437)	0.10548299054214269
  (0, 11174)	0.20599311323287353
  (0, 35902)	0.1266709604197344
  (0, 9843)	0.20797700857530224
  (0, 55606)	0.13822596989753821
  (0, 57247)	0.1352084247105906
  (0, 84312)	0.16368392505928514
  (0, 34741)	0.14847880131844235
  (0, 31927)	0.10526008886822914
  (0, 80420)	0.1270

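Each dimension (column index) corresponds to a word in the vocabulary learned from the training set; a document has a non-zero weight only for the words it contains. For example: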

In [18]:
# Map two column indices back to their vocabulary words and print each word
# next to the first document's weight in that column
feature_names = vectorizer.get_feature_names_out()
for column in (25717, 80420):
    print(feature_names[column], X_train[0, column])

car 0.46579831435138974
saw 0.127069039671221


# Naive Bayes

## Multinomial NB

Let's define a Multinomial model and train it on the same dataset.

In [23]:
# Multinomial Naive Bayes with smoothing parameter alpha=0.01.
# fit() returns the estimator itself, so construction and training are chained.
MultinomialNB_model = MultinomialNB(alpha=.01).fit(X_train, y_train)

# Evaluate on the first 500 test documents only
y_pred = MultinomialNB_model.predict(X_test[:500])
report = classification_report(y_test[:500], y_pred, target_names=classes)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.44      0.19      0.27        21
           comp.graphics       0.58      0.67      0.62        21
 comp.os.ms-windows.misc       0.65      0.50      0.57        26
comp.sys.ibm.pc.hardware       0.70      0.76      0.73        34
   comp.sys.mac.hardware       0.81      0.74      0.77        34
          comp.windows.x       0.83      0.73      0.78        26
            misc.forsale       0.74      0.77      0.76        22
               rec.autos       0.79      0.79      0.79        28
         rec.motorcycles       0.85      0.70      0.77        33
      rec.sport.baseball       1.00      0.88      0.94        25
        rec.sport.hockey       0.67      0.96      0.79        27
               sci.crypt       0.71      0.85      0.77        20
         sci.electronics       0.74      0.58      0.65        24
                 sci.med       0.75      0.91      0.82        23
         

We can extract for each class the most important features (words in our case) and print them.

In [24]:
# show the top features (10 by default) of every class
def show_top10(classifier, vectorizer, categories, n=10):
  """Print, for each category, the `n` features with the largest counts.

  Features are ranked by the per-class rows of `classifier.feature_count_`
  and printed in ascending order, so the highest-count feature comes last
  on each line.

  Args:
    classifier: fitted estimator exposing `feature_count_`, indexable by
      class position (one row of per-feature counts per class).
    vectorizer: fitted vectorizer exposing `get_feature_names_out()`.
    categories: iterable of category names; position `i` is matched with
      row `i` of `classifier.feature_count_`.
    n: number of features to print per category (default 10). If `n`
      exceeds the number of features, all features are printed.

  Raises:
    ValueError: if `n` is negative.
  """
  if n < 0:
    raise ValueError("n must be non-negative, got %r" % (n,))

  feature_names = np.asarray(vectorizer.get_feature_names_out())
  for i, category in enumerate(categories):
    ascending = np.argsort(classifier.feature_count_[i])
    # Take the n largest from the reversed order, then flip back to ascending.
    # A plain `[-n:]` slice would wrongly return *all* features when n == 0.
    top_n = ascending[::-1][:n][::-1]
    print("%s: \t\t %s" % (category, " ".join(feature_names[top_n])))

# Top features per class for the Multinomial model
show_top10(
    classifier=MultinomialNB_model,
    vectorizer=vectorizer,
    categories=classes,
)

alt.atheism: 		 islam atheists say just religion atheism think don people god
comp.graphics: 		 looking format 3d know program file files thanks image graphics
comp.os.ms-windows.misc: 		 card problem thanks driver drivers use files dos file windows
comp.sys.ibm.pc.hardware: 		 monitor disk thanks pc ide controller bus card scsi drive
comp.sys.mac.hardware: 		 know monitor does quadra simms thanks problem drive apple mac
comp.windows.x: 		 using windows x11r5 use application thanks widget server motif window
misc.forsale: 		 asking email sell price condition new shipping offer 00 sale
rec.autos: 		 don ford new good dealer just engine like cars car
rec.motorcycles: 		 don just helmet riding like motorcycle ride bikes dod bike
rec.sport.baseball: 		 braves players pitching hit runs games game baseball team year
rec.sport.hockey: 		 league year nhl games season players play hockey team game
sci.crypt: 		 people use escrow nsa keys government chip clipper encryption key
sci.electronics: 	

## Multivariate NB

We can also train a Bernoulli model, i.e. the multivariate Bernoulli variant of Naive Bayes, which models only whether each word is present or absent in a document.

In [25]:
# define the Bernoulli (multivariate) Naive Bayes model
BernoulliNB_model = BernoulliNB(alpha=.01)

# train
BernoulliNB_model.fit(X_train, y_train)

# predict on the first 500 test documents
y_pred = BernoulliNB_model.predict(X_test[:500])

# zero_division=0 reports 0.0 for precision/recall/F1 values that are undefined
# (e.g. a class that is never predicted on this slice) without emitting
# UndefinedMetricWarning; the reported numbers are unchanged.
print(classification_report(y_test[:500], y_pred, target_names=classes, zero_division=0))

                          precision    recall  f1-score   support

             alt.atheism       0.36      0.43      0.39        21
           comp.graphics       0.41      0.57      0.48        21
 comp.os.ms-windows.misc       0.00      0.00      0.00        26
comp.sys.ibm.pc.hardware       0.57      0.79      0.67        34
   comp.sys.mac.hardware       0.40      0.79      0.53        34
          comp.windows.x       0.75      0.46      0.57        26
            misc.forsale       0.77      0.77      0.77        22
               rec.autos       0.48      0.79      0.59        28
         rec.motorcycles       0.50      0.73      0.59        33
      rec.sport.baseball       0.79      0.88      0.83        25
        rec.sport.hockey       1.00      0.85      0.92        27
               sci.crypt       0.83      0.50      0.62        20
         sci.electronics       0.52      0.62      0.57        24
                 sci.med       0.84      0.70      0.76        23
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


And then see the most important features for each class.

In [26]:
# Top features per class for the Bernoulli model
show_top10(
    classifier=BernoulliNB_model,
    vectorizer=vectorizer,
    categories=classes,
)

alt.atheism: 		 time know does god like say just think people don
comp.graphics: 		 just don program need does use like know graphics thanks
comp.os.ms-windows.misc: 		 does dos like just file using thanks know use windows
comp.sys.ibm.pc.hardware: 		 pc problem use drive just like card does know thanks
comp.sys.mac.hardware: 		 problem don use like just thanks does apple know mac
comp.windows.x: 		 help server problem know does using like thanks use window
misc.forsale: 		 mail used edu sell condition interested shipping offer new sale
rec.autos: 		 time new think know good cars don just like car
rec.motorcycles: 		 time ride dod good think don know just like bike
rec.sport.baseball: 		 games time team good game don like just think year
rec.sport.hockey: 		 year good don think just like play hockey game team
sci.crypt: 		 chip clipper government don encryption people use like just key
sci.electronics: 		 power want good just does used don know like use
sci.med: 		 does good think time

# EXERCISE:
* (1) Find the best _alpha_ parameter for **MultinomialNB** and **BernoulliNB** models.
* (2) Plot the results taking _F1_ measure as reference.
* (3) Make a comparison between **Rocchio**, **MultinomialNB** and **BernoulliNB** model.
