### === Task ===

1) Learn about TfidfVectorizer and replace CountVectorizer with TfidfVectorizer (explanation provided in the lecture).
2) Put Multinomial Naive Bayes classification into a class that can transform the data, fit the model, and make predictions.
    - In the class, allow users to choose whether to use CountVectorizer or TfidfVectorizer to transform the data.

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
class MultinomialNB():
    """Multinomial Naive Bayes classifier with a built-in text vectorization step.

    Typical use::

        model = MultinomialNB()
        X_train, y_train, X_test, y_test = model.transform_data(
            train_data, test_data, method='TfidfVectorizer')
        model.fit(X_train, y_train)
        yhat = model.predict(X_test)

    Attributes set by ``fit``:
        m, n        -- number of training samples / number of features
        classes     -- sorted unique labels seen in ``y_train``
        k           -- number of classes
        priors      -- array of shape (k,), P(class)
        likelihoods -- array of shape (k, n), P(feature | class) with Laplace smoothing

    Attribute set by ``transform_data``:
        vectorizer  -- the fitted CountVectorizer / TfidfVectorizer instance
    """

    def transform_data(self, train_data, test_data, method, y_train=None, y_test=None):
        """Vectorize raw text documents with the chosen vectorizer.

        Parameters
        ----------
        train_data, test_data : iterables of raw text documents.
        method : str
            Either ``'CountVectorizer'`` or ``'TfidfVectorizer'``.
        y_train, y_test : array-like, optional
            Labels to return alongside the features. When omitted, the labels
            are read from the notebook-level ``train.target`` / ``test.target``
            objects (the original behaviour), so those must exist in that case.

        Returns
        -------
        X_train : sparse matrix, fitted on ``train_data``.
        y_train : training labels.
        X_test : dense ndarray, transformed with the vocabulary learned on train.
        y_test : test labels.

        Raises
        ------
        ValueError
            If ``method`` is not one of the two supported names.
        """
        if method == 'CountVectorizer':
            vectorizer = CountVectorizer()
        elif method == 'TfidfVectorizer':
            vectorizer = TfidfVectorizer()
        else:
            raise ValueError('Method must be: "CountVectorizer" or "TfidfVectorizer"')

        # Learn the vocabulary on the training documents only, then reuse it for test.
        X_train = vectorizer.fit_transform(train_data)
        X_test = vectorizer.transform(test_data)
        # Keep the fitted vectorizer so new documents can be transformed later.
        self.vectorizer = vectorizer

        # The vectorizer returns sparse matrices; the test matrix is densified
        # (as before) while the training matrix is left sparse to save memory.
        X_test = X_test.toarray()

        # Backward-compatible fallback to the notebook globals when labels are not passed.
        if y_train is None:
            y_train = train.target
        if y_test is None:
            y_test = test.target
        return X_train, y_train, X_test, y_test

    def likelihood(self, X_class, laplace=1):
        """Return smoothed P(feature | class) for the samples of one class.

        ``X_class`` may be a dense array or a sparse matrix of shape
        (n_samples_in_class, n_features). The result is always a 1-D ndarray of
        length n_features that sums to 1.
        """
        # Sparse matrices return a (1, n) np.matrix from .sum(axis=0); flatten
        # so dense and sparse inputs give the same 1-D result.
        smoothed_counts = np.asarray(X_class.sum(axis=0)).ravel() + laplace
        return smoothed_counts / smoothed_counts.sum()

    def prior(self, X_class):
        """Return P(class): the share of training samples belonging to the class.

        Relies on ``self.m`` (total number of training samples) set by ``fit``.
        """
        return X_class.shape[0] / self.m

    def fit(self, X_train, y_train):
        """Estimate class priors and per-class feature likelihoods.

        Parameters
        ----------
        X_train : dense array or sparse matrix of shape (m, n).
        y_train : array-like of length m with the class label of each row.
        """
        y_train = np.asarray(y_train)
        self.m, self.n = X_train.shape
        self.classes = np.unique(y_train)
        self.k = len(self.classes)

        self.priors = np.zeros(self.k)  # prior for each class
        self.likelihoods = np.zeros((self.k, self.n))  # likelihood of each feature per class

        for idx, label in enumerate(self.classes):
            # Integer row indices work for both dense arrays and sparse matrices.
            rows = np.flatnonzero(y_train == label)
            X_train_c = X_train[rows]
            self.priors[idx] = self.prior(X_train_c)
            self.likelihoods[idx, :] = self.likelihood(X_train_c)

    def predict(self, X_test):
        """Return the most probable class label for every row of ``X_test``.

        The score of class c for a document x is
        ``log P(c) + sum_j x_j * log P(feature_j | c)``; the label with the
        highest score is returned (taken from ``self.classes``).
        """
        log_joint = X_test @ np.log(self.likelihoods.T) + np.log(self.priors)
        return self.classes[np.argmax(log_joint, axis=1)]

In [5]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups dataset with default arguments and display the
# names of its categories (the cell's last expression is rendered as output).
data = fetch_20newsgroups()
data.target_names

In [6]:
# Restrict the experiment to four newsgroups.
# Requires `fetch_20newsgroups` to be imported first (see the preceding cell).
categories = ['talk.religion.misc', 'soc.religion.christian',
              'sci.space', 'comp.graphics']
# Fetch the train and test subsets separately, limited to the chosen categories.
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)

# Documents fed to the vectorizer; the matching labels live in
# `train.target` / `test.target`, which MultinomialNB.transform_data reads.
train_data = train.data
test_data = test.data

NameError: name 'fetch_20newsgroups' is not defined

### TfidfVectorizer

In [7]:
# Train and evaluate the hand-written MultinomialNB on TF-IDF features.
# Requires `train_data` / `test_data` (and the notebook-level `train` / `test`
# objects that transform_data reads the labels from) to be defined first.
model = MultinomialNB()
# Vectorize the documents; returns features and labels for both splits.
X_train, y_train, X_test, y_test = model.transform_data(train_data, test_data, method='TfidfVectorizer')                                        
# Estimate priors and likelihoods, then predict a label for every test document.
model.fit(X_train, y_train)
yhat = model.predict(X_test)

NameError: name 'train_data' is not defined

In [3]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import average_precision_score, classification_report

# Evaluate the predictions in `yhat` against the true labels in `y_test`.
n_classes = np.unique(y_test).size

# Accuracy: share of test documents whose predicted label matches the true one.
print("Accuracy: ", np.mean(yhat == y_test))

print("=========Average precision score=======")
# One-vs-rest indicator matrices: column i is 1 where the label equals class i.
# NOTE(review): the class list is hard-coded to four labels while `n_classes`
# is derived from the data -- keep the two in sync if the categories change.
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2, 3])
yhat_binarized = label_binarize(yhat, classes=[0, 1, 2, 3])

for i in range(n_classes):
    true_column = y_test_binarized[:, i]
    predicted_column = yhat_binarized[:, i]
    class_score = average_precision_score(true_column, predicted_column)
    print(f"Class {i} score: ", class_score)

print("=========Classification report=======")
print("Report: ", classification_report(y_test, yhat))

NameError: name 'np' is not defined

In [8]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels (`y_test`) versus predicted labels (`yhat`).
mat = confusion_matrix(y_test, yhat)

import seaborn as sns
# Plot the transposed matrix as an annotated heatmap with integer cell labels,
# using the newsgroup names as tick labels. Because of the transpose, the
# x axis is labelled 'true' and the y axis 'predicted' below.
sns.heatmap(mat.T, annot=True, fmt="d",
           xticklabels=train.target_names, yticklabels=train.target_names)
plt.xlabel('true')
plt.ylabel('predicted')

NameError: name 'y_test' is not defined

###  CountVectorizer

In [8]:
# Repeat the experiment with raw term counts instead of TF-IDF weights.
# Requires `train_data` / `test_data` (and the notebook-level `train` / `test`
# objects that transform_data reads the labels from) to be defined first.
model = MultinomialNB()
# Vectorize the documents; returns features and labels for both splits.
X_train, y_train, X_test, y_test = model.transform_data(train_data, test_data, method='CountVectorizer')                                        
# Estimate priors and likelihoods, then predict a label for every test document.
model.fit(X_train, y_train)
yhat = model.predict(X_test)

In [9]:
# Evaluate the CountVectorizer-based predictions in `yhat` against `y_test`.
# Relies on label_binarize, average_precision_score and classification_report
# having been imported by an earlier cell.
n_classes = np.unique(y_test).size

# Accuracy: share of test documents whose predicted label matches the true one.
print("Accuracy: ", np.mean(yhat == y_test))

print("=========Average precision score=======")
# One-vs-rest indicator matrices: column i is 1 where the label equals class i.
# NOTE(review): the class list is hard-coded to four labels while `n_classes`
# is derived from the data -- keep the two in sync if the categories change.
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2, 3])
yhat_binarized = label_binarize(yhat, classes=[0, 1, 2, 3])

for i in range(n_classes):
    true_column = y_test_binarized[:, i]
    predicted_column = yhat_binarized[:, i]
    class_score = average_precision_score(true_column, predicted_column)
    print(f"Class {i} score: ", class_score)

print("=========Classification report=======")
print("Report: ", classification_report(y_test, yhat))

NameError: name 'y_test' is not defined

In [10]:
# Confusion matrix of true labels (`y_test`) versus predicted labels (`yhat`).
# Relies on `confusion_matrix` and `sns` having been imported by an earlier cell.
mat = confusion_matrix(y_test, yhat)

# Plot the transposed matrix as an annotated heatmap with integer cell labels,
# using the newsgroup names as tick labels. Because of the transpose, the
# x axis is labelled 'true' and the y axis 'predicted' below.
sns.heatmap(mat.T, annot=True, fmt="d",
           xticklabels=train.target_names, yticklabels=train.target_names)
plt.xlabel('true')
plt.ylabel('predicted')

NameError: name 'y_test' is not defined