In [1]:
import os
import string
import numpy as np
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB

In [2]:
X = []
Y = [] 
for category in os.listdir('20_newsgroups'):
#https://realpython.com/working-with-files-in-python/
    for document in os.listdir('20_newsgroups/'+category):
        with open('20_newsgroups/'+category+'/'+document, "r") as f:
            X.append((document,f.read()))
            Y.append(category)

In [3]:
for category in os.listdir('20_newsgroups'):
    for document in os.listdir('20_newsgroups/'+category):
        with open('20_newsgroups/'+category+'/'+document, "r") as f:
            X.append((document,f.read()))
            Y.append(category)

In [4]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.50, random_state=0)

In [5]:
# A list of common english words which should not affect predictions
stopwords = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 
 'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
 'each', 'few', 'for', 'from', 'further', 
 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
 "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours' 'ourselves', 'out', 'over', 'own',
 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 
 'than', 'that',"that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", 
 "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 
 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
 "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",'will', 'with', "won't", 'would', "wouldn't", 
 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 
 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd',
 '4th', '5th', '6th', '7th', '8th', '9th', '10th']

In [6]:
vocab = {}
for i in range(len(X_train)):
    word_list = []
    for word in X_train[i][1].split():
        word_new  = word.strip(string.punctuation).lower()
        if (len(word_new)>2)  and (word_new not in stopwords):  
            if word_new in vocab:
                vocab[word_new]+=1
            else:
                vocab[word_new]=1        

In [7]:
features = []
for key in vocab:
    features.append(key)
features=features[0:2000]

In [8]:
X_train_data = np.zeros((len(X_train),len(features)))
for i in range(len(X_train)):
    word_list = [ word.strip(string.punctuation).lower() for word in X_train[i][1].split()]
    for word in word_list:
        if word in features:
            X_train_data[i][features.index(word)] += 1

In [9]:
X_test_data = np.zeros((len(X_test),len(features)))
for i in range(len(X_test)):
    word_list = [ word.strip(string.punctuation).lower() for word in X_test[i][1].split()]
    for word in word_list:
        if word in features:
            X_test_data[i][features.index(word)] += 1

In [11]:
# Using Multinomial Naive Bayes
clf = MultinomialNB()
clf.fit(X_train_data,Y_train)
Y_test_pred = clf.predict(X_test_data)
sklearn_score_train = clf.score(X_train_data,Y_train)
print("Sklearn's score on training data :",sklearn_score_train)
sklearn_score_test = clf.score(X_test_data,Y_test)
print("Sklearn's score on testing data :",sklearn_score_test)

Sklearn's score on training data : 0.8200230034505176
Sklearn's score on testing data : 0.7872180827124069


In [18]:
# Implementing Multinomial Naive Bayes from scratch
class MultinomialNaiveBayes:
    
    def __init__(self):
         
        self.count = {}
        self.classes = None
    
    def fit(self,X_train,Y_train):     
        self.classes = set(Y_train)
        for class_l in self.classes:
            self.count[class_l] = {}
            for i in range(len(X_train[0])):
                self.count[class_l][i] = 0
            self.count[class_l]['total'] = 0
            self.count[class_l]['total_score'] = 0
        self.count['total_score'] = len(X_train)
        
        for i in range(len(X_train)):
            for j in range(len(X_train[0])):
                self.count[Y_train[i]][j]+=X_train[i][j]
                self.count[Y_train[i]]['total']+=X_train[i][j]
            self.count[Y_train[i]]['total_score']+=1
    
    def probability(self,test_score,class_l):
        
        log_prob = np.log(self.count[class_l]['total_score']) - np.log(self.count['total_score'])
        total_words = len(test_score)
        for i in range(len(test_score)):
            current_word_prob = test_score[i]*(np.log(self.count[class_l][i]+1)-np.log(self.count[class_l]['total']+total_words))
            log_prob += current_word_prob
        
        return log_prob
    
    
    def predictSingle_Point(self,test_score):
        
        best_class = None
        best_prob = None
        first_run = True
        
        for class_l in self.classes:
            log_probability_current_class = self.probability(test_score,class_l)
            if (first_run) or (log_probability_current_class > best_prob) :
                best_class = class_l
                best_prob = log_probability_current_class
                first_run = False
                
        return best_class
        
  
    def predict(self,X_test):
        Y_pred = [] 
        for i in range(len(X_test)):
            Y_pred.append( self.predictSingle_Point(X_test[i]) )
        
        return Y_pred
    
    def score(self,Y_pred,Y_true):
        count = 0
        for i in range(len(Y_pred)):
            if Y_pred[i] == Y_true[i]:
                count+=1
        return count/len(Y_pred)

In [None]:
our_clf = MultinomialNaiveBayes()
our_clf.fit(X_train_data,Y_train)
Y_test_pred = our_clf.predict(X_test_data)
our_score_test = our_clf.score(Y_test_pred,Y_test)  
print("Our score on testing data :",our_score_test)