<a href="https://colab.research.google.com/github/Nikhileswar-Komati/100D_ML/blob/master/Algorithms/NaiveBayesClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
from collections import defaultdict

In [2]:
text = "Hello, Stupid:: idiot!"
def preprocess(string):
  processed_string = re.sub('[^a-z]+', ' ', string, flags=re.IGNORECASE)
  processed_string = re.sub('(\s+)', ' ', processed_string)
  processed_string = processed_string.lower()
  return processed_string

In [3]:
print(preprocess(text))

hello stupid idiot 


In [4]:
class NaiveBayesFromScratch:

  def __init__(self, unq_classes_count):
    self.unq_classes_count = unq_classes_count

  #example refers to one sample or one observation .. the class_label is used to create bag of words for particular class
  def bag_of_words(self, example, class_label_indx):
    #if example is a numpy array retreve example[0]
    if isinstance(example, np.ndarray):
      example = example[0]

    for word in example.split():
      self.bow_dict[class_label_indx][word] += 1

  def fit(self, data, labels):

    self.data = data
    self.labels = labels
    self.bow_dict = np.array([defaultdict(lambda: 0) for _ in range(self.unq_classes_count)])
    self.unq_classes = np.unique(self.labels)

    if not isinstance(self.data, np.ndarray):
      self.data = np.array(self.data)
    if not isinstance(self.labels, np.ndarray):
      self.labels = np.array(self.labels)

    for cat_indx, cat in enumerate(self.unq_classes):
      particular_cat_data = self.data[self.labels == cat]
      cleaned_particular_cat_data = [preprocess(ele) for ele in particular_cat_data]
      cleaned_particular_cat_data = pd.DataFrame(data=cleaned_particular_cat_data)
      np.apply_along_axis(self.bag_of_words, 1, cleaned_particular_cat_data, cat_indx)

    prob_classes = np.zeros(self.unq_classes_count)
    words_in_each_class = np.zeros(self.unq_classes_count)
    all_words = []

    for cat_indx, cat in enumerate(self.unq_classes):
      prob_classes[cat_indx] = np.sum(self.labels == cat) / (self.unq_classes_count)
      words_in_each_class[cat_indx] = np.sum(np.array(list(self.bow_dict[cat_indx].values()))) + 1
      all_words += self.bow_dict[cat_indx].keys()

    self.vocab = np.unique(np.array(all_words))
    self.vocab_size = self.vocab.shape[0]
    denom_for_each_class = np.array([words_in_each_class[cat_indx] + self.vocab_size + 1 for cat_indx in range(self.unq_classes_count)])

    self.cats_info = np.array([(self.bow_dict[cat_indx], prob_classes[cat_indx], denom_for_each_class[cat_indx]) for cat_indx in range(self.unq_classes_count)])
  
  def get_prob_for_example(self, example):
    likelihood_prob = np.zeros(self.unq_classes_count)
    for cat_indx in range(self.unq_classes_count):
      for test_token in example.split():
        test_token_counts = self.cats_info[cat_indx][0].get(test_token, 0) + 1
        #now get likelihood of this test_token word                              
        test_token_prob = test_token_counts/float(self.cats_info[cat_indx][2])                              
                
        #remember why taking log? To prevent underflow!
        likelihood_prob[cat_indx] += np.log(test_token_prob)

    post_prob = np.zeros(self.unq_classes_count)
    for cat_indx in range(self.unq_classes_count):
      post_prob[cat_indx] = likelihood_prob[cat_indx] + np.log(self.cats_info[cat_indx][1])                                  
      
    return post_prob

  def predict(self, test_data):
    predictions=[] #to store prediction of each test example
    for example in test_data: 
      #preprocess the test example the same way we did for training set exampels                                  
      cleaned_example = preprocess(example) 
             
      #simply get the posterior probability of every example                                  
      post_prob = self.get_prob_for_example(cleaned_example) #get prob of this example for both classes
      #simply pick the max value and map against self.classes!
      predictions.append(self.unq_classes[np.argmax(post_prob)])
            
    return np.array(predictions) 



In [6]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset = 'train')

train_data = newsgroups_train.data #getting all trainign examples
train_labels = newsgroups_train.target #getting training labels

In [7]:
np.unique(train_labels)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [8]:
nb = NaiveBayesFromScratch(np.unique(train_labels).shape[0]) #instantiate a NB class object
print ("---------------- Training In Progress --------------------")

nb.fit(train_data,train_labels) #start tarining by calling the train function
print ('----------------- Training Completed ---------------------')

---------------- Training In Progress --------------------
----------------- Training Completed ---------------------


In [9]:
newsgroups_test = fetch_20newsgroups(subset='test') #loading test data
test_data = newsgroups_test.data #get test set examples
test_labels = newsgroups_test.target #get test set labels


In [10]:
pclasses = nb.predict(test_data) #get predcitions for test set

#check how many predcitions actually match original test labels
test_acc = np.sum(pclasses == test_labels) / float(test_labels.shape[0]) 

print ("Test Set Examples: ", test_labels.shape[0]) 
print ("Test Set Accuracy: ", test_acc * 100, "%") 

Test Set Examples:  7532
Test Set Accuracy:  78.71747211895911 %


In [13]:
cleaned_train_data = [preprocess(ele) for ele in train_data]
cleaned_test_data = [preprocess(ele) for ele in test_data]

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

clf = Pipeline([('cv', CountVectorizer()), ('nb', MultinomialNB())])

clf.fit(cleaned_train_data, train_labels)
print("------------Training Done ----------")
predictions = clf.predict(cleaned_test_data)

test_acc_sklearn = np.sum(pclasses == test_labels) / float(test_labels.shape[0]) 


print ("Test Set Examples: ", test_labels.shape[0]) 
print ("Test Set Accuracy: ", test_acc_sklearn * 100, "%") 

------------Training Done ----------
Test Set Examples:  7532
Test Set Accuracy:  78.71747211895911 %
