In [74]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
import re

In [75]:
def preprocess_string(str_arg):
    cleaned_str=re.sub('[^a-zA-Z]+',' ',str_arg,flags=re.IGNORECASE) #every char except alphabets is replacedd
    cleaned_str=re.sub('(\s+)',' ',cleaned_str) #multiple spaces are replaced by single space
    cleaned_str=cleaned_str.lower() #converting the cleaned string to lower case
    
    return cleaned_str # returning the preprocessed string 

In [76]:
my_str="Hello 23 F# cker   "
preprocess_string(my_str).split()

['hello', 'f', 'cker']

In [114]:
class NaiveBayes:
    
    def __init__(self,unique_classes):
        
        self.classes=unique_classes # Constructor is sinply passed with unique number of classes of the training sett
        
        
    def addToBow(self,example,dict_index):
        
        print("Ex 1: ",example)
        
        if isinstance(example,np.ndarray):
            example=example[0]
            print("is instance executed")
        
        print("Ex 2: ",example)
        print("dict indx:",dict_index)
        
        for token_word in example.split(): #for every word in preprocessed example
            self.bow_dicts[dict_index][token_word]+=1 #increment in its count
            
            
    def train(self,dataset,labels):
        self.examples=dataset
        self.labels=labels
        self.bow_dicts=np.array([defaultdict(lambda:0) for index in range(self.classes.shape[0])])
        print("Init Bow Dict",self.bow_dicts)
        
        if not isinstance(self.examples,np.ndarray): self.examples=np.array(self.examples)
        if not isinstance(self.labels,np.ndarray): self.labels=np.array(self.labels)
            
        #constructing BoW for each category
        print("Self.Labels",self.labels==0)
        for cat_index,cat in enumerate(self.classes):
            all_cat_examples=self.examples[self.labels==cat]
            
        
            cleaned_examples=[preprocess_string(cat_example) for cat_example in all_cat_examples]
            print("Cleaned Ex 1: ",cleaned_examples)
            print("Cleaned Ex 1 type: ",type(cleaned_examples))
        
            cleaned_examples=pd.DataFrame(data=cleaned_examples)
            print("Cleaned Ex 2: ",cleaned_examples)
            print("Cleaned Ex 2 type: ",type(cleaned_examples))
            
            #now costruct BoW of this particular category
            np.apply_along_axis(self.addToBow,1,cleaned_examples,cat_index)
            
        prob_classes=np.empty(self.classes.shape[0])
        all_words=[]
        cat_word_counts=np.empty(self.classes.shape[0])
            
        for cat_index,cat in enumerate(self.classes):
            #Calculating prior probability p(c) for each class
            prob_classes[cat_index]=np.sum(self.labels==cat)/float(self.labels.shape[0]) 
            
            #Calculating total counts of all the words of each class 
            count=list(self.bow_dicts[cat_index].values())
            cat_word_counts[cat_index]=np.sum(np.array(count)) # |v| + 1 is remaining to be added
            
            #get all words of this category                                
            all_words+=self.bow_dicts[cat_index].keys()
                
        #combine all words of every category & make them unique to get vocabulary -V- of entire training set
        
        self.vocab=np.unique(np.array(all_words))
        self.vocab_length=self.vocab.shape[0]
        
            
        #computing denominator value                                      
        denoms=np.array([cat_word_counts[cat_index]+self.vocab_length+1 for cat_index,cat in enumerate(self.classes)])
            
            
        '''
        Now that we have everything precomputed as well, its better to organize everything in a tuple 
        rather than to have a separate list for every thing.
            
        Every element of self.cats_info has a tuple of values
        Each tuple has a dict at index 0, prior probability at index 1, denominator value at index 2
        '''
        
        self.cats_info=[(self.bow_dicts[cat_index],prob_classes[cat_index],denoms[cat_index]) for cat_index,cat in enumerate(self.classes)]                               
        self.cats_info=np.array(self.cats_info)
            
            
    def getExampleProb(self,test_example):
        likelihood_prob=np.zeros(self.classes.shape[0]) #to store probability w.r.t each clasS
        print("Likelihood Prob Init",likelihood_prob)
        #finding probability w.r.t each class of the given test example
        for cat_index,cat in enumerate(self.classes):
            
            for test_token in test_example.split(): #split the test example and get p of each test word
                print("Test token",test_token)
                print("Test Ex",test_example)
                #get total count of this test token from it's respective training dict to get numerator value                           
                test_token_counts=self.cats_info[cat_index][0].get(test_token,0)+1
                    
                #now get likelihood of this test_token word                              
                test_token_prob=test_token_counts/float(self.cats_info[cat_index][2])
                
                #remember why taking log? To prevent underflow!
                likelihood_prob[cat_index]+=np.log(test_token_prob)
                print("Likelihood Prob Vals",likelihood_prob)
                
        # we have likelihood estimate of the given example against every class but we need posterior probility
        post_prob=np.empty(self.classes.shape[0])
        print("Post Prob Init",post_prob)
        for cat_index,cat in enumerate(self.classes):
            post_prob[cat_index]=likelihood_prob[cat_index]+np.log(self.cats_info[cat_index][1])
            
        return post_prob
    
    
    def test(self,test_set):
        
        predictions=[] #to store prediction of each test example
        for example in test_set: 
                                              
            #preprocess the test example the same way we did for training set exampels                                  
            cleaned_example=preprocess_string(example) 
             
            #simply get the posterior probability of every example                                  
            post_prob=self.getExampleProb(cleaned_example) #get prob of this example for both classes
            
            #simply pick the max value and map against self.classes!
            predictions.append(self.classes[np.argmax(post_prob)])
                
        return np.array(predictions) 
    
    
        
        
    def print_data(self):
        print("Bow Dict",self.bow_dicts)
        print("Outer Bow type",type(self.bow_dicts))
        print("Inner Bow type",type(self.bow_dicts[0]))
        print("Bow Dict Shape",self.bow_dicts.shape)
        print("Bow Dict indx:0 ",self.bow_dicts[0])
        print("Bow Dict indx:1 ",self.bow_dicts[1])
        print("Self Cats Info",self.cats_info)
        print("Self Cats Shape",self.cats_info.shape)
        print("Self Cats Type",type(self.cats_info))
        print("Vocab Size",self.vocab_length)
        print("Vocab ",self.vocab)



In [120]:
import numpy as np
x = ["Simply loved it",
    "Most disgusting food i have ever had",
    "Stay away, very disgusting food",
     "Menu is absolutely perfect, loved it!",
    "A really good value for money",
     "This is a very good restaurant",
    "Terrible experience!",
    "This place has best food",
    "This place has most pathetic serving food!"]

y = [1,0,0,1,1,1,0,1,0]

In [121]:
y_labels = np.unique(y)
print(type(y_labels))

<class 'numpy.ndarray'>


In [122]:
nb = NaiveBayes(y_labels)

In [123]:
nb.train(x,y)

Init Bow Dict [defaultdict(<function NaiveBayes.train.<locals>.<listcomp>.<lambda> at 0x000001358465F790>, {})
 defaultdict(<function NaiveBayes.train.<locals>.<listcomp>.<lambda> at 0x000001358465FAF0>, {})]
Self.Labels [False  True  True False False False  True False  True]
Cleaned Ex 1:  ['most disgusting food i have ever had', 'stay away very disgusting food', 'terrible experience ', 'this place has most pathetic serving food ']
Cleaned Ex 1 type:  <class 'list'>
Cleaned Ex 2:                                              0
0        most disgusting food i have ever had
1              stay away very disgusting food
2                        terrible experience 
3  this place has most pathetic serving food 
Cleaned Ex 2 type:  <class 'pandas.core.frame.DataFrame'>
Ex 1:  ['most disgusting food i have ever had']
is instance executed
Ex 2:  most disgusting food i have ever had
dict indx: 0
Ex 1:  ['stay away very disgusting food']
is instance executed
Ex 2:  stay away very disgusting foo

In [124]:
nb.print_data()

Bow Dict [defaultdict(<function NaiveBayes.train.<locals>.<listcomp>.<lambda> at 0x000001358465F790>, {'most': 2, 'disgusting': 2, 'food': 3, 'i': 1, 'have': 1, 'ever': 1, 'had': 1, 'stay': 1, 'away': 1, 'very': 1, 'terrible': 1, 'experience': 1, 'this': 1, 'place': 1, 'has': 1, 'pathetic': 1, 'serving': 1})
 defaultdict(<function NaiveBayes.train.<locals>.<listcomp>.<lambda> at 0x000001358465FAF0>, {'simply': 1, 'loved': 2, 'it': 2, 'menu': 1, 'is': 2, 'absolutely': 1, 'perfect': 1, 'a': 2, 'really': 1, 'good': 2, 'value': 1, 'for': 1, 'money': 1, 'this': 2, 'very': 1, 'restaurant': 1, 'place': 1, 'has': 1, 'best': 1, 'food': 1})]
Outer Bow type <class 'numpy.ndarray'>
Inner Bow type <class 'collections.defaultdict'>
Bow Dict Shape (2,)
Bow Dict indx:0  defaultdict(<function NaiveBayes.train.<locals>.<listcomp>.<lambda> at 0x000001358465F790>, {'most': 2, 'disgusting': 2, 'food': 3, 'i': 1, 'have': 1, 'ever': 1, 'had': 1, 'stay': 1, 'away': 1, 'very': 1, 'terrible': 1, 'experience': 1

In [125]:
x_test = ["very good food and service"]
nb.test(x_test)

Likelihood Prob Init [0. 0.]
Test token very
Test Ex very good food and service
Likelihood Prob Vals [-3.29583687  0.        ]
Test token good
Test Ex very good food and service
Likelihood Prob Vals [-7.28482091  0.        ]
Test token food
Test Ex very good food and service
Likelihood Prob Vals [-9.8875106  0.       ]
Test token and
Test Ex very good food and service
Likelihood Prob Vals [-13.87649464   0.        ]
Test token service
Test Ex very good food and service
Likelihood Prob Vals [-17.86547869   0.        ]
Test token very
Test Ex very good food and service
Likelihood Prob Vals [-17.86547869  -3.38439026]
Test token good
Test Ex very good food and service
Likelihood Prob Vals [-17.86547869  -6.36331542]
Test token food
Test Ex very good food and service
Likelihood Prob Vals [-17.86547869  -9.74770568]
Test token and
Test Ex very good food and service
Likelihood Prob Vals [-17.86547869 -13.82524313]
Test token service
Test Ex very good food and service
Likelihood Prob Vals [-1

array([1])

In [8]:
# = {"b":2,"c":3}
d = defaultdict(lambda:"Not Present")
d["a"]=1
d["b"]=2
print(d)
l1 = list(d.values())
print(l1)
l2 = list(d)
print(l2)
print(type(d))


defaultdict(<function <lambda> at 0x00000217C3D2DC10>, {'a': 1, 'b': 2})
[1, 2]
['a', 'b']
<class 'collections.defaultdict'>


In [115]:
from sklearn.datasets import fetch_20newsgroups

In [116]:
categories=['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med'] 
newsgroups_train=fetch_20newsgroups(subset='train',categories=categories)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [117]:
train_data=newsgroups_train.data #getting all trainign examples
train_labels=newsgroups_train.target #getting training labels

In [127]:
print ("Total Number of Training Examples: ",len(train_data)) # Outputs -> Total Number of Training Examples:  2257
print ("Total Number of Training Labels: ",len(train_labels)) # Outputs -> #Total Number of Training Labels:  2257

Total Number of Training Examples:  2257
Total Number of Training Labels:  2257


In [128]:
nb=NaiveBayes(np.unique(train_labels)) #instantiate a NB class object
print ("---------------- Training In Progress --------------------")

---------------- Training In Progress --------------------


In [129]:
nb.train(train_data,train_labels) #start tarining by calling the train function
print ('----------------- Training Completed ---------------------')

Init Bow Dict [defaultdict(<function NaiveBayes.train.<locals>.<listcomp>.<lambda> at 0x000001F59D840430>, {})
 defaultdict(<function NaiveBayes.train.<locals>.<listcomp>.<lambda> at 0x000001F59D840D30>, {})
 defaultdict(<function NaiveBayes.train.<locals>.<listcomp>.<lambda> at 0x000001F59D840EE0>, {})
 defaultdict(<function NaiveBayes.train.<locals>.<listcomp>.<lambda> at 0x000001F59D840670>, {})]
----------------- Training Completed ---------------------


In [130]:
newsgroups_test=fetch_20newsgroups(subset='test',categories=categories) #loading test data
test_data=newsgroups_test.data #get test set examples
test_labels=newsgroups_test.target #get test set labels

print ("Number of Test Examples: ",len(test_data)) # Output : Number of Test Examples:  1502
print ("Number of Test Labels: ",len(test_labels)) # Output : Number of Test Labels:  1502

Number of Test Examples:  1502
Number of Test Labels:  1502


In [131]:
pclasses=nb.test(test_data) #get predcitions for test set

#check how many predcitions actually match original test labels
test_acc=np.sum(pclasses==test_labels)/float(test_labels.shape[0]) 

print ("Test Set Examples: ",test_labels.shape[0]) # Outputs : Test Set Examples:  1502
print ("Test Set Accuracy: ",test_acc*100,"%") # Outputs : Test Set Accuracy:  93.8748335553 %


Test Set Examples:  1502
Test Set Accuracy:  93.87483355525966 %


In [23]:
np.apply_along_axis?