In [2]:

from pathlib import Path
from collections import Counter
from nltk.stem import PorterStemmer
import math
from decimal import *
import numpy as np
import pandas as pd
from IPython.display import display




def truncate_float(float_number, decimal_places):
    multiplier = 10 ** decimal_places
    return int(float_number * multiplier) / multiplier




#CALCULATES THE IG OF EACH WORD
def IG(w, total_pos, total_neg, path_pos, path_neg):
    total_reviews= total_pos + total_neg
    
    
     
    files_positive = Path(path_pos).glob('*.txt')
    files_negative = Path(path_neg).glob('*.txt')
     
    
    
    P1= (total_pos/total_reviews)  #probability of positive reviews
    P0= (total_neg/total_reviews)  #probability of negative reviews
    
    entropy= -P1*math.log2(P1)-P0*math.log2(P0)
    
    
    #Probabilities of C=pos and w=1 & C=neg and w=1
    pos_counter=0
    neg_counter=0 
    total_counter=0
    
    
    for f in files_positive:
        if w in f.read_text():
            pos_counter+=1
            total_counter+=1
      
    for f in files_negative:
        if w in f.read_text():
            neg_counter+=1
            total_counter+=1
    
    
    #Probability of w=1 and w=0 
    P_w1 = total_counter/total_reviews
    #print("Probability of ", w, "=1: ", P_w1)
    P_w0 = (total_reviews- total_counter) / total_reviews
    #print("Probability of ", w, "=0: ", P_w0)

        
   
    #Probability of a C=pos and C=neg when w=1
    if P_w1!=0:
        P1_w1= pos_counter/total_counter 
        #print("probability positive reviews with ", w,"=1: ", P1_w1)
        P0_w1= neg_counter/total_counter
        #print("probability negative reviews with ", w,"=1: ", P0_w1)
    else:
        P1_w1=0
        P0_w1=0
    
    
    
    #Probability of a C=pos and C=neg when w=0
    if P_w0!=0:
        P1_w0 = (total_pos-pos_counter)/(total_reviews- total_counter)
        #print("probability positive reviews with ", w,"=0: ", P1_w0)
        P0_w0 = (total_neg-neg_counter)/(total_reviews - total_counter)
        #print("probability positive reviews with ", w,"=0: ", P0_w0)
    else:
        P1_w0 = 0
        P0_w0 = 0
    
    
    entropy_w1= (0 if P1_w1==0 else -P1_w1*math.log2(P1_w1)) - (0 if P0_w1==0 else P0_w1*math.log2(P0_w1))
    #print("Entropy of ", w, "=1: ", entropy_w1)
    entropy_w0= (0 if P1_w0==0 else -P1_w0*math.log2(P1_w0)) - (0 if P0_w0==0 else P0_w0*math.log2(P0_w0))
    #print("Entropy of ", w, "=0: ", entropy_w0)

    
    if entropy!=1.0:
        ig= truncate_float(entropy, 6) - truncate_float((P_w1*entropy_w1 + P_w0*entropy_w0), 6)
    else:
        getcontext().prec = 6
        ig= Decimal(1) - Decimal((P_w1*entropy_w1 + P_w0*entropy_w0))
       
    
    
    print("IG of ",w , "is: ", ig)
    
    return float(ig)







#FILTERS AN ARRAY OF WORDS BASED ON THEIR IG
def IG_filter(words_filtered, total_pos, total_neg, path_pos, path_neg):
    result=[]
    
    for w in words_filtered:
        ig=IG(w, total_pos, total_neg, path_pos, path_neg)
        if ig >=0.005:
            result.append(w)
    
    return result



In [3]:
class tokenizer():

    #FINDING THE MOST USEFUL ATTRIBUTES (WORDS) IN ALL THE TRAINING DATA
    #CONVERTING THE TRAINING DATA INTO VECTORS OF THE ATTRIBUTES WE CHOSE
    def tokenize(self,path_pos, path_neg):
        
        files_positive = Path(path_pos).glob('*.txt')
        files_negative = Path(path_neg).glob('*.txt')
        
        

        total_pos=0
        total_neg=0

        text= ""
        i=0
        for file in files_positive:
            #print(file.name)
            
            text= text+file.read_text()  # the file is opened and closed
            total_pos += 1
            
            
        for file in files_negative:
            #print(file.name)
            
            total_neg+=1
            text= text+file.read_text()  
            


        total_reviews= total_pos+total_neg  

        print("Total positive reviews: ",  total_pos)
        print("Total negative reviews: ", total_neg)



        #FILTERING THE REVIEWS TO KEEP THE USEFUL WORDS
    

        split_it = text.split()
        bad_chars = [';', ':'," ", "!", "*", "(", ")", '\"', ".", ",", "/", ">", "<"]

        for i in range(len(split_it)):
            for b in bad_chars:
                if b in split_it[i]:
                    split_it[i] = split_it[i].replace(b, '')


        Counters_found = Counter(split_it)
        most_occur = Counters_found.most_common(1000)



        ps = PorterStemmer()
        words=[]
        for i in range(len(most_occur)):
            words.append(most_occur[i][0])
                


        words_to_exclude = ['-', '--', ';', ':', "!", "*", " ", "(", ")", '\"', ".", 'it\'s', 'br', 'mr', 'there\'s', 'your', 'wasn\'t', 'Ms', 'can', 'do', 'were', 'how', 'get', 'will', 'also', 'been', 'some', 'into', 'because', 'about', 'out', 'me', 'up', 'down', 'my', 'mine', 'their', 'she', 'he\'s', 'the','you', 'an', 'his', 'him', 'her', 'or', 'was', 'have', 'has', 'had', 'in', 'i', 'he', 'we', 'they', 'their', 'theirs', 'which','what', 'where', 'be', 'they', 'has', 'so',  'by', 'who', 'that','this', 'those', 'your', 'these', 'on', 'there', 'and', 'to', 'a', 'it', 'its', 'for', 'if', 'then', 'is', 'at', 'are', 'of', 'no', 'as', 'but', 'with', 'there']    

        words_filtered=[]
        for i, w in enumerate(words):
            
            
            for b in bad_chars:
                if b in words[i]:
                    words[i]= words[i].replace(b,'')
                    w=words[i]
                

            
            
            if w.lower() in words_to_exclude:
                #print('removing ' +w)
                #words.remove(w)
                continue

            words_filtered.append(words[i])        
                


        words_filtered= words_filtered[20:len(words_filtered)-148]
                
        c=1
        for i in  words_filtered:
            print(c,'. '+i)
            c=c+1
            


        
        self.words_final= IG_filter(words_filtered, total_pos, total_neg, path_pos, path_neg)

            
            
        files_positive2 =Path(path_pos).glob('*.txt')
        files_negative2= Path(path_neg).glob('*.txt')
            
        #TOKENIZES EACH TEXT IN THE MOST USEFUL WORDS WE FOUND
        self.vectors= np.zeros([total_reviews, len(self.words_final)+1], dtype=int)
        i=0
        for file in files_positive2:
            i+=1
            for j,w in enumerate(self.words_final):
                if w in file.read_text():
                    self.vectors[i][j]=1
            self.vectors[i][len(self.words_final)]=1  # initializing the last cell as 1 (category cell)
    
        i=0
        for file in files_negative2:
            i+=1
            for j, w in enumerate(self.words_final):
                if w in file.read_text():
                    self.vectors[i][j]=1
            #the last cell in the negative reviews is already 0 
        
        
        vectors_df= pd.DataFrame(self.vectors, columns=self.words_final+ ['Category'])
        display(vectors_df)
        
        return self.vectors
    
    
    
    
    
    def tokenize_new(self, path_pos, path_neg ):
        
        files_positive =Path(path_pos).glob('*.txt')
        files_negative= Path(path_neg).glob('*.txt')
        
        new_vectors= []
        
        
        for f in files_positive:
            v=[]
            for w in self.words_final:
                if w in f.read_text():
                    v.append(1)
                else: 
                    v.append(0)
            v.append(1)
            new_vectors.append(v)            
                    
                    
        
        for f in files_negative:
            v=[]
            for w in self.words_final:
                if w in f.read_text():
                    v.append(1)
                else: 
                    v.append(0)
            v.append(0)
            new_vectors.append(v)
            
            
        vectors_final= np.array(new_vectors,dtype=int)
            
        vectors_df= pd.DataFrame(vectors_final, columns=self.words_final+ ['Category'])
        display(vectors_df)
        
        return vectors_final
            
            

In [4]:
array= []
array. append([1,2,3])
array.append([4,5,6])
array.append([7,8,9])
print(array)


arraynp = np.array(array, dtype=int)
arraynp

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]


array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [38]:

class Probability:
    
    def __init__(self, data, total_pos, total_neg):
        self.data= data
        self.total_pos = total_pos
        self.total_neg = total_neg
    
    
    
    
    
    def prob_tables_training_data(self):
        
        print("Starting to build the probability tables")
        self.pX_1= np.zeros([2,self.data.shape[1]], dtype=float)
        self.pX_1[1][self.data.shape[1]-1] = 1 
        self.pX_0 = np.zeros([2,self.data.shape[1]], dtype=float)
        self.pX_0[1][self.data.shape[1]-1] = 1 
        
        
        #P(X|C=0)
        for j in range(self.data.shape[1]-1):  #we use -1 in order not to count the probability of the category row too
            X1_C0 = 0
            X0_C0 = 0
            for i in range(self.data.shape[0]):
                if self.data[i][self.data.shape[1]-1] != 0:
                    continue
                
                if self.data[i][j] ==1:
                    X1_C0+=1
                else: 
                    X0_C0+=1
                
                    
            self.pX_1[0][j]= (X1_C0+1)/(self.total_neg+2)
            self.pX_0[0][j] = (X0_C0+1)/(self.total_neg+2)
            print(self.pX_1[0][j], self.pX_0[0][j])
            
            
        #P(X|C=1)
        for j in range(self.data.shape[1]-1):  #we use -1 in order not to count the probability of the category row too
            X1_C1 = 0
            X0_C1 = 0
            for i in range(self.data.shape[0]):
                if self.data[i][self.data.shape[1]-1] != 1:
                    continue
                
                if self.data[i][j] ==1:
                    X1_C1+=1
                else: 
                    X0_C1+=1
                    
            self.pX_1[1][j]= (X1_C1+1)/(self.total_pos+2)
            self.pX_0[1][j] = (X0_C1+1)/(self.total_pos+2)
            print(self.pX_1[1][j], self.pX_0[1][j])
    
        print("Finished building the probability tables")
    
    
    
    
    # def prob(self, pos,x,c):
    
    #     X_counter=0
    #     C_counter=0
    #     for i in range(self.vectors.shape[0]):
    #         if self.vectors[i][self.vectors.shape[1]-1]!=c:
    #             continue
    #         C_counter+=1
    #         if self.vectors[i][pos]==x:
    #             X_counter+=1 
                
    #     return (X_counter+1)/(C_counter+2)   #using Laplace estimator
     
    
    
    
    def P1_X(self, vector):
        p1= self.total_pos/(self.total_pos+self.total_neg)
        
        pX_C1 =1
        for i, w in enumerate(vector):
            if w==0:
                pX_C1 = pX_C1*self.pX_0[1][i]
            else: 
                pX_C1 = pX_C1*self.pX_1[1][i]
                
            
            
        return pX_C1*p1 



    def P0_X(self, vector):
        p0= self.total_neg/(self.total_pos+self.total_neg)
        
        pX_C0 =1
        for i, w in enumerate(vector):
            if w==0:
                pX_C0 = pX_C0*self.pX_0[0][i]
            else: 
                pX_C0 = pX_C0*self.pX_1[0][i]
            
            
        return pX_C0*p0
    
    
    
    
    
    
    
       
    
    # def P1_X(self, vector, total_pos, total_neg):
    #     p1= total_pos/(total_pos+total_neg)
        
    #     pX_C1= 1
        
    #     for i, w in enumerate(vector):
    #         #word= words_final[i]
    #         if w==0:
    #             pX_C1*= self.prob(i, 0, 1)
    #             print("CALCULATING prob of X when C=1")
    #         else:
    #             pX_C1*=self.prob(i, 1, 1)
    #             print("CALCULATING prob of x=0 when c=0")

    #         if i==(len(vector)-1):
    #             break
    #     return pX_C1
    
    
    
    # def P0_X(self, vector, total_pos, total_neg):
    #     p0= total_neg/(total_pos+total_neg)
        
    #     pX_C0= 1
        
    #     for i, w in enumerate(vector):
    #         #word= words_final[i]
    #         if w==0:
    #             pX_C0*= self.prob(i, 0, 0)
    #             print("CALCULATING prob of x=0 when c=0")
    #         else:
    #             pX_C0*=self.prob(i, 1, 0)
    #             print("CALCULATING prob of x=0 when c=0")

    #         if i==(len(vector)-1):
    #             break
    #     return pX_C0

In [23]:

def classification_report(y_real, y_pred):
    print(y_pred)
    reports= np.zeros([2,3])
    #For neg
    true_pos = 0
    false_pos = 0
    true_neg = 0
    false_neg = 0
    
    
    # corr_ident= 0
    # incorr_ident= 0  
    # total_neg= 0
    for i, y in enumerate(y_pred):
        if y== y_real[i] and y==1:
            true_pos+=1
        if y!= y_real[i] and y==1:
            false_pos+=1
        if y!= y_real[i] and y==0:
            false_neg+=1
        if y==y_real[i] and y==0:
            true_neg+=1

    recall= true_pos/(true_pos+false_neg)
    precision = true_pos/ (true_pos +false_pos)
    print(recall)
    print(precision)
    f1_score= 2*(precision*recall)/(precision+recall)
    
    reports[0][0]= recall
    reports[0][1]= precision
    reports[0][2]= f1_score
    
    
    
    
    #For pos
    corr_ident= 0
    incorr_ident= 0  
    total_neg= 0
    for i, y in enumerate(y_pred):
        if y== y_real[i] and y==1:
            corr_ident+=1
        if y!= y_real[i] and y==1:
            incorr_ident+=1
    
    
    
    for y in y_real:
        if y==1:
            total_neg+=1
    
    recall= corr_ident/total_neg
    precision = corr_ident/ (corr_ident +incorr_ident)
    f1_score= 2*(precision*recall)/(precision+recall)
    
    reports[1][0]= recall
    reports[1][1]= precision
    reports[1][2]= f1_score
    
    
    reports_df= pd.DataFrame(reports, columns=['Recall', 'Precision', 'F1-score'], index=['0', '1'])
    
    display(reports_df)
    return reports
    
    

In [7]:
#SPLIT FUNCTION IN ORDER TO SPLIT DATA INTO X AND Y
def split(vectors):
    y=[]
    for i in range(vectors.shape[0]):
        y.append(vectors[i][vectors.shape[1]-1])
        
    x=np.zeros([vectors.shape[0], vectors.shape[1]-1])
    for i in range(vectors.shape[0]):
        for j in range(vectors.shape[1]):
            if j==(vectors.shape[1]-1):
                continue
            x[i][j]=vectors[i][j]
            
    return x, y

In [24]:
class naive_bayes:

        
    
    def fit(self, x, y):
        self.total_pos= 0
        self.total_neg=0
        self.data= np.zeros([x.shape[0],x.shape[1]+1], dtype=int)
        for i in range(x.shape[0]):
            
            for j in range(x.shape[1]):
                #if j== self.data.shape[1]:
                 #   self.data[i][j]= y[i]
                
                self.data[i][j]= x[i][j]
                
            

        for i in range(self.data.shape[0]):
            self.data[i][self.data.shape[1]-1]= y[i]
            if y[i]== 1:
                self.total_pos+=1
            else:
                self.total_neg+=1
                    
        print("after the 2 for")
        #data_df= pd.DataFrame(self.data, columns=words_final+ ['Category'])
        #display(data_df)
        self.p= Probability(self.data, self.total_pos, self.total_neg)
        return self.data
        
        
        
        
    def predict(self, x_data):
        y_data= [0]*x_data.shape[0]
        #p=Probability(self.data)
        self.p.prob_tables_training_data()
        print("starting predictions")
        for i in range(x_data.shape[0]):
            if self.p.P1_X(x_data[i]) > self.p.P0_X(x_data[i]):
                y_data[i]= 1
            else:
                y_data[i]=0
            
        print("finished predictions")
        return y_data
            
            

In [9]:

#TRAINING THE ALGORITHM

#files_positive =Path('/Users/michail/Downloads/aclImdb/train/pos').glob('*.txt')
#files_negative= Path('/Users/michail/Downloads/aclImdb/train/neg').glob('*.txt')

path_pos= '/Users/michail/Downloads/aclImdb/train/pos'
path_neg= '/Users/michail/Downloads/aclImdb/train/neg'

t = tokenizer()

train_vectors= t.tokenize(path_pos, path_neg)




Total positive reviews:  12500
Total negative reviews:  12500
1 . much
2 . other
3 . people
4 . bad
5 . great
6 . most
7 . first
8 . made
9 . well
10 . make
11 . could
12 . them
13 . way
14 . any
15 . don't
16 . too
17 . think
18 . movies
19 . characters
20 . films
21 . seen
22 . character
23 . being
24 . many
25 . watch
26 . two
27 . 
28 . never
29 . acting
30 . plot
31 . little
32 . after
33 . know
34 . did
35 . best
36 . does
37 . love
38 . show
39 . life
40 . ever
41 . better
42 . off
43 . over
44 . say
45 . end
46 . scene
47 . still
48 . scenes
49 . such
50 . should
51 . through
52 . something
53 . go
54 . here
55 . back
56 . doesn't
57 . real
58 . thing
59 . didn't
60 . watching
61 . man
62 . I'm
63 . years
64 . actors
65 . makes
66 . find
67 . work
68 . few
69 . actually
70 . going
71 . same
72 . though
73 . funny
74 . lot
75 . before
76 . while
77 . old
78 . look
79 . why
80 . nothing
81 . part
82 . another
83 . cast
84 . quite
85 . want
86 . seems
87 . &
88 . pretty
89 . got
9

Unnamed: 0,bad,great,well,could,don't,acting,plot,best,love,life,...,ridiculous,Oh,crap,cheap,today,ways,dull,form,fantastic,Category
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
2,1,1,1,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,1
3,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,1
4,0,1,1,1,0,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
path_pos= '/Users/michail/Downloads/aclImdb/test/pos'
path_neg= '/Users/michail/Downloads/aclImdb/test/neg'

test_vectors= t.tokenize_new(path_pos, path_neg)

Unnamed: 0,bad,great,well,could,don't,acting,plot,best,love,life,...,ridiculous,Oh,crap,cheap,today,ways,dull,form,fantastic,Category
0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
24996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24997,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
24998,0,0,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [25]:
x_train, y_train= split(train_vectors)

In [39]:
nb= naive_bayes()
nb.fit(x_train, y_train)
print('fit done')
y_pred = nb.predict(x_train)


after the 2 for
fit done
Starting to build the probability tables
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.99992001

In [40]:
classification_report(y_train, y_pred)

[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

Unnamed: 0,Recall,Precision,F1-score
0,0.99872,1.0,0.99936
1,0.99872,1.0,0.99936


array([[0.99872   , 1.        , 0.99935959],
       [0.99872   , 1.        , 0.99935959]])

In [41]:
x_test, y_test= split(test_vectors)

In [42]:
y_pred = nb.predict(x_test)

Starting to build the probability tables
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.998720204767237e-05 0.9999200127979523
7.99872020476723

In [47]:
classification_report(y_test, y_pred)

[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 

Unnamed: 0,Recall,Precision,F1-score
0,0.93632,0.495953,0.648439
1,0.93632,0.495953,0.648439


array([[0.93632   , 0.49595322, 0.64843901],
       [0.93632   , 0.49595322, 0.64843901]])