In [28]:
%run pre_processing.ipynb

<class 'scipy.sparse.csr.csr_matrix'>
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
<class 'numpy.ndarray'>
['aa', 'aba', 'abandon', 'abbswinston', 'abc', 'abcnew', 'abil', 'abl', 'ablaz', 'about', 'absolut', 'abstorm', 'abus', 'accept', 'access', 'accid', 'accident', 'accionempresa', 'accord', 'account', 'accus', 'achiev', 'acid', 'acr', 'across', 'act', 'action', 'activ', 'actual', 'ad', 'add', 'address', 'admit', 'adopt', 'adult', 'advanc', 'advisori', 'af', 'affect', 'affili', 'afghan', 'afghanistan', 'afp', 'afraid', 'africa', 'after', 'afternoon', 'aftershock', 'ag', 'again']
[('ash', 169), ('australia', 192), ('collaps', 532), ('trent', 2721), ('bridg', 355), ('among', 99), ('worst', 2941), ('histori', 1240), ('england', 859), ('bundl', 381), ('great', 1140), ('michigan', 1652), ('techniqu', 2592), ('camp', 408), ('thank', 2617), ('hail', 1171), ('cnn', 523), ('tennesse', 2602), ('movi', 1714), ('theater',

In [29]:
# e-1: implement the BernoulliNB Naive Bayes classifier, without using any existing machine learning libraries.

import numpy as np

class Bernoulli_NaiveBayes:
    """Bernoulli Naive Bayes classifier implemented with NumPy only.

    Every feature is treated as binary: a non-zero value means "present" and a
    zero value means "absent". The same rule is applied while training and
    while predicting, so count-valued inputs (e.g. bag-of-words counts) are
    handled consistently.

    Attributes set by ``train``:
        classes: sorted array of the distinct labels seen in ``y_train``.
        prior_prob_log: array of shape (K,), log P(y = c_k).
        condi_prob_log: array of shape (2, K, n); index 0 holds
            log P(x_j = 0 | y = c_k) and index 1 holds log P(x_j = 1 | y = c_k).
    """

    def __init__(self, alpha=1):
        """Create an untrained classifier.

        Args:
            alpha: additive smoothing factor. The default of 1 is Laplace
                smoothing, which avoids zero probabilities.
        """
        self.alpha = alpha

    @staticmethod
    def _as_dense_array(X):
        """Return X as an ndarray, densifying objects that expose ``toarray()``."""
        # Duck-typed so sparse matrices can be accepted without importing scipy here.
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

    def _cal_prior_prob_log(self, y, classes):
        """Compute the smoothed log prior log P(y = c_k) for every class.

        Args:
            y: 1-D sequence of training labels.
            classes: 1-D array of the distinct class labels.

        Returns:
            Array of shape (len(classes),) with the log prior of each class,
            in the same order as ``classes``.
        """
        y = np.asarray(y)
        classes = np.asarray(classes)
        class_num = len(classes)
        sample_num = len(y)

        # Broadcasting (K, 1) against (N,) gives a (K, N) membership mask;
        # counting along axis 1 yields the number of samples per class.
        c_num = np.count_nonzero(y == classes[:, None], axis=1)
        prior_prob = (c_num + self.alpha) / (sample_num + class_num * self.alpha)
        return np.log(prior_prob)

    def _cal_condi_prob_log(self, X, y, classes):
        """Compute the smoothed log conditionals log P(x_j = v | y = c_k).

        Args:
            X: 2-D array of shape (N, n); non-zero entries mean "feature present".
            y: 1-D sequence of N training labels.
            classes: 1-D array of the K distinct class labels.

        Returns:
            Array of shape (2, K, n): ``[0, k, j]`` is log P(x_j = 0 | y = c_k)
            and ``[1, k, j]`` is log P(x_j = 1 | y = c_k).
        """
        X = self._as_dense_array(X)
        y = np.asarray(y)
        n = X.shape[1]
        K = len(classes)

        condi_prob_log = np.empty((2, K, n))

        for k, c in enumerate(classes):
            X_c = X[y == c]  # all training samples of class c_k
            total_num = X_c.shape[0]
            # Number of class-c samples in which each feature is present.
            num_f1 = np.count_nonzero(X_c, axis=0)
            # The "2" is the number of values a Bernoulli feature can take.
            condi_prob_f1 = (num_f1 + self.alpha) / (total_num + self.alpha * 2)

            condi_prob_log[0, k] = np.log(1 - condi_prob_f1)
            condi_prob_log[1, k] = np.log(condi_prob_f1)

        return condi_prob_log

    def train(self, x_train, y_train):
        """Fit the model: store the classes, log priors and log conditionals.

        Args:
            x_train: 2-D array of shape (N, n) of training features.
            y_train: 1-D sequence of N training labels.
        """
        x_train = self._as_dense_array(x_train)
        y_train = np.asarray(y_train)
        self.classes = np.unique(y_train)
        self.prior_prob_log = self._cal_prior_prob_log(y_train, self.classes)
        self.condi_prob_log = self._cal_condi_prob_log(x_train, y_train, self.classes)

    def _predict_single_sample(self, x):
        """Predict the class label of one 1-D feature vector.

        Args:
            x: 1-D array of n feature values.

        Returns:
            The element of ``self.classes`` with the highest posterior score.
        """
        K = len(self.classes)
        po_prob_log = np.empty(K)

        # Binarise exactly as training does (non-zero == present); testing
        # ``x == 1`` would treat counts of 2 or more as "absent".
        index_f1 = np.asarray(x) != 0
        index_f0 = ~index_f1

        for k in range(K):
            # Log of the numerator of the posterior: log prior + sum of log likelihoods.
            po_prob_log[k] = self.prior_prob_log[k] \
                                + np.sum(self.condi_prob_log[0, k][index_f0]) \
                                + np.sum(self.condi_prob_log[1, k][index_f1])

        # Map the winning index back to the actual label so that label sets
        # other than 0..K-1 are predicted correctly.
        return self.classes[np.argmax(po_prob_log)]

    def predict(self, X):
        """Predict labels for one sample (1-D input) or many samples (2-D input).

        Args:
            X: 1-D array of n features, or 2-D array of shape (M, n).

        Returns:
            A single label for 1-D input, otherwise a list of M labels.
        """
        X = self._as_dense_array(X)
        if X.ndim == 1:
            return self._predict_single_sample(X)
        return [self._predict_single_sample(row) for row in X]

    def cal_f1_score(self, true, predict, pos_label=1):
        """Compute the F1 score of ``pos_label``.

        Args:
            true: sequence of ground-truth labels.
            predict: sequence of predicted labels, same length as ``true``.
            pos_label: label regarded as the positive class (default 1).

        Returns:
            The F1 score as a float; 0.0 when there are no true positives,
            since precision and/or recall are then zero or undefined.

        Raises:
            ValueError: if ``true`` and ``predict`` differ in length.
        """
        true = list(true)
        predict = list(predict)
        if len(true) != len(predict):
            raise ValueError(
                "true and predict must have the same length, got %d and %d"
                % (len(true), len(predict))
            )

        TP = 0
        FP = 0
        FN = 0
        for t, p in zip(true, predict):
            if t == pos_label and p == pos_label:
                TP += 1
            elif t == pos_label:
                FN += 1
            elif p == pos_label:
                FP += 1

        # Without true positives the ratios below would divide by zero.
        if TP == 0:
            return 0.0

        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        F1_Score = 2 * (precision * recall) / (precision + recall)
        return F1_Score

In [30]:
# e-2: Train the Bernoulli_NaiveBayes classifier on the train set, and report its mean F1-score on the dev set.
# NOTE(review): train_bow_vectors, test_bow_vectors, y_train and y_test are not defined in this
# cell; presumably they are created by the `%run pre_processing.ipynb` cell — confirm.
# NOTE(review): the task text says "dev set" while the variables are named *test* — verify that
# they refer to the same split.

x_train = train_bow_vectors
y_train = np.array(y_train)  # labels as an ndarray so they can be compared element-wise against class values
x_test = test_bow_vectors

# Fit on the training features/labels, then predict a label for every row of x_test.
BernoulliNB = Bernoulli_NaiveBayes()
BernoulliNB.train(x_train,y_train)
y_pred = BernoulliNB.predict(x_test)

# Print the F1 score of the predictions against the ground-truth labels y_test.
print (BernoulliNB.cal_f1_score(y_test,y_pred))

0.7611518915866743
