# Programming Assignment 4: Naïve Bayes

In [186]:
import heapq
import math
import string
from collections import Counter

import numpy as np
import pandas as pd
from matplotlib import pyplot
from nltk.corpus import stopwords


In [187]:
# Load the labelled tweets and give the two columns short names:
# 'a_s' is the sentiment label, 'text' is the tweet itself.
tweetdf = pd.read_csv(r'Tweets.csv')
tweetdf.columns = ['a_s', 'text']

# Partition the rows by sentiment label.
neutraltag = tweetdf.loc[tweetdf['a_s'] == 'neutral']
positivetag = tweetdf.loc[tweetdf['a_s'] == 'positive']
negativetag = tweetdf.loc[tweetdf['a_s'] == 'negative']

In [188]:
def _split_80_20(frame):
    """Return (first 80% of the rows, remaining rows) of `frame`, order unchanged."""
    cut = int(len(frame) * 0.8)
    return frame[0:cut], frame[cut:len(frame)]


# Each class is split separately, so every class keeps the same 80/20 ratio.
neutral_train, neutral_test = _split_80_20(neutraltag)
positive_train, positive_test = _split_80_20(positivetag)
negative_train, negative_test = _split_80_20(negativetag)


In [189]:
# Flatten the per-class splits into single arrays. The class order is always
# neutral, positive, negative, so texts (X) and labels (Y) stay aligned.
trainX = np.concatenate([part['text'] for part in (neutral_train, positive_train, negative_train)])
trainY = np.concatenate([part['a_s'] for part in (neutral_train, positive_train, negative_train)])
testX = np.concatenate([part['text'] for part in (neutral_test, positive_test, negative_test)])
testY = np.concatenate([part['a_s'] for part in (neutral_test, positive_test, negative_test)])

In [190]:
# Per-class tweet texts, indexed 0 = neutral, 1 = positive, 2 = negative.
# The three Series usually have different lengths, which makes this a ragged
# sequence: NumPy >= 1.24 raises ValueError for implicit ragged arrays, so the
# object dtype has to be requested explicitly.
arraytrain = np.array(
    [neutral_train['text'], positive_train['text'], negative_train['text']],
    dtype=object,
)
arraytest = np.array(
    [neutral_test['text'], positive_test['text'], negative_test['text']],
    dtype=object,
)

In [191]:
# One-hot encoding of the sentiment labels: one indicator column per distinct label.
onehotencoded = pd.get_dummies(tweetdf['a_s'])

#labelencoding
def labelEncoding(dataframe, newarray=None):
    """Map sentiment label strings to integer class ids.

    Parameters
    ----------
    dataframe : iterable of str
        Labels to encode: 'neutral' -> 0, 'positive' -> 1, 'negative' -> 2.
    newarray : ignored
        Never read; kept (now optional) only so that existing two-argument
        calls keep working.

    Returns
    -------
    numpy.ndarray
        Encoded ids in input order. Labels other than the three listed above
        are skipped, so the result can be shorter than the input.
    """
    codes = {'neutral': 0, 'positive': 1, 'negative': 2}
    return np.asarray([codes[label] for label in dataframe if label in codes])

In [192]:
# To process the X data (the tweet text), we represent each tweet as a bag of words
# over a dictionary with a reduced vocabulary size.
import re
# def LowerCase(lines):
#         text = text.lower()
        
# def RemoveNonWordAndDoubleSpace(lines):
#         text = re.sub(r'\W',' ',text)
#         text = re.sub(r'\s+',' ',text)
        
# array1 = []
# def CleanedDataSet(lines, array):
#     for text in lines:
#         LowerCase(lines)
#         RemoveNonWordAndDoubleSpace(lines)
#         array1.append(text)
#     array1 = pd.DataFrame(array1, columns=["text"])
#     return array1

def CleanedDataSet(data):
    """Normalise raw texts and return them as a one-column DataFrame.

    Each text is lower-cased, every non-word character is replaced by a
    space, and runs of whitespace are collapsed to a single space (leading
    and trailing spaces are not stripped).

    Parameters
    ----------
    data : iterable of str
        Raw texts.

    Returns
    -------
    pandas.DataFrame
        A single column named "text" with one cleaned string per input.
    """
    def _normalise(raw):
        lowered = raw.lower()
        spaced = re.sub(r'\W', ' ', lowered)
        return re.sub(r'\s+', ' ', spaced)

    return pd.DataFrame([_normalise(text) for text in data], columns=["text"])
    



In [196]:
#Finding Frequent words for bag of words 
def findfrequency(tweets):
    """Build the bag-of-words vocabulary from cleaned tweets.

    Despite the name, no frequencies are computed: the result is every
    distinct whitespace-separated token, in order of first appearance, with
    NLTK stop words removed. `stopwords.words()` is called without a language,
    so the stop lists of all languages in the NLTK corpus are applied.

    Parameters
    ----------
    tweets : iterable of str
        Cleaned tweet texts.

    Returns
    -------
    list of str
        Unique non-stop-word tokens in first-seen order.
    """
    # dict keys keep insertion order and make the duplicate check O(1).
    vocabulary = dict.fromkeys(w for t in tweets for w in t.split())
    # Load the stop words once, into a set, instead of re-reading the whole
    # corpus and scanning a list for every vocabulary word.
    stop_words = set(stopwords.words())
    return [word for word in vocabulary if word not in stop_words]

In [197]:
#Bag of words

def bagofwords(tweets, totalwords):
    """Turn each tweet into a vector of token counts over a fixed vocabulary.

    Parameters
    ----------
    tweets : iterable of str
        Cleaned tweet texts; tokens are obtained with ``str.split()``.
    totalwords : sequence of str
        Vocabulary; position i of every output vector counts ``totalwords[i]``.

    Returns
    -------
    list of list of int
        One count vector per tweet, each of length ``len(totalwords)``.
    """
    vectorTweets = []
    for t in tweets:
        # Count whole tokens. Counting substrings of the raw tweet would also
        # count "cat" inside "category".
        token_counts = Counter(t.split())
        # A Counter returns 0 for words that do not occur in the tweet.
        vectorTweets.append([token_counts[tw] for tw in totalwords])
    return vectorTweets


In [None]:
# Process the X and Y data. The training labels are not encoded because the
# training tweets are kept grouped per class instead (one-hot encoding was not needed).

# labelEncoding never reads its second argument; pass None instead of Y_Test,
# which is not defined yet at this point and raised NameError on a fresh kernel.
Y_Test = labelEncoding(testY, None)

# Vocabulary: unique non-stop-word tokens of all cleaned training tweets.
mostfreq_stop = findfrequency(CleanedDataSet(trainX).text)

# One bag-of-words count matrix per class, in the class order of arraytrain.
Final_X_Train = [
    pd.DataFrame(bagofwords(CleanedDataSet(class_tweets).text, mostfreq_stop)).to_numpy()
    for class_tweets in (arraytrain[0], arraytrain[1], arraytrain[2])
]
X_Train = Final_X_Train

# The test tweets are vectorised over the same training vocabulary.
X_Test = pd.DataFrame(bagofwords(CleanedDataSet(testX).text, mostfreq_stop)).to_numpy()
    

In [None]:
def trainNaiveBayes(trainX, X_Train):
    """Fit a multinomial Naive Bayes model with add-one (Laplace) smoothing.

    Parameters
    ----------
    trainX : sequence
        All training documents. Only its length is used, as the total
        document count for the class priors.
    X_Train : sequence of 2-D array-like
        One count matrix per class (rows = documents, columns = vocabulary
        features). All matrices must have the same number of columns.

    Returns
    -------
    log_prior : numpy.ndarray, shape (n_classes,)
        Log of (documents in class / total documents).
    log_prob : numpy.ndarray, shape (n_classes, n_features)
        Log of the smoothed per-class feature probabilities
        (count + 1) / (class total + n_features).
    """
    N_documents = len(trainX)

    log_prior = []
    log_prob = []
    for class_counts in X_Train:
        log_prior.append(np.log(len(class_counts) / N_documents))

        # Total occurrences of each feature over the documents of this class.
        feature_counts = np.sum(class_counts, axis=0)
        N_features = len(feature_counts)

        # Add-one smoothing: no feature gets zero probability (log(0) = -inf).
        smoothed_counts = feature_counts + 1
        smoothed_total = np.sum(feature_counts) + N_features
        log_prob.append(np.log(smoothed_counts / smoothed_total))

    return np.array(log_prior), np.array(log_prob)

# Fit the model; log_prior and log_prob are passed to testNaiveBayes below.
log_prior, log_prob = trainNaiveBayes(trainX, X_Train)



In [None]:
def testNaiveBayes(log_prior, log_prob, X_Test):
    predictions = []
    
    test_prob_neutral = np.dot(X_Test,log_prob[0])
    test_prob_neutral = test_prob_neutral + log_prior[0]
    
    test_prob_positive = np.dot(X_Test,log_prob[1])
    test_prob_positive = test_prob_positive + log_prior[1]
    
    test_prob_negative = np.dot(X_Test,log_prob[2])
    test_prob_negative = test_prob_negative + log_prior[2]
    
    m = len(X_Test)
    bayes_output = []
    
    for i in range(m):
        max_prob = max(test_prob_neutral[i], test_prob_positive[i], test_prob_negative[i])
        temp = -1
        if max_prob == test_prob_neutral[i]:
                temp = 0
        if max_prob == test_prob_positive[i]:
                temp = 1
        if max_prob == test_prob_negative[i]:
                temp = 2
        bayes_output.append(temp)
                
    return bayes_output

# One predicted class id per row of X_Test; compared with Y_Test by evaluateFinal below.
prediction_label = testNaiveBayes(log_prior, log_prob, X_Test)

In [None]:
def evaluateFinal(prediction, expected):
    """Print the confusion matrix and micro/macro-averaged metrics (3 classes).

    Parameters
    ----------
    prediction : iterable of int
        Predicted class ids, each in {0, 1, 2}.
    expected : iterable of int
        True class ids, each in {0, 1, 2}, aligned with `prediction`.

    Returns
    -------
    None
        Everything is printed. The confusion matrix is indexed
        ``[predicted][expected]``. The macro F1 score is the harmonic mean of
        the macro precision and the macro recall (not the mean of per-class
        F1 scores).
    """
    n_classes = 3
    confusion_matrix = np.zeros((n_classes, n_classes), dtype=int)
    for E, P in zip(expected, prediction):
        confusion_matrix[P][E] += 1

    truep = np.diag(confusion_matrix)
    # Rows are PREDICTED classes: the rest of row k was predicted as k but
    # belongs to another class -> false positives of class k.
    falsep = confusion_matrix.sum(axis=1) - truep
    # Columns are EXPECTED classes: the rest of column k truly is class k but
    # was predicted as something else -> false negatives of class k.
    falsen = confusion_matrix.sum(axis=0) - truep
    truen = confusion_matrix.sum() - truep - falsep - falsen

    print(confusion_matrix)

    # Micro averages: pool the counts of all classes, then compute the metric.
    microA_precision = truep.sum() / (falsep.sum() + truep.sum())
    microA_recall = truep.sum() / (falsen.sum() + truep.sum())
    microA_accuracy = (truep.sum() + truen.sum()) / (
        falsen.sum() + truep.sum() + falsep.sum() + truen.sum()
    )
    microA_f1score = (microA_precision * microA_recall) / (microA_precision + microA_recall)
    microA_f1score = microA_f1score * 2

    # Macro averages: compute the metric per class, then average the classes.
    # A class that is never predicted (or never expected) gives 0/0 = nan here.
    macroA_precision = truep / (truep + falsep)
    macroA_recall = truep / (truep + falsen)
    macroA_accuracy = (truep + truen) / (truep + truen + falsep + falsen)

    sum_macroA_precision = macroA_precision.sum() / n_classes
    sum_macroA_recall = macroA_recall.sum() / n_classes
    sum_macroA_accuracy = macroA_accuracy.sum() / n_classes

    macroA_f1score = (sum_macroA_precision * sum_macroA_recall) / (sum_macroA_precision + sum_macroA_recall)
    macroA_f1score = 2 * macroA_f1score

    print("\nMicro Average Precision: ", microA_precision)
    print("\nMicro Average Recall: ", microA_recall)
    print("\nMicro Average Accuracy: ", microA_accuracy)
    print("\nMicro Average F1-Score: ", microA_f1score)
    print("\n")
    print("\nMacro Average Precision: ", sum_macroA_precision)
    print("\nMacro Average Recall: ", sum_macroA_recall)
    print("\nMacro Average Accuracy: ", sum_macroA_accuracy)
    print("\nMacro Average F1-Score: ", macroA_f1score)

# Print the confusion matrix and the micro/macro metrics for the test-set predictions.
evaluateFinal(prediction_label, Y_Test)  