# In [3]:
#Naive Bayes Classifier 
#Sandipbhai Pravinbhai Viradiya
#Student ID: 19921091

from collections import defaultdict #Loading defaultdict from collection for initilizations
import pandas as pd # Load the Pandas libraries with alias 'pd' 
import numpy as np
import re
import string
import nltk
from sklearn.model_selection import train_test_split

#NaiveBayesClassifier Start

# Classifier to give positive or negative class of Tweet
class NaiveBayesClassifier(object):
    """Multinomial Naive Bayes classifier for tweet sentiment.

    Tweets are normalised and tokenised by ``processText``; ``train`` estimates
    a log prior per class and a Laplace-smoothed log likelihood per
    (class, word); ``predict`` picks the class with the highest summed log
    score for each document.

    Attributes:
        logprior: dict mapping class label -> log prior of that class.
        loglikelihoog: dict mapping class label -> {word: log likelihood}.
            (The misspelt name is kept so existing users of the attribute
            keep working.)
        prohibitExp: HTML entities stripped from every tweet.
        all_classes: the distinct labels seen by the last ``train`` call.
        all_voc: the distinct tokens seen by the last ``train`` call.
        tokenizer: optional callable ``str -> list of tokens``; when ``None``
            ``nltk.word_tokenize`` is used.
    """

    # Patterns used by processText. Raw strings are used so that "\." and "\s"
    # reach the regex engine without Python's invalid-escape warnings.
    _URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    _MENTION_RE = re.compile(r'@[^\s]+')
    _SPACES_RE = re.compile(r'[\s]+')
    _HASHTAG_RE = re.compile(r'#([^\s]+)')
    _DOTS_RE = re.compile(r'\.+')
    _DASHES_RE = re.compile(r'\-+')
    # Tokens such as 's or 'll: a quote followed by one or two characters.
    _SHORT_QUOTED_RE = re.compile(r"\'[A-Za-z0-9]{1,2}$")
    # Tokens such as 'without: a quote followed by three or more characters.
    _LONG_QUOTED_RE = re.compile(r"\'[A-Za-z0-9]{3,}")

    #Constructor
    def __init__(self, tokenizer=None):
        """Create an untrained classifier.

        Args:
            tokenizer: optional callable taking the cleaned tweet string and
                returning a list of string tokens. Defaults to
                ``nltk.word_tokenize`` (resolved lazily in ``processText``).
        """
        self.logprior = {} # Variable to store log prior for both of the class
        self.loglikelihoog = {} # Variable to store log likelihood for both of the class
        self.prohibitExp = ["&quot;","&lt;","&gt;"] # Remove double quote, >, < HTML Special chars
        self.all_classes = [] # Classes in which we are gonna classify, Here it will be 0 and 1
        self.all_voc = [] # All words from all documents
        self.tokenizer = tokenizer # None means "use nltk.word_tokenize"

    #Clean the tweet
    def processText(self, tweet):
        """Normalise a tweet and return its list of word tokens.

        Steps, in order: lower-case; replace URLs and @mentions with
        placeholders; collapse whitespace; drop the '#' of hashtags; strip
        leading/trailing quotes; turn runs of dots and dashes into a space;
        remove the HTML entities in ``prohibitExp``; tokenise; drop
        punctuation tokens and the placeholders; drop short quote-prefixed
        tokens ('s, 'll) and strip the quote from longer ones ('without).

        Args:
            tweet: the raw tweet text.

        Returns:
            A new list of cleaned string tokens.
        """
        tweet = tweet.lower()

        # Placeholders are upper-case, so they cannot collide with real
        # (already lower-cased) words and can be filtered out after tokenising.
        tweet = self._URL_RE.sub('URL', tweet)
        tweet = self._MENTION_RE.sub('AT_USER', tweet)

        tweet = self._SPACES_RE.sub(' ', tweet)
        tweet = self._HASHTAG_RE.sub(r'\1', tweet)
        tweet = tweet.strip('\'"')
        tweet = self._DOTS_RE.sub(" ", tweet)
        tweet = self._DASHES_RE.sub(" ", tweet)

        #replace &quot; &lt; &gt;
        for pw in self.prohibitExp:
            tweet = tweet.replace(pw,"")

        #Tokenize string
        tokenize = getattr(self, "tokenizer", None)
        if tokenize is None:
            tokenize = nltk.word_tokenize
        tokens = tokenize(tweet)

        # Build a fresh list instead of removing from the list being iterated:
        # in-place removal skipped the token following every removed one.
        cleaned = []
        for tok in tokens:
            # Substring test against string.punctuation (also true for "").
            if tok in string.punctuation or tok in ("AT_USER", "URL"):
                continue
            if self._SHORT_QUOTED_RE.match(tok):
                continue
            if self._LONG_QUOTED_RE.match(tok):
                tok = tok.replace("'", "")
            cleaned.append(tok)

        return cleaned

    #Count n_c for prior for each class
    #Count frequency of each word for each class
    #Will return both
    def countWordInClasses(self, training_set, training_labels):
        """Count documents per class and token frequencies per class.

        Args:
            training_set: iterable of raw tweet strings.
            training_labels: iterable of labels, parallel to ``training_set``.

        Returns:
            A tuple ``(cnt, n_c, voc)`` where ``cnt[label][token]`` is the
            number of occurrences of ``token`` in documents of ``label``,
            ``n_c[label]`` is the number of documents of ``label`` and ``voc``
            is the set of all distinct tokens.
        """
        cnt = {}
        n_c = {}
        voc = set()

        for c in self.all_classes:
            cnt[c] = defaultdict(int)
            n_c[c] = 0

        for doc, doc_label in zip(training_set, training_labels):
            # Tolerate labels not registered in all_classes (e.g. when this
            # method is called directly) instead of failing with KeyError.
            if doc_label not in cnt:
                cnt[doc_label] = defaultdict(int)
                n_c[doc_label] = 0
            n_c[doc_label] += 1
            class_counts = cnt[doc_label]
            for tkn in self.processText(doc):
                voc.add(tkn)
                class_counts[tkn] += 1

        return cnt, n_c, voc

    #Train using training set and training labels
    def train(self, training_set, training_labels, alpha = 1):
        """Fit log priors and smoothed log likelihoods from labelled tweets.

        Args:
            training_set: sized collection of raw tweet strings.
            training_labels: labels parallel to ``training_set``.
            alpha: additive (Laplace) smoothing constant.
        """
        #Get number of documents
        n_doc = len(training_set)

        #Unique classes
        self.all_classes = set(training_labels)

        # Start from a clean slate so re-training does not keep stale classes.
        self.logprior = {}
        self.loglikelihoog = {}

        # all_words_with_count is frequency of each word with class
        # n_c is number of documents class wise to use in log-prior
        # self.all_voc contains all unique words
        all_words_with_count, n_c, self.all_voc = self.countWordInClasses(training_set, training_labels)
        total_words = len(self.all_voc) # We need this in denominator for log-likelihood

        for c in self.all_classes:
            # log-prior for this class
            self.logprior[c] = np.log(n_c[c] / n_doc)

            class_counts = all_words_with_count[c]
            total_cnt_in_class = sum(class_counts.values())
            denominator = total_cnt_in_class + ( alpha * total_words )

            # log-likelihood of every vocabulary word under this class
            self.loglikelihoog[c] = {
                word: np.log( (class_counts.get(word, 0) + alpha) / denominator )
                for word in self.all_voc
            }

    @staticmethod
    def _percentage(numerator, denominator):
        """Return numerator/denominator as a percentage, or 0.0 if undefined."""
        if denominator == 0:
            return 0.0
        return numerator / denominator * 100

    def _classify(self, tokens, classes):
        """Return the class in ``classes`` with the highest log score.

        Ties go to the class that comes first in ``classes``.
        """
        if not classes:
            raise ValueError("NaiveBayesClassifier must be trained before predicting")

        scores = {}
        for c in classes:
            likelihood = self.loglikelihoog[c]
            score = self.logprior[c]
            for word in tokens:
                # Words never seen in training carry no evidence; skip them.
                if word in likelihood:
                    score += likelihood[word]
            scores[c] = score

        # max() returns the first maximal element, so ties favour classes[0].
        return max(classes, key=scores.get)

    #Predict if tweet if positive or negative from trained data
    #Here test_labels can be optional
    #If test_labels IS PASSED, it will print accuracy, precision and recall
    #If test_labels IS NOT PASSED, it will print document with predicted sentiment
    def predict(self, test_data, test_labels = None):
        """Predict a class for every document and report the outcome.

        Args:
            test_data: iterable of raw tweet strings.
            test_labels: optional true labels parallel to ``test_data``. The
                printed metrics treat label 1 as positive and 0 as negative.

        Returns:
            The list of predicted labels, in the order of ``test_data``.

        Raises:
            ValueError: if a document must be classified before ``train`` has
                seen any labelled data.
        """
        # Sorted order makes tie-breaking deterministic: on a 0-vs-1 tie the
        # prediction is 0.
        try:
            classes = sorted(self.all_classes)
        except TypeError:
            # Labels that cannot be ordered against each other.
            classes = list(self.all_classes)

        pred_list = [self._classify(self.processText(doc), classes) for doc in test_data]

        if test_labels is not None and len(test_labels) > 0:
            true_pos = 0
            true_neg = 0
            false_pos = 0
            false_neg = 0

            for real, system in zip(test_labels,pred_list):
                if real == 1 and system == 1:
                    true_pos += 1
                elif real == 1 and system == 0:
                    false_neg += 1
                elif real == 0 and system == 1:
                    false_pos += 1
                elif real == 0 and system == 0:
                    true_neg += 1

            accuracy = ( true_pos + true_neg ) / len(test_labels) * 100
            # Guarded: with no predicted (or no actual) positives the ratio is
            # undefined and is reported as 0 instead of raising.
            precision = self._percentage(true_pos, true_pos + false_pos)
            recall = self._percentage(true_pos, true_pos + false_neg)
            print("Accuracy: ", round(accuracy,2), "%",sep="")
            print("Precision: ", round(precision,2), "%",sep="")
            print("Recall: ", round(recall,2), "%",sep="")
        else:
            for data, system in zip(test_data,pred_list):
                print(data.strip(), ' "',system, '" ', sep = "")

        return pred_list
            
#NaiveBayesClassifier End




# Note for Prof. Manas
# The file's encoding style is UTF-8, but for me only ISO-8859-1 works to read it.
# Please change it to UTF-8 if the provided one does not work on your system.
data = pd.read_csv("train.csv",encoding="ISO-8859-1")

# Take the text and label columns directly; copying them row by row with
# iterrows() builds a Series per row and is far slower for the same result.
all_set = data['SentimentText'].tolist()
all_labels = data['Sentiment'].tolist()

#Divide data into 80% Train and 20% Test dataset
training_set, validation_set, training_labels, validation_labels = train_test_split(all_set, all_labels, test_size=0.2)

#Track time for our train and predict algo
import time
start = time.time()

#Initialization of Naive Bayes Classifier
NBClassifier = NaiveBayesClassifier()

#Train our model on training set and labels
NBClassifier.train(training_set, training_labels, alpha=1)

#Predict sentiment from given document
#Here if validation labels are not passed, it prints each document with its sentiment ELSE it prints accuracy, precision and recall percentage
NBClassifier.predict(validation_set, validation_labels)

# We can also pass only documents to get the predicted sentiment
#docs = ["waahhh now I'm getting sad....miss hub :-'(&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;","Rose and ood will be back in the Xmas Who special!  YAY!  Damn that's half a year away."]
#NBClassifier.predict(docs)

end = time.time()
print('NBClassifier Ran in {} seconds'.format(round(end - start, 3)))

# Recorded output:
# Accuracy: 76.03%
# Precision: 78.73%
# Recall: 79.03%
# NBClassifier Ran in 15.53 seconds
