# Term Project 
## Sentiment Classifier
### Name - Vismayak Mohanarajan (mohanar2)

The aim of the project is to create a sentiment classifer that analyzes data and classifies it into two categories- positive or negative. The goal is to start with a baseline system and improve it till we get a performant system. 

The first step is to import the various python libraries that would be needed in the code

In [1]:
import nltk
import numpy as np
import math
import glob
from string import punctuation
from nltk.corpus import stopwords
from collections import Counter

## Baseline System
### Bag-Of-Words

We first use a basic Bag-of-Words approach to help analyze the text. To create a Bag-Of-Words we need to perform the following steps:-

#### Extracting Lexicons

In [2]:
#creating negative lexicon
with open("negative-words.txt",encoding = "ISO-8859-1") as file:
    content = file.readlines()
content = [x.strip() for x in content] 
negative_words = [x for x in content if not x.startswith(";")]
negative_words.remove('') # removing observed garbage value

#creating positive lexicon
with open("positive-words.txt",encoding = "ISO-8859-1") as file:
    content = file.readlines()
content = [x.strip() for x in content] 
positive_words = [x for x in content if not x.startswith(";")]
positive_words.remove('')



#### Extracting the Text

In [3]:
#function to help read the text
def read_file(filename):
    with open(filename) as file:  
        data = file.read()
    return data

#Testing for the read function 
# text = read_file("review_polarity/txt_sentoken/pos/cv199_9629.txt")
# print(text)
    

#### Cleaning the Text

In [4]:
# function to help clean the text
def clean_text(text):
    temp_text =""
    # first let us remove all the punctuation in the text 
    for i in text:
        if i not in punctuation:
             temp_text=temp_text +i
    text = temp_text
    tokens = text.split()
    #then remove all numerical text from the tokens
    tokens = [word for word in tokens if word.isalpha()]
    # the next step is to remove all stop words from the text
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    return tokens

# Testing of the cleaning text function 
# words = clean_text(text=text)
# print(words)

#### Updating the Vocabulary (The vocabulary needs to be constantly updated)

In [5]:
def update_vocab(filename,vocab):
    tokens = clean_text(read_file(filename))
    vocab.update(tokens)
#testing for this function
# update_vocab("review_polarity/txt_sentoken/pos/cv199_9629.txt")
# print(vocab.most_common(50))

#### Function to read all the files in  the directory

In [6]:
#function reads all files and updates vocabulary 
def read_directory(directory,vocab):
    for name in glob.glob(directory +'cv[0-6]*'):
        #print(name)
        update_vocab(name,vocab)

#testing for the function
# read_directory("review_polarity/txt_sentoken/pos/")
# read_directory("review_polarity/txt_sentoken/neg/")
# print(len(vocab))
#print(vocab)

#### Eliminating some of the minimum occurring words and saving the vocabulary as a list

In [7]:
def save_vocab(vocab):
    minimum_occurance = 2
    voc = [i for i,j in vocab.items() if j>= minimum_occurance]
    return voc

### Creating the Vocabulary using the previous functions

In [8]:
# A counter that would be used for vocabulary
pos_vocab = Counter() # initialize the counter to be used throughout
neg_vocab = Counter()

#making vocab from training functions
read_directory("review_polarity/txt_sentoken/pos/",pos_vocab)
read_directory("review_polarity/txt_sentoken/neg/",neg_vocab)
#print([(i,pos_vocab[i]) for i in pos_vocab])

# print(pos_vocab.most_common(5000))
#Removing words that hardly occur
pos_vocabulary= save_vocab(pos_vocab) #.most_common(5000))
neg_vocabulary = save_vocab(neg_vocab) #.most_common(5000))
#print(pos_vocabulary)

#Removing words that are not in lexicon
pos_vocabulary = [i for i in pos_vocabulary if i in positive_words]
neg_vocabulary = [i for i in neg_vocabulary if i in negative_words]
#print(pos_vocabulary)
#print(len(pos_vocab.most_common(5000)))
# print(Vocabulary)
# print(vocab)

### Testing

In [9]:
def test_directory(directory):
    pos =0
    neg = 0
    pos_sum =0
    neg_sum = 0
    
    #the following lines were used to make weighted additions but the accuracy fell because of 
    #discrepancies in the neative vocab data
#     for i in pos_vocabulary:
#         pos_sum += pos_vocab[i]
#     pos_sum = pos_sum/len(pos_vocabulary)
#     for i in pos_vocabulary:
#         neg_sum += neg_vocab[i]
#     neg_sum = neg_sum/len(neg_vocabulary)
    for name in glob.glob(directory +'cv[7-9]*'):
        tokens = clean_text(read_file(name))
        pos_decision =0
        neg_decision = 0
        for i in tokens:
            if i in pos_vocabulary:
                pos_decision += 1 #* pos_vocab[i]/pos_sum   # weights were taken off cos of reason stated above
            if i in neg_vocabulary:
                neg_decision +=1 #* neg_vocab[i]/neg_sum
        if pos_decision >= neg_decision :
            pos += 1
        else:
            neg += 1
    return pos,neg

#testing the function 
# a,b = test_directory("review_polarity/txt_sentoken/pos/")
# print(a)
# print(b)

In [10]:
#let us now calculate the accuracy of the baseline system
pos,neg = test_directory("review_polarity/txt_sentoken/pos/") #we are testing on the positive files
print(pos,neg)

correct = pos # makes sense because the correct ones would be the number of postives
total = pos + neg

tp = pos      #true positive
fn = neg      #false negative 


pos,neg = test_directory("review_polarity/txt_sentoken/neg/") #we are testing on the negative files
print(pos,neg)

fp = pos     #false postive 

correct += neg 
total += pos + neg

recall = tp/(tp+fn) * 100 
precision = tp/(tp+fp) * 100
accuracy =  correct/total *100 

print ("Accuracy is "+ str(accuracy))
print ("Precision is "+ str(precision))
print ("Recall is "+ str(recall))


220 80
99 201
Accuracy is 70.16666666666667
Precision is 68.96551724137932
Recall is 73.33333333333333


### Naive Bayes Classifier 

In [11]:
pos_sum = 0
neg_sum = 0
for i in pos_vocabulary:
    pos_sum+= pos_vocab[i]
for i in neg_vocabulary:
    neg_sum+= neg_vocab[i]

In [12]:
def NB_classifier(directory):
    neg = 0
    pos = 0 
    for name in glob.glob(directory +'cv[7-9]*'):
        pos_prob =0
        neg_prob = 0
        tokens = clean_text(read_file(name))
        for i in tokens:
            if i in pos_vocabulary:
                pos_prob += math.log(pos_vocab[i]/pos_sum)
            else:
                pos_prob += math.log(1/pos_sum)
            if i in neg_vocabulary:
                neg_prob += math.log(neg_vocab[i]/pos_sum)
            else:
                neg_prob += math.log(1/neg_sum)
        if pos_prob >= neg_prob:
            pos += 1
        else:
            neg += 1
    return pos,neg

In [13]:
#let us now calculate the accuracy of the NB system
pos,neg = NB_classifier("review_polarity/txt_sentoken/pos/") #we are testing on the positive files
print(pos,neg)

correct = pos # makes sense because the correct ones would be the number of postives
total = pos + neg

tp = pos      #true positive
fn = neg      #false negative 


pos,neg = NB_classifier("review_polarity/txt_sentoken/neg/") #we are testing on the negative files
print(pos,neg)

fp = pos     #false postive 

correct += neg 
total += pos + neg

accuracy =  correct/total *100 
recall = tp/(tp+fn) * 100 
precision = tp/(tp+fp) * 100
print ("Accuracy is "+ str(accuracy))
print ("Precision is "+ str(precision))
print ("Recall is "+ str(recall))


208 92
81 219
Accuracy is 71.16666666666667
Precision is 71.97231833910035
Recall is 69.33333333333334
