# Extra Credit - Advanced Baseline System

## Yelp Reviews

Here we are going to take the same baseline system developed previously and use it on Yelp Reviews to seperate the good reviews (or those that are 3.5 stars and above) and the bad reviews (less than 3.5 stars)

The first step is to import the various python libraries that would be needed in the code

In [1]:
import nltk
import numpy as np
import math
import glob
from string import punctuation
from nltk.corpus import stopwords
from collections import Counter

#### Extracting lists from sentiment lexicons

In [2]:
#creating negative lexicon
with open("negative-words.txt",encoding = "ISO-8859-1") as file:
    content = file.readlines()
content = [x.strip() for x in content] 
negative_words = [x for x in content if not x.startswith(";")]
negative_words.remove('') # removing observed garbage value

#creating positive lexicon
with open("positive-words.txt",encoding = "ISO-8859-1") as file:
    content = file.readlines()
content = [x.strip() for x in content] 
positive_words = [x for x in content if not x.startswith(";")]
positive_words.remove('')



#### Extracting the Text

##### Creating a list of all reviews

In [3]:
filename = "yelp_reviews/all_reviews.txt"

with open(filename) as file:
    text = file.read()

reviews = text.split("]]]")
del reviews[10391] #removing a particular garbage value
reviews[0] = '\n'+reviews[0] # adding hashtag to keep consistency

#### Evaluate each review to return star rating and text

In [4]:
def evaluate_review(review):
    stars =  review[5] 
    text = review[21:]
    return int(stars),text
    

### Cleaning Text

In [5]:
def clean_text(text):
    temp_text=""
    text = text.lower()  #remove all upper cases
    
# the following feature has a very heavy weightage in accuracy
#   helps remove punctuations
    for i in text:
        if i not in punctuation:
             temp_text=temp_text +i
    
    text=temp_text
    tokens = text.split()  #split list into elements 
    
    #then remove all numerical text from the tokens
    tokens = [word for word in tokens if word.isalpha()]
    
    # the next step is to remove all stop words from the text
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    return tokens

#### Cleaning text and updating the Vocabulary (The vocabulary needs to be constantly updated)

In [6]:
def update_vocab(tokens,vocab):    
    vocab.update(tokens)
#testing for this function
# update_vocab("review_polarity/txt_sentoken/pos/cv199_9629.txt")
# print(vocab.most_common(50))

#### Splitting reviews into training and testing lists

In [8]:
training_review = reviews[:8500]
testing_review = reviews[8500:]
# print (len(training_review))
# print (len(testing_review))

#### Eliminating some of the minimum occurring words and saving the vocabulary as a list

In [9]:
def save_vocab(vocab):
    minimum_occurance = 2
    voc = [i for i,j in vocab.items() if j>= minimum_occurance]
    return voc

### Creating the Vocabulary using the previous functions

In [10]:
# A counter that would be used for vocabulary
pos_vocab = Counter() # initialize the counter to be used throughout
neg_vocab = Counter()

for i in training_review:
    stars,text = evaluate_review(i)
    tokens = clean_text(text)
    if stars > 3:
        update_vocab(tokens,pos_vocab)
    elif stars < 3:
        update_vocab(tokens,neg_vocab)
    elif stars == 3:
        if ('3.5' in tokens) and ('stars' in tokens):
            update_vocab(tokens,pos_vocab)
        else:
            update_vocab(tokens,neg_vocab)

#Removing words that hardly occur
pos_vocabulary= save_vocab(pos_vocab) #.most_common(5000))
neg_vocabulary = save_vocab(neg_vocab) #.most_common(5000))
#print(pos_vocabulary)


# the following feature was remove because it greatly reduced accuracy
#Removing words that are not in lexicon
# pos_vocabulary = [i for i in pos_vocabulary if i in positive_words]
# neg_vocabulary = [i for i in neg_vocabulary if i in negative_words]
        
# print(neg_vocabulary)

### Testing

In [11]:
def test_review():
    correct = 0
    for i in testing_review:
        stars,text = evaluate_review(i)
        tokens = clean_text(i)
        pos_decision =0
        neg_decision = 0
        for i in tokens:
            if i in pos_vocabulary:
                pos_decision += 1 #* pos_vocab[i]/pos_sum   # weights were taken off cos of reason stated above
            if i in neg_vocabulary:
                neg_decision +=1 #* neg_vocab[i]/neg_sum
        if pos_decision >= neg_decision :
            pos = 1
        else:
            pos = -1
        if stars > 3:
            if pos == 1:
                correct += 1
        elif stars < 3:
            if pos == -1:
                correct += 1
        elif stars == 3:
            if ('3.5' in tokens) and ('stars' in tokens):
                if pos == 1:
                    correct += 1
            else:
                if pos == -1:
                    correct += 1
    return correct

In [12]:
number_of_corrects = test_review() #we are testing on the positive files
accuracy =  number_of_corrects/len(testing_review) * 100

print ("Accuracy is "+ str(accuracy))

Accuracy is 76.25594923320995
