In [56]:
import sklearn as sk
import numpy as np
import pandas as pd
import string 
import nltk # import Natural Language Toolkit
nltk.download('wordnet') # download the corpus of words the NLTK library uses
from nltk.stem import WordNetLemmatizer # import the lemmatizer


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zack\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Input the file name as a string
# Outputs two lists: [listOfReviews], [listOfLabels]
def loadAndParse(inputFileName):
    # Open file
    fIn = open(inputFileName)

    # split the the file into lines
    lines = fIn.read().splitlines()

    # Now split each line on tabs to get text and label
    reviews = []
    labels = []
    for review in lines:
        messageAndLabelList = review.split('\t')
        if(len(messageAndLabelList) != 2):
            print(review)
        message = messageAndLabelList[0]
        label = messageAndLabelList[1]
        reviews.append(message)
        labels.append(label)
    return reviews, labels


In [3]:
# Get three lists of reviews and three lists of labels
yelpReviews, yelpLabels = loadAndParse('sentiment_labelled_sentences/yelp_labelled.txt')
imdbReviews, imdbLabels = loadAndParse('sentiment_labelled_sentences/imdb_labelled.txt')
amazonReviews, amazonLabels = loadAndParse('sentiment_labelled_sentences/amazon_cells_labelled.txt')

# Make two big lists: one of all reviews and one of all labels, in matching order.
allReviews = []
allLabels = []
allReviews = yelpReviews + imdbReviews + amazonReviews
allLabels = yelpLabels + imdbLabels + amazonLabels


In [4]:
# Count the number of positive and negative reviews
amazon_positives , amazon_negatives = 0, 0
for label in amazonLabels:
    label = int(label)
    if label == 1:
        amazon_positives += 1
    if label == 0:
        amazon_negatives += 1
    
print("AMAZON: There are", amazon_positives, "positive reviews.")
print("AMAZON: There are", amazon_negatives, "negative reviews.")

AMAZON: There are 500 positive reviews.
AMAZON: There are 500 negative reviews.


In [5]:
# Count the number of positive and negative reviews
imdb_positives , imdb_negatives = 0, 0
for label in imdbLabels:
    label = int(label)
    if label == 1:
        imdb_positives += 1
    if label == 0:
        imdb_negatives += 1
    
print("IMDB: There are", imdb_positives, "positive reviews.")
print("IMDB: There are", imdb_negatives, "negative reviews.")

IMDB: There are 500 positive reviews.
IMDB: There are 500 negative reviews.


In [6]:
# Count the number of positive and negative reviews
yelp_positives , yelp_negatives = 0, 0
for label in yelpLabels:
    label = int(label)
    if label == 1:
        yelp_positives += 1
    if label == 0:
        yelp_negatives += 1
    
print("YELP: There are", yelp_positives, "positive reviews.")
print("YELP: There are", yelp_negatives, "negative reviews.")

YELP: There are 500 positive reviews.
YELP: There are 500 negative reviews.


In [7]:
# A working punctuation remover. It can do whole sentences.
def stripPunctuation(input):
    translation_table = dict.fromkeys(map(ord, '"\'1234567890$#%&!()*+,-./:;<=>?@[\]^_`{|}~'), None)
    output = input.translate(translation_table)
    # from: https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
    return output

In [8]:
# A working word lemmatizer. It works on single words.
def lemmatizeWord(input):
    lemmatize = WordNetLemmatizer()
    output = lemmatize.lemmatize(input)
    return output

In [9]:
# Remove punctuation and stop words from a sentence.
def cleanAndRemoveStopWords(input):
    stopWords = ['i', 'if', 'what','who','is','a','an','and', 'at','are','as','be','by','for', 'he', 'in', 'it', 'its', 'of', 'on', 'or', 'that', 'the', 'to', 'was', 'were', 'will', 'with']
    queryWords = stripPunctuation(input).split()
    resultWords = [lemmatizeWord(word) for word in queryWords if lemmatizeWord(word.lower()) not in stopWords]
    
    resultWordsLower = []
    for word in resultWords:
        resultWordsLower.append(word.lower())
    
    return resultWordsLower

## c.) Split the training set and test sets.

In [10]:
# Create the training and test sets for each of the three websites
amazonReviewsTrain = amazonReviews[0:400]
amazonLabelsTrain = amazonLabels[0:400]
amazonReviewsTest = amazonReviews[400: 500]
amazonLabelsTest = amazonLabels[400:500]

imdbReviewsTrain = imdbReviews[0:400]
imdbLabelsTrain = imdbLabels[0:400]
imdbReviewsTest = imdbReviews[400: 500]
imdbLabelsTest = imdbLabels[400:500]

yelpReviewsTrain = yelpReviews[0:400]
yelpLabelsTrain = yelpLabels[0:400]
yelpReviewsTest = yelpReviews[400: 500]
yelpLabelsTest = yelpLabels[400:500]

## d.) Bag of words model.
Extract features and then represent each review using bag of words
model, i.e., every word in the review becomes its own element in a feature vector. In order to
do this, first, make one pass through all the reviews in the training set (**Explain why** we can’t
use testing set at this point) and build a dictionary of unique words. Then,make another pass
through the review in both the training set and testing set and count up the occurrences of
each word in your dictionary. The i th element of a review’s feature vector is the number of
occurrences of the i th dictionary word in the review. Implement the bag of words model and
report feature vectors of any two reviews in the training set.

In [11]:
# Make lists of all training reviews and labels.
allReviewsTrain = amazonReviewsTrain + imdbReviewsTrain + yelpReviewsTrain
allLabelsTrain = amazonLabelsTrain + imdbLabelsTrain + yelpLabelsTrain

allReviewsTest = amazonReviewsTest + imdbReviewsTest + yelpReviewsTest
allLabelsTest = amazonLabelsTest + imdbLabelsTest + yelpLabelsTest

In [12]:
# Define a high level function that returns a non unique list of all non stop-words.
# As input, it takes the test set or the training set of reviews.

def cleanReviews(input):
    result = []
    for review in input:
        cleanReview = cleanAndRemoveStopWords(review)
        for cleanWord in cleanReview:
            result.append(cleanWord)
    return result

allWordsNotUnique = cleanReviews(allReviewsTrain)

In [13]:
# Define a function that makes a list of unique words (corpus), and a parallel list of counts 
# of each unique word. As input, it takes a list of non-unique words.
def makeListsOfUniquesAndCounts(input):
    uniques = []
    for word in input:
        if word not in uniques:
            uniques.append(word)
    # Sort the list of uniques
    uniques.sort()
    
    #now "uniques" is an alphabetical list of all unique words in the corpus.
    counts = [0]*len(uniques) # create a list of zeros as long as the number of unique words.
    for word in input:
        index = uniques.index(word)
        counts[index] += 1 
    return uniques, counts

# Call the function to define the corpus.
corpus, counts = makeListsOfUniquesAndCounts(allWordsNotUnique)


In [14]:
# Test each review against the corpus (from the training set). This defines each feature vector.
corpusSet = set(corpus)
def populateFeatureVectors(corpus, inputReviewsList):
    # feature vectors are row vectors, each column is a feature representing one word from the corpus.
    vectorsList = []
    for review in inputReviewsList:
        cleanedReview = cleanAndRemoveStopWords(review)
        featureVector = [0]*len(corpus)
        for word in cleanedReview:
            if word in corpusSet:
                index = corpus.index(word)
                featureVector[index] += 1 
        vectorsList.append(featureVector)
    return vectorsList


In [52]:
# Clean the training data.
vectorsList = populateFeatureVectors(corpus, allReviewsTrain)
# Make the list into a numpy array.
vectorsArray = np.array(vectorsList)
# Normalize each (now cleaned) row using the log norm, because it minimizes the variance of resulting vector.
normedTrain = np.log10(vectorsArray + 1)


# Clean the test data.
cleanedTestReviews = populateFeatureVectors(corpus, allReviewsTest)
# Make the list into a numpy array.
testArray = np.array(cleanedTestReviews)
# Norm the test data.
normedTest = np.log10(testArray + 1)


In [59]:
# Use a logistic regression model to predict the labels of the list of test reviews.
from sklearn.naive_bayes import GaussianNB

# Train the model.
sk.linear_model.LogisticRegression().fit(normedTrain, allLabelsTrain)
# Make predictions.
y_pred = sk.linear_model.LogisticRegression().fit(normedTrain, allLabelsTrain).predict(normedTest)
# Print some results from the logistic regression to see how it does (it does very badly).
    # print("Predicted labels: ", list(sk.linear_model.LogisticRegression().fit(normedTrain, allLabelsTrain).predict(normedTest[:15])) )
    # print("Actual labels:    ", allLabels[:15])
# Make a confusion matrix of the logistic regression results
y_true = allLabelsTest
logisticConfusionMtx = sk.metrics.confusion_matrix(y_true, y_pred)
logisticConfusionMtxDF = pd.DataFrame(np.array(logisticConfusionMtx), index=['Actual. T:', 'Actual. F:'], columns=['Pred. T', 'Pred. F'])

print("LOGISTIC REGRESSION")
print()
print("Confusion Matrix")
print(logisticConfusionMtxDF)
print()
print("Normalized Confusion Matrix")
print(logisticConfusionMtxDF/(len(y_true))) # Print the normalized confusion matrix


LOGISTIC REGRESSION

Confusion Matrix
            Pred. T  Pred. F
Actual. T:      114       34
Actual. F:       40      112

Normalized Confusion Matrix
             Pred. T   Pred. F
Actual. T:  0.380000  0.113333
Actual. F:  0.133333  0.373333


In [60]:
# Use Naive Bayes to predict the labels of the list of test reviews.
from sklearn.naive_bayes import GaussianNB

# Train the model.
y_predNB = GaussianNB().fit(normedTrain, allLabelsTrain)
# Make predictions.
y_predNB = GaussianNB().fit(normedTrain, allLabelsTrain).predict(normedTest)
# Make a confusion matrix of the Naive Bayes prediction results
y_true = allLabelsTest
NBConfusionMtx = sk.metrics.confusion_matrix(y_true, y_predNB)
NBConfusionMtxDF = pd.DataFrame(np.array(NBConfusionMtx), index=['Actual. T:', 'Actual. F:'], columns=['Pred. T', 'Pred. F'])

print("NAIVE BAYES")
print()
print("Confusion Matrix")
print(NBConfusionMtxDF)
print()
print("Normalized Confusion Matrix")
print(NBConfusionMtxDF/(len(y_true))) # Print the normalized confusion matrix


NAIVE BAYES

Confusion Matrix
            Pred. T  Pred. F
Actual. T:       87       61
Actual. F:       26      126

Normalized Confusion Matrix
             Pred. T   Pred. F
Actual. T:  0.290000  0.203333
Actual. F:  0.086667  0.420000


In [None]:
#Is this cell Needed???

# # FOR TRAINING SET
# # Construct parallel lists of unique words and counts
# uniquesTrain, countsTrain = makeListsOfUniquesAndCounts(allWordsTrain)
# # Construct dictionary of uniques and counts 
# trainingDataDict = makeDictOfUniquesAndCounts(uniquesTrain, countsTrain)

# # FOR TEST+TRAIN COMBINED SET
# # Construct parallel lists of unique words and counts
# uniquesCombined, countsCombined = makeListsOfUniquesAndCounts(allWordsCombined)
# # Construct dictionary of uniques and counts 
# # combinedDataDict = makeDictOfUniquesAndCounts(uniquesCombined, countsCombined)



In [None]:
# Is the cell needed???

# Build lists of words from training set
# allWordsTrain = cleanReviews(allReviewsTrain)

# allWordsCombined = cleanReviews(allReviewsTrain + allReviewsTest) DELETE this line??

In [None]:
# Is this cell needed???

# # Make a dictionary from a sorted list of unique words. 
# def makeDictOfUniquesAndCounts(uniques, counts):
#     keys = uniques
#     values = counts
#     uniquesDict = {}
#     for i in range(len(keys)):
#         uniquesDict[keys[i]] = values[i] 
#     #print(uniquesDict)  
#     return uniquesDict
    
# Call the function for testing.
# makeDictOfUniquesAndCounts(uniques, counts)