In [14]:
import sklearn
import numpy as np
import pandas as pd
import string 
import nltk # import Natural Language Toolkit
nltk.download('wordnet') # download the corpus of words the NLTK library uses
from nltk.stem import WordNetLemmatizer # import the lemmatizer


[nltk_data] Downloading package wordnet to /home/vijay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
def loadAndParse(inputFileName):
    """Read a tab-separated ``review<TAB>label`` file into two parallel lists.

    Parameters
    ----------
    inputFileName : str
        Path of the text file to read. Each line is expected to contain a
        review and its label separated by a single tab character.

    Returns
    -------
    tuple
        ``(reviews, labels)``: two lists of strings in matching order. Labels
        are returned exactly as they appear in the file (no int conversion).

    Notes
    -----
    A line that does not split into exactly two fields is printed and then
    skipped, so one malformed line neither aborts the load with an
    ``IndexError`` nor puts a wrong label into the result.
    """
    # The context manager closes the file even if reading raises.
    with open(inputFileName) as fIn:
        # Split the file into lines (without the trailing newlines).
        lines = fIn.read().splitlines()

    # Now split each line on tabs to get text and label.
    reviews = []
    labels = []
    for review in lines:
        messageAndLabelList = review.split('\t')
        if len(messageAndLabelList) != 2:
            # Report the malformed line for inspection and leave it out.
            print(review)
            continue
        message, label = messageAndLabelList
        reviews.append(message)
        labels.append(label)
    return reviews, labels


In [24]:
# Load each source file into its own (reviews, labels) pair of lists.
yelpReviews, yelpLabels = loadAndParse('sentiment_labelled_sentences/yelp_labelled.txt')
imdbReviews, imdbLabels = loadAndParse('sentiment_labelled_sentences/imdb_labelled.txt')
amazonReviews, amazonLabels = loadAndParse('sentiment_labelled_sentences/amazon_cells_labelled.txt')

# Combine the three sources into one list of reviews and one list of labels.
# Both lists are built in the same yelp -> imdb -> amazon order, so index i of
# allReviews always corresponds to index i of allLabels.
allReviews = [*yelpReviews, *imdbReviews, *amazonReviews]
allLabels = [*yelpLabels, *imdbLabels, *amazonLabels]


In [26]:
# Tally the Amazon labels: convert each label string to an int, then count
# how many equal 1 (positive) and how many equal 0 (negative).
amazon_label_values = [int(label) for label in amazonLabels]
amazon_positives = amazon_label_values.count(1)
amazon_negatives = amazon_label_values.count(0)

print("AMAZON: There are", amazon_positives, "positive reviews.")
print("AMAZON: There are", amazon_negatives, "negative reviews.")

AMAZON: There are 500 positive reviews.
AMAZON: There are 500 negative reviews.


In [27]:
# Tally the IMDB labels: convert each label string to an int, then count
# how many equal 1 (positive) and how many equal 0 (negative).
imdb_label_values = [int(label) for label in imdbLabels]
imdb_positives = imdb_label_values.count(1)
imdb_negatives = imdb_label_values.count(0)

print("IMDB: There are", imdb_positives, "positive reviews.")
print("IMDB: There are", imdb_negatives, "negative reviews.")

IMDB: There are 500 positive reviews.
IMDB: There are 500 negative reviews.


In [28]:
# Tally the Yelp labels: convert each label string to an int, then count
# how many equal 1 (positive) and how many equal 0 (negative).
yelp_label_values = [int(label) for label in yelpLabels]
yelp_positives = yelp_label_values.count(1)
yelp_negatives = yelp_label_values.count(0)

print("YELP: There are", yelp_positives, "positive reviews.")
print("YELP: There are", yelp_negatives, "negative reviews.")

YELP: There are 500 positive reviews.
YELP: There are 500 negative reviews.


In [29]:
# Tally the labels across all three sources combined: convert each label
# string to an int, then count the 1s (positive) and the 0s (negative).
all_label_values = [int(label) for label in allLabels]
positives = all_label_values.count(1)
negatives = all_label_values.count(0)

print("There are", positives, "positive reviews.")
print("There are", negatives, "negative reviews.")


There are 1500 positive reviews.
There are 1500 negative reviews.


In [30]:
# Characters removed by stripPunctuation. Single and double quotes are not in
# this set, so they are left in the text. The backslash is written as the
# explicit escape '\\' because a bare '\]' is an invalid escape sequence that
# newer Python versions warn about.
_PUNCTUATION_TO_STRIP = '$#%&!()*+,-./:;<=>?@[\\]^_`{|}~'

# Deletion table for str.translate, built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION_TO_STRIP)


# A working punctuation remover. It can do whole sentences.
def stripPunctuation(input):
    """Return ``input`` with every character in ``_PUNCTUATION_TO_STRIP`` removed.

    Parameters
    ----------
    input : str
        A word or a whole sentence.

    Returns
    -------
    str
        The same text with the listed punctuation characters deleted; all
        other characters (including whitespace and quotes) are unchanged.
    """
    # from: https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
    return input.translate(_PUNCTUATION_TABLE)

In [31]:
# Single-word lemmatizer built on NLTK's WordNetLemmatizer.
def lemmatizeWord(input):
    """Return the lemma of ``input`` as produced by ``WordNetLemmatizer.lemmatize``.

    ``input`` is passed to the lemmatizer as a single token, so callers should
    split sentences into words first.
    """
    return WordNetLemmatizer().lemmatize(input)

In [None]:
def cleanAndRemoveStopWords(input):
    """Strip punctuation, lowercase and lemmatize each word, and drop stop words.

    Parameters
    ----------
    input : str
        A sentence (or any whitespace-separated text).

    Returns
    -------
    str
        The remaining lowercased lemmas joined by single spaces.
    """
    # A set gives constant-time membership tests and cannot hold duplicates.
    stopwords = {'what', 'who', 'is', 'a', 'at', 'he'}

    resultwords = []
    for word in stripPunctuation(input).split():
        # Lemmatize the lowercased token once and use that single value for
        # both the stop-word check and the output, so filtering and the
        # emitted text agree on casing.
        # NOTE(review): the WordNet lemmatizer appears to look tokens up as
        # given, so capitalized words such as "Cats" would otherwise pass
        # through unlemmatized - confirm against the NLTK documentation.
        lemma = lemmatizeWord(word.lower())
        if lemma not in stopwords:
            resultwords.append(lemma)
    return ' '.join(resultwords)

In [61]:
# STEPS
# 1) import each of the three files
# 2) create a list of strings from each file, where each string is a message
# 3) Clean each string: strip punctuation

# 3b) Create training set and test set

# 4) split each string into a list of individual words by splitting on spaces
# 5) add the lists to make one big list of words 
# 6) lowercase and lemmatize every word in the list
# 7) convert the list into a set to get rid of repeats. This set is the corpus.
# 8) Maybe convert the set back into a list if that's needed to iterate over the list

# TODO: remove stop words (this step is not yet placed in the numbered order above)
