In [31]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
import pandas as pd
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer  # lemmatization
from collections import Counter, OrderedDict


In [9]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "DOC_ID":  # skip the header
                continue
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
    


def splitData(percentage):
    # A method to split the data between trainData and testData, ensuring both positive and negative reviews 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [10]:
rawData = []

In [13]:
loadData('amazon_reviews.txt')

In [14]:
rawData[0]

('1',
 'When least you think so, this product will save the day. Just keep it around just in case you need it for something.',
 '__label1__')

In [15]:
df = pd.read_csv('amazon_reviews.txt', sep="\t", header=None) # Read file

In [16]:
type(df)

pandas.core.frame.DataFrame

In [17]:
#df = pd.read_csv('amazon_reviews.txt')

In [18]:
df.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int64')

In [19]:
df.head() # Show first five rows

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
1,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
2,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
3,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
4,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...


In [20]:
df.shape

(21001, 9)

In [21]:
df[[0,8,1]]

Unnamed: 0,0,8,1
0,DOC_ID,REVIEW_TEXT,LABEL
1,1,"When least you think so, this product will sav...",__label1__
2,2,Lithium batteries are something new introduced...,__label1__
3,3,I purchased this swing for my baby. She is 6 m...,__label1__
4,4,I was looking for an inexpensive desk calcolat...,__label1__
...,...,...,...
20996,20996,"I bought these for work. I have high arches, ...",__label2__
20997,20997,Crocs are one of only two brands of shoes that...,__label2__
20998,20998,I love moccasins This fit like it was custom ...,__label2__
20999,20999,I wish these were a little more durable. I got...,__label2__


In [22]:
df[8][1]

'When least you think so, this product will save the day. Just keep it around just in case you need it for something.'

In [25]:
preProcess(df[8][1]) # 

['least', 'think', 'product', 'save', 'day', 'keep', 'around', 'case', 'need', 'something']


['least',
 'think',
 'product',
 'save',
 'day',
 'keep',
 'around',
 'case',
 'need',
 'something']

# Question 1

In [12]:
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    # DESCRIBE YOUR METHOD IN WORDS
    #Id = reviewLine['DOC_ID'].tolist()
    #Text = reviewLine['REVIEW_TEXT'].tolist()
    #Label= reviewLine['LABEL'].tolist()
    
    Id = reviewLine[0]
    Text = reviewLine[8]
    Label= reviewLine[1]
    #print(type(Id))
    return (Id,Text,Label)

In [24]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    # DESCRIBE YOUR METHOD IN WORDS
    tokens = text.split() # tokenize
    #print('tokens type' + str(type(tokens)))
    
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))  # optional TODO
    tokens = [f for f in filter(None, [pattern.sub('', token) for token in tokens])]
    tokens = [token.lower() for token in tokens if token.isalpha()]
    stopword_list = nltk.corpus.stopwords.words('english') # remove stopwords
    tokens = [token for token in tokens if token not in stopword_list]
    wnl=WordNetLemmatizer()  # lemmatize
    tokens = [wnl.lemmatize(token) for token in tokens]
    print (tokens)
    return tokens

# Question 2

In [47]:
toFeatureVector(preProcess(df[8][1]))

['least', 'think', 'product', 'save', 'day', 'keep', 'around', 'case', 'need', 'something']


{'least': 1,
 'think': 1,
 'product': 1,
 'save': 1,
 'day': 1,
 'keep': 1,
 'around': 1,
 'case': 1,
 'need': 1,
 'something': 1}

In [42]:
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    # DESCRIBE YOUR METHOD IN WORDS
    featureDict = Counter(tokens)  # for now a simple count
    featureDict = dict(featureDict)
    return featureDict

In [43]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

# Question 3

In [44]:
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    # DESCRIBE YOUR METHOD IN WORDS
    for i in range(0,len(dataset),foldSize):
        continue # Replace by code that trains and tests on the 10 folds of data in the dataset
    return cv_results

In [45]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [46]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []        # the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
['least', 'think', 'product', 'save', 'day', 'keep', 'around', 'case', 'need', 'something']
['lithium', 'battery', 'something', 'new', 'introduced', 'market', 'average', 'developing', 'cost', 'relatively', 'high', 'stallion', 'doesnt', 'compromise', 'quality', 'provides', 'u', 'best', 'low', 'costbr', 'many', 'built', 'technical', 'assistant', 'act', 'like', 'sensor', 'particular', 'forté', 'battery', 'keep', 'phone', 'charged', 'work', 'every', 'voltage', 'high', 'voltage', 'never', 'risked']
['purchased', 'swing', 'baby', 'month', 'pretty', 'much', 'grown', 'loud', 'doesnt', 'swing', 'well', 'beautiful', 'though', 'love', 'color', 'lot', 'setting', 'dont', 'think', 'worth', 'money']
['looking', 'inexpensive', 'desk', 'calcolatur', 'work', 'everything', 'need', 'issue', 'tilt', 'slightly', 'one', 'side', 'hit', 'key', 'rock', 'little', 'bit', '

# Evaluate on test set

In [195]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = False  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

# Questions 4 and 5
Once you're happy with your functions for Questions 1 to 3, it's advisable you make a copy of this notebook to make a new notebook, and then within it adapt and improve all three functions in the ways asked for in questions 4 and 5.