In [None]:
!pip install unicodecsv
import nltk
nltk.download('stopwords')
import nltk
nltk.download('wordnet')
import unicodecsv                               # csv reader
import re                                       # regular expressions
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier

# To do preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np # To compute the average results

from random import shuffle # To shuffle the dataset


# To use feature selection in the Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
def loadData(path, reviewText=None):
    """Read the tab-separated review file at `path` and append every row to `rawData`.

    The first row is treated as a header and skipped. Each remaining row is
    converted by `parseReviewImproved` into an
    (Id, Rating, VerifiedPurchase, Category, Text, Label) tuple and appended
    to the module-level `rawData` list.

    Args:
        path: location of the UTF-8 encoded, tab-delimited reviews file.
        reviewText: unused; kept so existing callers keep working.
    """
    with open(path, 'rb') as f:
        reader = unicodecsv.reader(f, encoding='utf-8', delimiter='\t')
        # Skip the header row. The built-in next() works on both Python 2 and 3
        # readers, and the default keeps an empty file from raising StopIteration.
        next(reader, None)
        for line in reader:
            rawData.append(parseReviewImproved(line))
        
def splitData(percentage):
    """Fill `trainData` and `testData` from `rawData`.

    `rawData` is treated as two halves. The first `percentage` share of each
    half is vectorised into `trainData`; the remainder of each half goes to
    `testData`. Presumably each half of the file holds one class, so this
    keeps the split balanced -- verify against the dataset.

    Args:
        percentage: fraction (0..1) of the samples to use for training.
    """
    totalSamples = len(rawData)
    midpoint = int(len(rawData)/2)
    trainPerHalf = int((percentage*totalSamples)/2)

    def vectorise(record):
        # Drop the id, turn the text into tokens and build the labelled feature vector.
        (_, Rating, VerifiedPurchase, Category, Text, Label) = record
        return (toFeatureVector(Rating, VerifiedPurchase, Category, preProcess(Text)), Label)

    trainRecords = rawData[:trainPerHalf] + rawData[midpoint:midpoint+trainPerHalf]
    trainData.extend(vectorise(record) for record in trainRecords)

    testRecords = rawData[trainPerHalf:midpoint] + rawData[midpoint+trainPerHalf:]
    testData.extend(vectorise(record) for record in testRecords)

In [None]:
# QUESTION 1

# Human-readable names of the two output classes.
fakeLabel, realLabel = 'fake', 'real'

# Translates the label column of the input file into the class names above.
labelMap = {
    '__label1__': fakeLabel,
    '__label2__': realLabel,
}

def parseReviewImproved(reviewLine):
    """Turn one row of the input file into a tuple of the fields the classifier uses.

    Args:
        reviewLine: sequence of column values for one review.

    Returns:
        (Id, Rating, VerifiedPurchase, Category, Text, Label), where Id and
        Rating are ints and Label is the class name looked up in `labelMap`.
    """
    return (
        int(reviewLine[0]),        # Id
        int(reviewLine[2]),        # Rating
        reviewLine[3],             # VerifiedPurchase
        reviewLine[4],             # Category
        reviewLine[8],             # Text
        labelMap[reviewLine[1]],   # Label
    )

In [None]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

def preProcess(text, removeStopwords=False):
    """Turn the raw text of one review into a list of normalised tokens.

    Steps: split on runs of word characters (which also drops punctuation),
    lowercase, optionally remove English stop words, then lemmatise with
    WordNet and discard any empty tokens.

    Args:
        text: the review text as a single string.
        removeStopwords: when True, drop NLTK's English stop words. Off by
            default (matching the previous hard-disabled behaviour) because
            removing them breaks up the adjacent-word pairs that the bigram
            features are built from.

    Returns:
        list of token strings.
    """
    # word tokenisation, including punctuation removal
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = [t.lower() for t in tokenizer.tokenize(text)]

    # optional stop-word removal (very frequent function words)
    if removeStopwords:
        stop = set(stopwords.words('english'))
        tokens = [t for t in tokens if t not in stop]

    # lemmatisation
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(t) for t in tokens]

    # ensure no empty tokens are returned
    return [t for t in tokens if t]

print(preProcess("hello this is the, ehh... presumably, a crying situations!"))

['hello', 'this', 'is', 'the', 'ehh', 'presumably', 'a', u'cry', u'situation']


In [None]:
# QUESTION 2
featureDict = {} # the global feature dictionary: feature name -> accumulated weight over all calls

def toFeatureVector(rating, verifiedPurchase, category, tokens):
    """Build the feature dictionary for one review and update `featureDict`.

    Features produced:
      * one entry per distinct token, valued at its relative frequency in
        the review (occurrences / number of tokens);
      * one entry per adjacent token pair ("w1 w2"), valued 1 (presence only);
      * three metadata entries 'RATING:<rating>', 'VP:<verifiedPurchase>' and
        'CATEGORY:<category>', each valued 1.0.

    Every feature is also accumulated into the module-level `featureDict`
    (token weights for unigrams, +1.0 per occurrence for bigrams and
    metadata).

    Args:
        rating: review rating; converted with str() for the feature name.
        verifiedPurchase: string appended to 'VP:'.
        category: string appended to 'CATEGORY:'.
        tokens: list of token strings for the review (may be empty).

    Returns:
        dict mapping feature name to value.
    """
    featureVec = {}

    # Unigrams: relative frequency. Guarded so an empty review does not divide by zero.
    if tokens:
        tokenWeight = 1.0/len(tokens)
        for w in tokens:
            featureVec[w] = featureVec.get(w, 0.0) + tokenWeight
            featureDict[w] = featureDict.get(w, 0.0) + tokenWeight

    # Bigrams: binary presence in the review, counted per occurrence globally.
    for i in range(1, len(tokens)):
        bigram = tokens[i-1] + " " + tokens[i]
        featureVec[bigram] = 1
        featureDict[bigram] = featureDict.get(bigram, 0.0) + 1.0

    # Metadata features. All three are handled the same way; previously the
    # category count was added to featureVec instead of featureDict, which
    # doubled the category feature value and never recorded it globally.
    metaFeatures = (
        'RATING:'+str(rating),
        'VP:'+verifiedPurchase,
        'CATEGORY:'+category,
    )
    for name in metaFeatures:
        featureVec[name] = 1.0
        featureDict[name] = featureDict.get(name, 0.0) + 1.0

    return featureVec




In [None]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Train and return a classifier on (featureDict, label) pairs.

    The model is a scikit-learn pipeline wrapped in NLTK's SklearnClassifier:
    TF-IDF weighting, chi-squared selection of the 20000 best features, then
    a linear SVM with hinge loss.

    Args:
        trainData: list of (feature dictionary, label) tuples.

    Returns:
        the trained SklearnClassifier.
    """
    print("Training Classifier...")
    steps = [
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=20000)),
        ('svc', LinearSVC(loss = 'hinge')),
    ]
    model = SklearnClassifier(Pipeline(steps))
    return model.train(trainData)

In [None]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    """Classify a batch of samples.

    Args:
        reviewSamples: sequence of items whose first element is the feature
            dictionary (e.g. (features, label) tuples).
        classifier: object exposing `classify_many`.

    Returns:
        whatever `classifier.classify_many` returns for the feature dictionaries.
    """
    featureVectors = [sample[0] for sample in reviewSamples]
    return classifier.classify_many(featureVectors)

def predictLabel(text, classifier, rating=None, verifiedPurchase=None, category=None):
    """Classify a single review given its raw text and, optionally, its metadata.

    `toFeatureVector` requires rating, verified-purchase flag and category in
    addition to the tokens, so the previous one-argument call always raised
    TypeError. The metadata can now be passed as keyword arguments; anything
    left as None is filled with a placeholder for the call and its metadata
    feature is removed again before classification.

    Args:
        text: raw review text.
        classifier: object exposing `classify`.
        rating: optional rating of the review.
        verifiedPurchase: optional verified-purchase string.
        category: optional product category string.

    Returns:
        the label returned by `classifier.classify`.
    """
    featureVec = toFeatureVector(
        rating,
        '' if verifiedPurchase is None else verifiedPurchase,
        '' if category is None else category,
        preProcess(text),
    )

    # Remove the placeholder metadata features for values that were not supplied.
    # NOTE: toFeatureVector still counts those placeholders in featureDict.
    if rating is None:
        featureVec.pop('RATING:' + str(rating), None)
    if verifiedPurchase is None:
        featureVec.pop('VP:', None)
    if category is None:
        featureVec.pop('CATEGORY:', None)

    return classifier.classify(featureVec)

In [None]:
# QUESTION 3

def crossValidate(dataset, folds):
    """Run k-fold cross-validation and return the averaged scores.

    A shuffled copy of `dataset` is cut into exactly `folds` contiguous
    parts whose sizes differ by at most one. Each part is used once as the
    test set while a classifier is trained on the rest.

    Args:
        dataset: list of (feature dictionary, label) tuples. It is not
            modified; shuffling happens on a copy.
        folds: number of folds; must satisfy 2 <= folds <= len(dataset).

    Returns:
        list [precision, recall, f-score], each the mean over all folds of
        the weighted-average score.

    Raises:
        ValueError: if `folds` is outside the allowed range.
    """
    if folds < 2 or folds > len(dataset):
        raise ValueError("folds must be between 2 and len(dataset) (%d), got %r"
                         % (len(dataset), folds))

    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(dataset)
    shuffle(shuffled)

    # Spread the remainder over the first folds instead of creating an extra,
    # tiny fold when len(dataset) is not a multiple of `folds`.
    baseSize, remainder = divmod(len(shuffled), folds)

    results = []
    start = 0
    for foldIndex in range(folds):
        end = start + baseSize + (1 if foldIndex < remainder else 0)
        print("Fold start on items %d - %d" % (start, end))
        myTestData = shuffled[start:end]
        myTrainData = shuffled[:start] + shuffled[end:]
        classifier = trainClassifier(myTrainData)
        y_true = [sample[1] for sample in myTestData]
        y_pred = predictLabels(myTestData, classifier)
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
        start = end

    # Each result is (precision, recall, fscore, support); average the first three.
    scoreColumns = list(zip(*results))[:3]
    avgResults = [np.mean(column) for column in scoreColumns]
    return avgResults

In [None]:
# MAIN

# loading reviews
# Module-level containers filled by loadData() and splitData().
rawData = []    # the filtered data from the dataset file (should be 21000 samples)
trainData = []  # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []   # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# references to the data files
reviewPath = '/amazon_reviews.txt'

# Progress line reused after each stage to show how the containers fill up.
sizeReport = "Now %d rawData, %d trainData, %d testData"

# do the actual stuff
print(sizeReport % (len(rawData), len(trainData), len(testData)))

print("Preparing the dataset...")
loadData(reviewPath)
print(sizeReport % (len(rawData), len(trainData), len(testData)))

print("Preparing training and test data...")
splitData(0.8)
print(sizeReport % (len(rawData), len(trainData), len(testData)))


# print(trainData)

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
<unicodecsv.py2.UnicodeReader object at 0x7fec6b1ed290>
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData


In [None]:
# 10-fold cross-validation on the training split; prints [precision, recall, f-score].
cv_results = crossValidate(trainData, 10)
print(cv_results)

Fold start on items 0 - 1680
Training Classifier...
Fold start on items 1680 - 3360
Training Classifier...
Fold start on items 3360 - 5040
Training Classifier...
Fold start on items 5040 - 6720
Training Classifier...
Fold start on items 6720 - 8400
Training Classifier...
Fold start on items 8400 - 10080
Training Classifier...
Fold start on items 10080 - 11760
Training Classifier...
Fold start on items 11760 - 13440
Training Classifier...
Fold start on items 13440 - 15120
Training Classifier...
Fold start on items 15120 - 16800
Training Classifier...
[0.81293728234612, 0.8069047619047618, 0.8059974876323489]


In [None]:
# Train on the full training split, then score once on the held-out test split.
classifier = trainClassifier(trainData)
testTrue = [sample[1] for sample in testData]
testPred = predictLabels(testData, classifier)
finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted')

print("Done training!")
# finalScores is (precision, recall, fscore, support); report the first three.
print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

Training Classifier...
Done training!
Precision: 0.818834
Recall: 0.810476
F Score:0.809226
