In [None]:
!pip install unicodecsv
import nltk
nltk.download('stopwords')
from __future__ import division
import unicodecsv                               # csv reader
import re                                       # regular expressions
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier


from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
from random import shuffle
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from collections import Counter

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load reviews from a tab-separated file and append them to the global rawData list.
def loadData(path, reviewText=None):
    """Read the review file at `path` and append one tuple per row to `rawData`.

    path:       location of a UTF-8 encoded, tab-delimited file whose first
                row is a header (it is skipped).
    reviewText: unused; kept only so existing callers keep working.

    Every remaining row is handed to parseReview() and the resulting
    (reviewId, reviewText, label) triple is appended to the module-level
    `rawData` list. Nothing is returned.
    """
    with open(path, 'rb') as f:
        reader = unicodecsv.reader(f, encoding='utf-8', delimiter='\t')
        # The builtin next() works on Python 2 and 3 (reader.next() is
        # Python 2 only); the default stops an empty file from raising
        # StopIteration.
        next(reader, None)
        for line in reader:
            (reviewId, text, label) = parseReview(line)
            rawData.append((reviewId, text, label))
            
def splitData(percentage):
    """Fill the global trainData / testData lists from rawData.

    percentage: fraction of the data to use for training (e.g. 0.8).

    rawData is treated as two halves. The leading `percentage` share of each
    half goes to trainData and the rest of each half goes to testData.
    Presumably the file is ordered so that each half holds one class, which
    would keep the split balanced -- verify against the data file.

    Each review text is run through preProcess() and toFeatureVector(), and
    stored as a (featureVector, label) pair. Nothing is returned.
    """
    midpoint = int(len(rawData)/2)
    perHalfTrain = int((percentage*len(rawData))/2)

    trainPart = rawData[:perHalfTrain] + rawData[midpoint:midpoint+perHalfTrain]
    testPart = rawData[perHalfTrain:midpoint] + rawData[midpoint+perHalfTrain:]

    # Training samples are converted first, then test samples.
    for destination, samples in ((trainData, trainPart), (testData, testPart)):
        for (_, text, label) in samples:
            features = toFeatureVector(preProcess(text))
            destination.append((features, label))
        
        

In [None]:
# QUESTION 1

# Class names handed to the classifier
fakeLabel = 'fake'
realLabel = 'real'
# Maps the label codes found in the data file onto the class names above
labelMap = {
    '__label1__': fakeLabel,
    '__label2__': realLabel,
}


def parseReview(reviewLine):
    """Turn one parsed row of the input file into an (id, text, label) triple.

    reviewLine: sequence of column values for a single row.

    Returns a tuple of
      * the first column converted to int,
      * the ninth column (index 8) unchanged, used as the review text,
      * the class name that labelMap gives for the second column.

    Raises ValueError if the first column is not an integer, IndexError if
    the row is too short, and KeyError for an unknown label code.
    """
    return (int(reviewLine[0]), reviewLine[8], labelMap[reviewLine[1]])


In [None]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Tokenizer, stopword set and lemmatiser shared by every preProcess() call.
# Filled lazily on first use so the NLTK data only has to exist once the
# function is actually called.
_preProcessResources = {}


def _getPreProcessResources():
    """Build the tokenizer, stopword set and lemmatiser once and reuse them."""
    if not _preProcessResources:
        # All three objects are created before update() runs, so a failure
        # (e.g. missing NLTK data) never leaves a half-filled cache behind.
        _preProcessResources.update(
            tokenizer=RegexpTokenizer(r'\w+'),
            stopwords=frozenset(stopwords.words('english')),
            lemmatiser=WordNetLemmatizer()
        )
    return _preProcessResources


# input: a string of one review
def preProcess(text):
    """Convert one review string into a list of normalised tokens.

    Steps, in order:
      1. tokenise on runs of word characters (this drops punctuation),
      2. lowercase every token,
      3. discard tokens found in NLTK's English stopword list,
      4. lemmatise what is left with WordNetLemmatizer (default arguments).

    Returns the resulting list of tokens.
    """
    resources = _getPreProcessResources()
    stop = resources['stopwords']
    lemmatise = resources['lemmatiser'].lemmatize

    lowered = (token.lower() for token in resources['tokenizer'].tokenize(text))
    return [lemmatise(token) for token in lowered if token not in stop]

print(preProcess("hello this is the, ehh... presumably, a crying situations!"))

['hello', 'ehh', 'presumably', u'cry', u'situation']


In [None]:
# QUESTION 2

def toFeatureVector(words):
    """Build a bag-of-words feature dictionary of relative frequencies.

    words: iterable of tokens (typically the output of preProcess).

    Returns a dict mapping each distinct token to its share of all tokens,
    i.e. occurrences / total number of tokens, so the values sum to 1.
    An empty input yields an empty dict.
    """
    counts = Counter(words)
    # Computed once rather than once per key; the float makes this a true
    # division even without `from __future__ import division`.
    total = float(sum(counts.values()))
    return {word: count / total for word, count in counts.items()}

In [None]:
# Quick check: 'a' is 2 of the 4 tokens, 'b' and 'c' are 1 of 4 each.
toFeatureVector(["a", "a", "b", "c"])

{'a': 0.5, 'b': 0.25, 'c': 0.25}

In [None]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Train a linear SVM on labelled feature dictionaries.

    trainData: list of (featureVector, label) pairs, where featureVector is
               a dict such as the one produced by toFeatureVector.

    Wraps scikit-learn's LinearSVC (squared hinge loss) in NLTK's
    SklearnClassifier and returns the result of its train() call.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("Training Classifier...")
    return SklearnClassifier(LinearSVC(loss='squared_hinge')).train(trainData)



In [None]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    """Predict a label for every sample in `reviewSamples`.

    reviewSamples: sequence of (featureVector, label) pairs; only the first
                   element of each pair is used.
    classifier:    object exposing classify_many(list_of_feature_vectors).

    Returns whatever classifier.classify_many returns for the feature vectors.
    """
    # Build a real list: on Python 3 map() would hand the classifier a
    # one-shot iterator instead of a sequence.
    featureVectors = [sample[0] for sample in reviewSamples]
    return classifier.classify_many(featureVectors)

def predictLabel(text, classifier):
    """Classify a single raw review string.

    text:       the review as a plain string.
    classifier: object exposing classify(featureVector).

    The text goes through preProcess() and toFeatureVector() before being
    passed to classifier.classify, whose result is returned.
    """
    tokens = preProcess(text)
    features = toFeatureVector(tokens)
    return classifier.classify(features)


In [None]:
# QUESTION 3

def crossValidate(dataset, folds):
    """Run k-fold cross-validation and return the averaged scores.

    dataset: list of (featureVector, label) pairs. NOTE: the list is
             shuffled in place, so the caller's list order changes.
    folds:   number of folds, an integer with 2 <= folds <= len(dataset).

    Each fold is held out in turn for testing while a classifier is trained
    on the remaining samples via trainClassifier(). Fold boundaries are
    spread evenly, so exactly `folds` rounds run even when len(dataset) is
    not a multiple of `folds`.

    Returns [precision, recall, f-score], each the mean over all folds of
    the weighted-average value from precision_recall_fscore_support.

    Raises ValueError if `folds` is outside the allowed range.
    """
    total = len(dataset)
    if folds < 2 or folds > total:
        raise ValueError(
            "folds must be between 2 and the dataset size; got %r for %d samples"
            % (folds, total))

    shuffle(dataset)
    results = []

    for fold in range(folds):
        # Integer boundaries that split the data into `folds` near-equal parts.
        start = (fold * total) // folds
        end = ((fold + 1) * total) // folds
        print("Fold start on items %d - %d" % (start, end))

        myTestData = dataset[start:end]
        myTrainData = dataset[:start] + dataset[end:]
        classifier = trainClassifier(myTrainData)

        y_true = [sample[1] for sample in myTestData]
        y_pred = predictLabels(myTestData, classifier)
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))

    # Each result is (precision, recall, fscore, support); transpose and
    # average the first three columns. list() is needed because zip() is
    # not subscriptable on Python 3.
    return [np.mean(scores) for scores in list(zip(*results))[:3]]

In [None]:
# MAIN

# loading reviews
rawData = [] # the filtered data from the dataset file (should be 21000 samples)
trainData = [] # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = [] # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# references to the data files
# reviewPath = '../../labs_1_and_2/Lab_2_solutions/amazon_reviews.txt'
reviewPath = '/amazon_reviews.txt'


def reportDatasetSizes():
    """Print how many samples rawData, trainData and testData currently hold."""
    print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))


# do the actual stuff
# (print is called with a single parenthesised argument so the cell runs on
# both Python 2 and Python 3)
reportDatasetSizes()
print("Preparing the dataset...")
loadData(reviewPath)
reportDatasetSizes()
print("Preparing training and test data...")
splitData(0.8)
reportDatasetSizes()


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData


In [None]:
# 10-fold cross-validation on the training portion; the result is the
# averaged [precision, recall, f-score] returned by crossValidate.
cv_results = crossValidate(trainData, 10)
print(cv_results)

Fold start on items 0 - 1680
Training Classifier...
Fold start on items 1680 - 3360
Training Classifier...
Fold start on items 3360 - 5040
Training Classifier...
Fold start on items 5040 - 6720
Training Classifier...
Fold start on items 6720 - 8400
Training Classifier...
Fold start on items 8400 - 10080
Training Classifier...
Fold start on items 10080 - 11760
Training Classifier...
Fold start on items 11760 - 13440
Training Classifier...
Fold start on items 13440 - 15120
Training Classifier...
Fold start on items 15120 - 16800
Training Classifier...
[0.656825074400999, 0.655892857142857, 0.6557201803942636]


In [None]:
# Final evaluation: train on all of trainData, then score on the held-out testData.
print(testData[0])  # show one (featureVector, label) sample
classifier = trainClassifier(trainData)
# A list (not map()) so the metrics function gets a real sequence on Python 3.
testTrue = [sample[1] for sample in testData]
testPred = predictLabels(testData, classifier)
finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted')
print("Done training!")
# finalScores is (precision, recall, fscore, support); only the first three are reported.
print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])


({u'little': 0.09090909090909091, u'whenever': 0.09090909090909091, u'always': 0.09090909090909091, u'hershey': 0.09090909090909091, u'one': 0.09090909090909091, u'assortment': 0.09090909090909091, u'holiday': 0.09090909090909091, u'come': 0.09090909090909091, u'excited': 0.09090909090909091, u'best': 0.09090909090909091, u'really': 0.09090909090909091}, 'fake')
Training Classifier...
Done training!
Precision: 0.602309
Recall: 0.601905
F Score:0.601511
