In [28]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from matplotlib import pyplot as plt
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import train_test_split
import string


# Stop words, lemmatization, punctiation
# lemmatization = stronger stemming.
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Angelo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Angelo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
corpus_df = pd.read_csv("corpus.tsv", sep="\t")
corpus_df = corpus_df.set_index("DOC_ID")
corpus_df.head()

Unnamed: 0_level_0,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


## Parse through data and clean things up

In [20]:
# parses review text
# tokenizes, lemmatizes, filters stop words, creates bigrams
table = str.maketrans({key: None for key in string.punctuation})
def parseReviewText(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    text = text.translate(table)
    filtered_tokens=[]
    lemmatized_tokens = []
    for w in text.split(" "):
        if w not in stop_words:
            lemmatized_tokens.append(lemmatizer.lemmatize(w.lower()))
        filtered_tokens = [' '.join(l) for l in nltk.bigrams(lemmatized_tokens)] + lemmatized_tokens
    return filtered_tokens
corpus_df["PARSED_REVIEW_TEXT"] = corpus_df["REVIEW_TEXT"].apply(parseReviewText)


corpus_df.head()

Unnamed: 0_level_0,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,PARSED_REVIEW_TEXT
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...","[when least, least think, think product, produ..."
2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,"[lithium battery, battery something, something..."
3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,"[i purchased, purchased swing, swing baby, bab..."
4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,"[i looking, looking inexpensive, inexpensive d..."
5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,"[i use, use twice, twice week, week result, re..."


In [57]:
corpus_df.to_pickle("pickles/pickled_parsed_df.pkl")

## Generate feature vectors of each review

In [39]:
data = corpus_df[["LABEL", "RATING", "PRODUCT_CATEGORY", "VERIFIED_PURCHASE", "PARSED_REVIEW_TEXT"]].values
featureDict = {} # A global dictionary of features

# taking into account diff values
data_X = []
data_y = []
for x in data:
    # some global stuff for debugging and info
    if x[2] not in featureDict:
        featureDict[x[2]] = 1
    else:
        featureDict[x[2]] = +1
    featureDict["VP"] = 1
    featureDict["R"] = 1  


    localDict = {}

    # x is: label, rating, category, verified, [parsed text tokens]
    localDict["R"] = x[1]

    #Verified_Purchase
    if x[3] == "N":
        localDict["VP"] = 0
    else:
        localDict["VP"] = 1
        
    #Product_Category
    if x[2] not in localDict:
        localDict[x[2]] = 1
    else:
        localDict[x[2]] = +1
            
    #Text        
    for token in x[4]:
        if token not in featureDict:
            featureDict[token] = 1
        else:
            featureDict[token] = +1
            
        if token not in localDict:
            localDict[token] = 1
        else:
            localDict[token] = +1

    label = "FAKE" if x[0] == "__label1__" else "REAL"
    
    data_X.append(localDict)
    data_y.append(label)  

## For testing only, split into train and test sets.

In [43]:
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2, random_state=42)
print("Number of data points")
print("Train:", len(X_train), "Test:", len(X_test))

Number of data points
Train: 16800 Test: 4200


## Generate the classifier

In [None]:
training_set = [(X_train[x], y_train[x]) for x in range(len(X_train))]

pipeline =  Pipeline([('svc', LinearSVC(C=0.01))])
clf = SklearnClassifier(pipeline).train(training_set)

## Test Classifier on testing set

In [49]:
test_set = [(X_test[x], y_test[x]) for x in range(len(X_test))]

predictions = clf.classify_many(map(lambda t: t[0], test_set))
true_labels = list(map(lambda d: d[1], test_set))
a = accuracy_score(true_labels, predictions)
p, r, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')
print("accuracy: ", a)
print("Precision: ", p)
print("Recall: ", a)
print("f1-score: ", f1)

accuracy:  0.810952380952381
Precision:  0.8125154427084342
Recall:  0.810952380952381
f1-score:  0.8107927346407253


## Generate classifier on full set

In [51]:
full_set = (data_X, data_y)
pipeline =  Pipeline([('svc', LinearSVC(C=0.01))])
clf = SklearnClassifier(pipeline).train(training_set)

## Pickle/save classifier

In [53]:
import pickle

In [56]:
pickle.dump(clf, open("pickles/full_dataset_classifier.pkl", "wb"))