In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path, encoding='utf8') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [3]:
# QUESTION 1

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    return (int(reviewLine[0]), reviewLine[8], 'fake' if reviewLine[1] == '__label1__' else 'real')

In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    return text.split(" ")

In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    localDict = {}
    for token in tokens:
        if token not in localDict:
            localDict[token] = 1
        else:
            localDict[token] += 1
            
        if token not in featureDict:
            featureDict[token] = 1
        else:
            featureDict[token] += 1
    
    return localDict

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
# QUESTION 3
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    results = []
    foldSize = int(len(dataset)/folds)
    for i in range(0,len(dataset),foldSize):
        #print("current fold %f", i+1)
        trainData = dataset[0:i] + dataset[i + foldSize:len(dataset)]
        classifier = trainClassifier(trainData)
        predictions = predictLabels(dataset[i:i+foldSize], classifier)
        true_labels = list(map(lambda d: d[1], dataset[i:i+foldSize]))
        a = accuracy_score(true_labels, predictions)
        p, r, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')
        results.append((p, r, f1, a))
    #return cv_results
    cv_results = np.mean(np.array(results), axis=0)
    return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER
def predictLabels(reviewSamples, classifier):
    #return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [9]:
# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')
print("Results after cross-validations: ", crossValidate(trainData, 10))

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
89043
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results after cross-validations:  [0.64564528 0.64558217 0.64545784 0.64565476]


In [12]:
import string
from nltk.corpus import stopwords

table = str.maketrans({key: None for key in string.punctuation})
def preProcess(text):
    # Should return a list of tokens
    text = text.translate(table)
    return [word for word in text.split(" ") if word not in stopwords.words('english')]

In [13]:
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)
featureDict = {}      # A global dictionary of features

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

print("Results after cross-validations: ", crossValidate(trainData, 10))

Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
55784
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results after cross-validations:  [0.633757   0.63362466 0.63340991 0.63363095]


In [14]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')
def preProcess(text):
    # Should return a list of tokens
    return [english_stemmer.stem(word) for word in text.split(" ")]

In [15]:
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)
featureDict = {}      # A global dictionary of features

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

print("Results after cross-validations: ", crossValidate(trainData, 10))

Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
67516
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results after cross-validations:  [0.63854838 0.63840924 0.63829411 0.63863095]


In [16]:
import nltk.stem
english_lemmantizer = nltk.stem.WordNetLemmatizer()
def preProcess(text):
    # Should return a list of tokens
    return [english_lemmantizer.lemmatize(word) for word in text.split(" ")]

In [17]:
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)
featureDict = {}      # A global dictionary of features

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

print("Results after cross-validations: ", crossValidate(trainData, 10))

Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
85817
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results after cross-validations:  [0.63947048 0.63943305 0.63930429 0.63940476]


In [20]:
import nltk.stem
english_lemmantizer = nltk.stem.WordNetLemmatizer()
def preProcess(text):
    # Should return a list of tokens
    lemmantized_list = [english_lemmantizer.lemmatize(word) for word in text.split(" ")]
    return [" ".join(words) for words in nltk.bigrams(lemmantized_list)] + lemmantized_list

In [21]:
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)
featureDict = {}      # A global dictionary of features

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

print("Results after cross-validations: ", crossValidate(trainData, 10))

Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
607858
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results after cross-validations:  [0.66886574 0.66870678 0.6685902  0.66875   ]


In [22]:
import nltk.stem
english_lemmantizer = nltk.stem.WordNetLemmatizer()
def preProcess(text):
    # Should return a list of tokens
    lemmantized_list = [english_lemmantizer.lemmatize(word) for word in text.split(" ")]
    return [" ".join(words) for words in nltk.bigrams(lemmantized_list)] + [" ".join(words) for words in nltk.trigrams(lemmantized_list)] + lemmantized_list

In [23]:
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)
featureDict = {}      # A global dictionary of features

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

print("Results after cross-validations: ", crossValidate(trainData, 10))

Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
1648778
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results after cross-validations:  [0.67284855 0.67261049 0.67255237 0.67285714]


In [24]:
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(C=0.01))])
    return SklearnClassifier(pipeline).train(trainData)

trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)
featureDict = {}      # A global dictionary of features

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

print("Results after cross-validations: ", crossValidate(trainData, 10))

Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
1648778
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results after cross-validations:  [0.68652719 0.68593531 0.68551913 0.68583333]


In [25]:
IDF = {}
document_count = len(rawData)
vocabulary = list(featureDict.keys())

for word in vocabulary:
    IDF[word] = np.log(document_count/ float(1 + featureDict[word]))

def toTFIDF(tokens, IDF):
    # Should return a dictionary containing features as keys, and weights as values
    TF = {}
    TFIDF = {}
    for token in tokens:
        if token not in TF:
            TF[token] = 1
        else:
            TF[token] += 1
    
    for token in tokens:
        TF[token] /= float(len(TF)) 
        TFIDF[token] = TF[token] * IDF[token]
    #print(TF)
    #print(TFIDF)
    return TFIDF

def splitDataTFIDF(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toTFIDF(preProcess(Text), IDF),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toTFIDF(preProcess(Text), IDF),Label))

In [26]:
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)
#featureDict = {}      # A global dictionary of features

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitDataTFIDF(0.8)

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(IDF), sep='\n')

print("Results after cross-validations: ", crossValidate(trainData, 10))

Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
1648778
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results after cross-validations:  [0.6769202  0.66060602 0.6495608  0.65714286]


In [27]:
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    return (int(reviewLine[0]), reviewLine[8], 'fake' if reviewLine[1] == '__label1__' else 'real', int(reviewLine[2]), reviewLine[3], reviewLine[4])

def loadData(path, Text=None):
    with open(path, encoding='utf8') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)
        for line in reader:
            (Id, Text, Label, Rating, verified_purchase, product_category) = parseReview(line)
            rawData.append((Id, Text, Label, Rating, verified_purchase, product_category))
            preprocessedData.append((Id, preProcess(Text), Label))

def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label, Rating, verified_purchase, product_category) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text), Rating, verified_purchase, product_category),Label))
    for (_, Text, Label, Rating, verified_purchase, product_category) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text), Rating, verified_purchase, product_category),Label))

def toFeatureVector(tokens, Rating, verified_purchase, product_category):
    # Should return a dictionary containing features as keys, and weights as values
    localDict = {}
    for token in tokens:
        if token not in localDict:
            localDict[token] = 1
        else:
            localDict[token] += 1
            
        if token not in featureDict:
            featureDict[token] = 1
        else:
            featureDict[token] += 1
        
        localDict["rating"] = Rating
        featureDict["rating"] = Rating
        
        localDict["verified_purchase"] = 0 if verified_purchase == "N" else 1
        featureDict["verified_purchase"] = 1
        
        localDict["product_category_"+product_category] = 1
        if product_category not in featureDict:
            featureDict[product_category] = 1
        else:
            featureDict[product_category] += 1
        
    return localDict

In [28]:
# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 

trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)
featureDict = {}      # A global dictionary of features

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

print("Results after cross-validations: ", crossValidate(trainData, 10))

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
1648791
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results after cross-validations:  [0.82373753 0.82263432 0.82241269 0.82267857]


In [32]:
print(len(trainData))
classifier = trainClassifier(trainData)
predictions = predictLabels(testData, classifier)
true_labels = list(map(lambda d: d[1], testData))
a = accuracy_score(true_labels, predictions)
p, r, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')
print("Precision: ", p)
print("Recall: ", a)
print("f1-score: ", f1)
print("accuracy: ", a)

16800
Training Classifier...
Precision:  0.8225787208404047
Recall:  0.8188095238095238
f1-score:  0.8182786894892682
accuracy:  0.8188095238095238
