# Setup

In [1]:
import gzip
import pandas as pd
import numpy as np
import random

from sklearn import model_selection, preprocessing, metrics, svm
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import json


In [2]:
# Load the gzipped review file (one JSON object per line), keep a reproducible
# random subset, and pull out the label column and the text column used below.
SAMPLE_SIZE = 200000
SAMPLE_SEED = 109  # fixed so a Restart & Run All draws the same subset every time

data = []
# Text mode with an explicit encoding so json.loads always receives str.
with gzip.open('../Data/reviews_Movies_and_TV_5.json.gz', 'rt', encoding='utf-8') as f:
    for l in f:
        data.append(json.loads(l.strip()))

print(len(data))

# A dedicated Random instance makes the draw deterministic without reseeding
# the global generator; min() avoids a ValueError on a file smaller than
# SAMPLE_SIZE (in that case every record is kept, in shuffled order).
data = random.Random(SAMPLE_SEED).sample(data, min(SAMPLE_SIZE, len(data)))

df = pd.DataFrame.from_dict(data)
print(len(df))

# 'overall' is the classification target, 'reviewText' the model input.
target = df['overall']
text = df['reviewText']

1697533
200000


In [3]:
# Split proportions. Only test_size is passed to train_test_split; train_size
# is kept so the proportions can be reported next to the metrics.
train_size = 0.9
test_size = 0.1

# Split dataset into training set and test set (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(text, target, test_size=test_size, random_state=109)

# Label-encode the target variable. The encoder is fitted on the training
# labels only and the same mapping is reused for the test labels; refitting on
# the test split could assign different integer codes if a class is missing there.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# Feature extraction: word 2-grams and 3-grams weighted by TF-IDF, capped at 5000 features
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
# Fit the vocabulary and IDF weights on the training text only, so no
# information from the held-out test reviews leaks into the features.
tfidf_vect_ngram.fit(X_train)

# Transform the training and test text with the fitted TF-IDF vectorizer
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
xtest_tfidf_ngram = tfidf_vect_ngram.transform(X_test)

In [None]:
# Create an SVM classifier with a linear kernel
# NOTE(review): kernel SVC training cost grows quickly with the number of
# samples; LinearSVC is already imported and may be a better fit at this data
# size, but it optimises a different objective — confirm before swapping.
clf = svm.SVC(kernel='linear')
# Train the model using the training set
clf.fit(xtrain_tfidf_ngram, y_train)
# Predict the response for the test dataset
y_pred = clf.predict(xtest_tfidf_ngram)

# Model evaluation. Metrics are kept as strings under the same names as before.
accuracy = str(metrics.accuracy_score(y_test, y_pred))
precision = str(metrics.precision_score(y_test, y_pred, average="macro"))
f1 = str(metrics.f1_score(y_test, y_pred, average="macro"))

# Report with print(): `logging` is not imported in this notebook, and
# DEBUG-level records are not emitted by an unconfigured root logger anyway.
print("train: "+ str(train_size) + "/ test: " + str(test_size))
print("Accuracy:" +  accuracy)
print("Precision:" +  precision)
print("F1:" + f1)