# SVM and other simple classifiers for scoring 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
%matplotlib inline

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
from collections import Counter

In [4]:
# Make the project's `utils` module importable from this notebook.
# The directory is resolved against the current working directory, so the
# notebook has to be started from the folder that contains `aml/`.
utils_dir = os.path.dirname(os.path.abspath("./aml/src/prep/utils.py"))
# Guard the append so that re-running this cell does not stack duplicate entries on sys.path.
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

In [5]:
import utils

### Load data from csv file

In [6]:
# Read the CSV file with the preprocessed reviews into a DataFrame.
# NOTE(review): the path is relative to the current working directory — confirm it is
# consistent with the working directory assumed by the sys.path setup cell above.
path = '../data/processed_reviews.csv'
proc_reviews = pd.read_csv(path)

In [7]:
proc_reviews.head()

Unnamed: 0,ProductId,UserId,Time,SentimentPolarity,Class_Labels,Sentiment,Score,Usefulness,CleanedText
0,B001E4KFG0,A3SGXH7AUHU8GW,2011-04-27,Positive,1,positive,5,>75%,bought sever vital can dog food product found ...
1,B00813GRG4,A1D87F6ZCVE5NK,2012-09-07,Negative,0,negative,1,useless,product arriv label jumbo salt peanut peanut a...
2,B000LQOCH0,ABXLMWJIXXAIN,2008-08-18,Positive,1,positive,4,>75%,confect around centuri light pillowi citrus ge...
3,B000UA0QIQ,A395BORC6FGVXV,2011-06-13,Negative,0,negative,2,>75%,look secret ingredi robitussin believ found go...
4,B006K2ZZ7K,A1UQRSCLF8GW1T,2012-10-21,Positive,1,positive,5,useless,great taffi great price wide assort yummi taff...


In [8]:
proc_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393890 entries, 0 to 393889
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   ProductId          393890 non-null  object
 1   UserId             393890 non-null  object
 2   Time               393890 non-null  object
 3   SentimentPolarity  393890 non-null  object
 4   Class_Labels       393890 non-null  int64 
 5   Sentiment          393890 non-null  object
 6   Score              393890 non-null  int64 
 7   Usefulness         393890 non-null  object
 8   CleanedText        393884 non-null  object
dtypes: int64(2), object(7)
memory usage: 27.0+ MB


In [9]:
# Drop the few reviews whose cleaned text is missing (remaining rows keep their index labels)
proc_reviews = proc_reviews.dropna(subset=['CleanedText'])

In [10]:
proc_reviews.head()

Unnamed: 0,ProductId,UserId,Time,SentimentPolarity,Class_Labels,Sentiment,Score,Usefulness,CleanedText
0,B001E4KFG0,A3SGXH7AUHU8GW,2011-04-27,Positive,1,positive,5,>75%,bought sever vital can dog food product found ...
1,B00813GRG4,A1D87F6ZCVE5NK,2012-09-07,Negative,0,negative,1,useless,product arriv label jumbo salt peanut peanut a...
2,B000LQOCH0,ABXLMWJIXXAIN,2008-08-18,Positive,1,positive,4,>75%,confect around centuri light pillowi citrus ge...
3,B000UA0QIQ,A395BORC6FGVXV,2011-06-13,Negative,0,negative,2,>75%,look secret ingredi robitussin believ found go...
4,B006K2ZZ7K,A1UQRSCLF8GW1T,2012-10-21,Positive,1,positive,5,useless,great taffi great price wide assort yummi taff...


### Train - validation - test split

In [11]:
# First split: hold out 30% of the reviews (validation + test together), stratified on Score.
# 'Score' is kept as a column of the feature frame so that the held-out part can be
# stratified again in the second split.
data_train, data_val_test, Y_train, Y_val_test = train_test_split(
    proc_reviews[['CleanedText','Score']], proc_reviews['Score'], test_size=0.3, random_state=42, stratify=proc_reviews['Score']
)

In [12]:
# Second split: about a third of the held-out 30% becomes the test set (~10% of all reviews),
# the remainder is the validation set (~20%); stratified on Score again.
data_val, data_test, Y_val, Y_test = train_test_split(
    data_val_test['CleanedText'], Y_val_test, test_size=0.333, random_state=42, stratify=data_val_test['Score']
)

In [13]:
# Keep only the cleaned text in data_train (the Score column was carried through the first split
# so the held-out part could be stratified in the second one).
# NOTE: this cell is not idempotent — once data_train is a Series, running it again fails;
# re-run the first split cell before re-running this one.
data_train = data_train['CleanedText']

In [14]:
# Sanity check: container types first (texts, then labels) ...
print(*(type(_texts) for _texts in (data_train, data_val, data_test)))
print(*(type(_labels) for _labels in (Y_train, Y_val, Y_test)))

# ... then texts/labels shapes side by side for train, validation and test
for _split_texts, _split_labels in ((data_train, Y_train), (data_val, Y_val), (data_test, Y_test)):
    print(_split_texts.shape, _split_labels.shape)

<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
(275718,) (275718,)
(78816,) (78816,)
(39350,) (39350,)


In [15]:
# Double check stratification is ok over the three splits

# Score distribution of the full dataset and of each split
all_counter = Counter(proc_reviews['Score'].tolist())
train_counter = Counter(Y_train.tolist())
val_counter = Counter(Y_val.tolist())
test_counter = Counter(Y_test.tolist())

# Number of reviews behind each distribution
num_all = len(proc_reviews['Score'])
num_train = len(Y_train)
num_val = len(Y_val)
num_test = len(Y_test)

# (label used in the printed report, score counts, number of reviews)
_stratification_report = (
    ("overall", all_counter, num_all),
    ("train", train_counter, num_train),
    ("val", val_counter, num_val),
    ("test", test_counter, num_test),
)

# Raw counts first ...
for _split_name, _split_counts, _split_total in _stratification_report:
    print(_split_counts)

# ... then the percentage of every score (5 down to 1); these should agree across splits
for _split_name, _split_counts, _split_total in _stratification_report:
    _split_shares = " - ".join(
        f"{_split_counts[_score]/_split_total*100:.2f}% {_score}" for _score in (5, 4, 3, 2, 1)
    )
    print(f"{_split_name} fractions = {_split_shares}")

Counter({5: 250928, 4: 56086, 1: 36301, 3: 29768, 2: 20801})
Counter({5: 175649, 4: 39260, 1: 25411, 3: 20837, 2: 14561})
Counter({5: 50211, 4: 11223, 1: 7263, 3: 5957, 2: 4162})
Counter({5: 25068, 4: 5603, 1: 3627, 3: 2974, 2: 2078})
overall fractions = 63.71% 5 - 14.24% 4 - 7.56% 3 - 5.28% 2 - 9.22% 1
train fractions = 63.71% 5 - 14.24% 4 - 7.56% 3 - 5.28% 2 - 9.22% 1
val fractions = 63.71% 5 - 14.24% 4 - 7.56% 3 - 5.28% 2 - 9.22% 1
test fractions = 63.71% 5 - 14.24% 4 - 7.56% 3 - 5.28% 2 - 9.22% 1


### Text vectorization strategy

* TfidfVectorizer/CountVectorizer
* Doc2Vec

In [None]:
# TF-IDF features for the three splits: n-grams of length 1 to 4, at most 100,000 features.
# NOTE(review): utils.doc_vectorizer is not shown in this notebook; presumably the option dict is
# forwarded to sklearn's TfidfVectorizer and the vectorizer is fitted on data_train only — confirm in utils.py.
X_train_tfidf, X_val_tfidf, X_test_tfidf = utils.doc_vectorizer(data_train, data_val, data_test,
"tfidf", {'min_df':1, 'ngram_range':(1,4), 'max_features':100000})

In [16]:
# Doc2Vec embeddings for the three splits (the options look like gensim Doc2Vec keyword arguments — TODO confirm).
# NOTE(review): the saved output of this cell is
#   AttributeError: 'str' object has no attribute 'words'
# which suggests plain strings are being passed where objects exposing a `.words` attribute
# (e.g. tagged documents) are expected — verify inside utils.doc_vectorizer before relying on X_*_doc2vec.
X_train_doc2vec, X_val_doc2vec, X_test_doc2vec = utils.doc_vectorizer(data_train, data_val, data_test, "doc2vec",
                                                                      {'vector_size':2000, 'window':3, 'min_count':4, 'workers':2, 'epochs':10})

AttributeError: 'str' object has no attribute 'words'

In [None]:
# Count-based features for the three splits over n-grams of length 1 to 3
# (options presumably forwarded to sklearn's CountVectorizer — confirm in utils.py).
X_train_cntVec, X_val_cntVec, X_test_cntVec = utils.doc_vectorizer(data_train, data_val, data_test, "countVec",
                                                                      {'min_df':1, 'ngram_range':(1,3)})

# Earlier variant, kept disabled for reference.
# NOTE(review): if these options reach sklearn's CountVectorizer, the integer max_df=1 would discard
# every term that occurs in more than one document — probably not what was intended.
#X_train_cntVec, X_val_cntVec, X_test_cntVec = utils.doc_vectorizer(data_train, data_val, data_test, "countVec",
#                                                                      {'stop_words': None, 'min_df':1, 'max_df':1,
#                                                                       'ngram_range':(1,3), 'max_features': None})

In [None]:
# Check how many features we are left with after vectorization.
# Only the TF-IDF matrix is inspected; the doc2vec and countVec checks are disabled.
print(X_train_tfidf.shape)
#print(X_train_doc2vec.shape)
#print(X_train_cntVec.shape)

### Models analysis and Evaluation

In [None]:
# Import models to try
from sklearn.svm import LinearSVC # Support vector machine
from sklearn.metrics import accuracy_score, classification_report

In [None]:
def build_svm(random_state=42, tol=1e-3, class_weight='balanced'):
    """Create an unfitted linear support vector classifier.

    Parameters
    ----------
    random_state : default 42
        Forwarded to ``LinearSVC`` as ``random_state``.
    tol : default 1e-3
        Forwarded to ``LinearSVC`` as ``tol``.
    class_weight : default 'balanced'
        Forwarded to ``LinearSVC`` as ``class_weight``.

    Returns
    -------
    LinearSVC
        A new estimator configured with the given options; it is not trained here.
    """
    svm_options = {
        'random_state': random_state,
        'tol': tol,
        'class_weight': class_weight,
    }
    return LinearSVC(**svm_options)

def train_svm(_model, X_train, Y_train, X_val, Y_val, X_test, Y_test):
    """Fit a model on the training split and score it on validation and test.

    Parameters
    ----------
    _model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, Y_train
        Training features and labels passed to ``_model.fit``.
    X_val, Y_val
        Validation features and labels.
    X_test, Y_test
        Test features and labels.

    Returns
    -------
    tuple
        ``(_model, validation accuracy, test accuracy)``, where both accuracies
        come from ``accuracy_score`` on the model's predictions.
    """
    _model.fit(X_train, Y_train)

    # Score the held-out splits in a fixed order: validation first, then test.
    held_out_splits = ((X_val, Y_val), (X_test, Y_test))
    val_score, test_score = (
        accuracy_score(true_labels, _model.predict(features))
        for features, true_labels in held_out_splits
    )

    return _model, val_score, test_score

In [None]:
# Linear SVM on the TF-IDF features; tol is set to 1e-4 here instead of build_svm's default of 1e-3.
model_tfidf = build_svm(random_state=42, tol=1e-4, class_weight='balanced')

# Fit on the training split and collect validation / test accuracy.
model_tfidf, val_acc, test_acc = train_svm(model_tfidf, X_train_tfidf, Y_train, X_val_tfidf, Y_val, X_test_tfidf, Y_test)

In [None]:
val_acc, test_acc

In [None]:
# Linear SVM on the count-vectorizer features.
# NOTE(review): tol is 1e-3 here but 1e-4 for the TF-IDF model — confirm the difference is intentional,
# since it makes the two accuracy pairs not strictly like-for-like.
model_cntVec = build_svm(random_state=42, tol=1e-3, class_weight='balanced')

# Fit on the training split and collect validation / test accuracy.
model_cntVec, val_acc_2, test_acc_2 = train_svm(model_cntVec, X_train_cntVec, Y_train, X_val_cntVec, Y_val, X_test_cntVec, Y_test)

In [None]:
val_acc_2, test_acc_2

In [None]:
# Test normalization of features
