# SVM and other simple classifiers for scoring 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import pickle
from collections import Counter

%matplotlib inline

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
sys.path.append(os.path.dirname(os.path.abspath("./aml/src/prep/utils.py")))

In [4]:
import utils

### Load data from csv file

In [5]:
# Read CSV file with preprocessed reviews into a DataFrame
path = '../data/processed_reviews.csv'
proc_reviews = pd.read_csv(path)

In [6]:
proc_reviews.head()

Unnamed: 0,ProductId,UserId,Time,SentimentPolarity,Class_Labels,Sentiment,Usefulness,CleanedText,Score
0,B001E4KFG0,A3SGXH7AUHU8GW,2011-04-27,Positive,1,positive,>75%,bought sever vital can dog food product found ...,5
1,B00813GRG4,A1D87F6ZCVE5NK,2012-09-07,Negative,0,negative,useless,product arriv label jumbo salt peanut peanut a...,1
2,B000LQOCH0,ABXLMWJIXXAIN,2008-08-18,Positive,1,positive,>75%,confect around centuri light pillowi citrus ge...,4
3,B000UA0QIQ,A395BORC6FGVXV,2011-06-13,Negative,0,negative,>75%,look secret ingredi robitussin believ found go...,2
4,B006K2ZZ7K,A1UQRSCLF8GW1T,2012-10-21,Positive,1,positive,useless,great taffi great price wide assort yummi taff...,5


In [7]:
proc_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393890 entries, 0 to 393889
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   ProductId          393890 non-null  object
 1   UserId             393890 non-null  object
 2   Time               393890 non-null  object
 3   SentimentPolarity  393890 non-null  object
 4   Class_Labels       393890 non-null  int64 
 5   Sentiment          393890 non-null  object
 6   Usefulness         393890 non-null  object
 7   CleanedText        393884 non-null  object
 8   Score              393890 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 27.0+ MB


In [8]:
proc_reviews = proc_reviews[proc_reviews['CleanedText'].notna()]

In [9]:
proc_reviews.head()

Unnamed: 0,ProductId,UserId,Time,SentimentPolarity,Class_Labels,Sentiment,Usefulness,CleanedText,Score
0,B001E4KFG0,A3SGXH7AUHU8GW,2011-04-27,Positive,1,positive,>75%,bought sever vital can dog food product found ...,5
1,B00813GRG4,A1D87F6ZCVE5NK,2012-09-07,Negative,0,negative,useless,product arriv label jumbo salt peanut peanut a...,1
2,B000LQOCH0,ABXLMWJIXXAIN,2008-08-18,Positive,1,positive,>75%,confect around centuri light pillowi citrus ge...,4
3,B000UA0QIQ,A395BORC6FGVXV,2011-06-13,Negative,0,negative,>75%,look secret ingredi robitussin believ found go...,2
4,B006K2ZZ7K,A1UQRSCLF8GW1T,2012-10-21,Positive,1,positive,useless,great taffi great price wide assort yummi taff...,5


### Train - validation - test split

In [10]:
data_train, data_val_test, Y_train, Y_val_test = train_test_split(
    proc_reviews[['CleanedText','Class_Labels']], proc_reviews['Class_Labels'], test_size=0.3, random_state=42, stratify=proc_reviews['Class_Labels']
)

In [11]:
data_val, data_test, Y_val, Y_test = train_test_split(
    data_val_test['CleanedText'], Y_val_test, test_size=0.333, random_state=42, stratify=data_val_test['Class_Labels']
)

In [12]:
# Separate text from score on X_train (score was just kept for the second splitting)
data_train = data_train['CleanedText']

In [13]:
print(data_train.shape,Y_train.shape)
print(data_val.shape,Y_val.shape)
print(data_test.shape,Y_test.shape)

(275718,) (275718,)
(78816,) (78816,)
(39350,) (39350,)


In [14]:
# Double check stratification is ok over the three splits

all_counter = Counter(list(proc_reviews['Score']))
train_counter = Counter(list(Y_train))
val_counter = Counter(list(Y_val))
test_counter = Counter(list(Y_test))

num_all = len(list(proc_reviews['Score']))
num_train = len(list(Y_train))
num_val = len(list(Y_val))
num_test = len(list(Y_test))

print(all_counter)
print(train_counter)
print(val_counter)
print(test_counter)

print(f"overall fractions = {all_counter[5]/num_all*100:.2f}% 5 - {all_counter[4]/num_all*100:.2f}% 4 - {all_counter[3]/num_all*100:.2f}% 3 - {all_counter[2]/num_all*100:.2f}% 2 - {all_counter[1]/num_all*100:.2f}% 1")
print(f"train fractions = {train_counter[5]/num_train*100:.2f}% 5 - {train_counter[4]/num_train*100:.2f}% 4 - {train_counter[3]/num_train*100:.2f}% 3 - {train_counter[2]/num_train*100:.2f}% 2 - {train_counter[1]/num_train*100:.2f}% 1")
print(f"val fractions = {val_counter[5]/num_val*100:.2f}% 5 - {val_counter[4]/num_val*100:.2f}% 4 - {val_counter[3]/num_val*100:.2f}% 3 - {val_counter[2]/num_val*100:.2f}% 2 - {val_counter[1]/num_val*100:.2f}% 1")
print(f"test fractions = {test_counter[5]/num_test*100:.2f}% 5 - {test_counter[4]/num_test*100:.2f}% 4 - {test_counter[3]/num_test*100:.2f}% 3 - {test_counter[2]/num_test*100:.2f}% 2 - {test_counter[1]/num_test*100:.2f}% 1")

Counter({5: 250928, 4: 56086, 1: 36301, 3: 29768, 2: 20801})
Counter({1: 214909, 0: 60809})
Counter({1: 61433, 0: 17383})
Counter({1: 30672, 0: 8678})
overall fractions = 63.71% 5 - 14.24% 4 - 7.56% 3 - 5.28% 2 - 9.22% 1
train fractions = 0.00% 5 - 0.00% 4 - 0.00% 3 - 0.00% 2 - 77.95% 1
val fractions = 0.00% 5 - 0.00% 4 - 0.00% 3 - 0.00% 2 - 77.94% 1
test fractions = 0.00% 5 - 0.00% 4 - 0.00% 3 - 0.00% 2 - 77.95% 1


### Text vectorization strategy

* TfidfVectorizer/CountVectorizer
* FastVec

In [15]:
# Save vectorizer model
# Use parameters in filename to identify what was used

def get_fname(vect_type, params_dic):

    fname = ""

    for k,v in params_dic.items():
        
        fname += k + "_" + str(v) + "_"

    fname = fname.replace("(", "_")
    fname = fname.replace(")", "_")
    fname = fname.replace(",", "_")
    fname = fname.replace(" ", "")
    fname = fname.replace("__", "_")

    fname = vect_type + '_' + fname[:-1]

    return fname

def save_vectorizer(data_path, _vectorizer, vect_type, params_dic):

    vect_fname = get_fname(vect_type, params_dic)+'.pkl'

    os.makedirs(data_path+'vectorizers/',exist_ok=True)

    with open(data_path+'vectorizers/'+ vect_fname, 'wb') as f:
        pickle.dump(_vectorizer, f, protocol=pickle.HIGHEST_PROTOCOL)


In [16]:
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vect = utils.doc_vectorizer(data_train, data_val, data_test,
"tfidf", {'min_df':1, 'ngram_range':(1,3)})

#X_train_tfidf, X_val_tfidf, X_test_tfidf = utils.doc_vectorizer(data_train, data_val, data_test,
#"tfidf", {'min_df':1, 'ngram_range':(1,4), 'max_features':100000})

In [17]:
X_train_tfidf.shape

(275718, 10145002)

In [18]:
# Save vectorizer
data_path = '../data/bin_model/'
save_vectorizer(data_path, tfidf_vect, "tfidf_vect", {'min_df':1, 'ngram_range':(1,3)})

In [None]:
X_train_doc2vec, X_val_doc2vec, X_test_doc2vec, doc2vec_model = utils.doc_vectorizer(data_train, data_val, data_test, "doc2vec",
                                                                      {'vector_size':2000, 'window':3, 'min_count':4, 'workers':2, 'epochs':10})

In [None]:
X_train_cntVec, X_val_cntVec, X_test_cntVec, countVec = utils.doc_vectorizer(data_train, data_val, data_test, "countVec",
                                                                      {'min_df':1, 'ngram_range':(1,3)})

#X_train_cntVec, X_val_cntVec, X_test_cntVec = utils.doc_vectorizer(data_train, data_val, data_test, "countVec",
#                                                                      {'stop_words': None, 'min_df':1, 'max_df':1,
#                                                                       'ngram_range':(1,3), 'max_features': None})

In [None]:
# Check how many features we are left with 
print(X_train_tfidf.shape)
#print(X_train_doc2vec.shape)
#print(X_train_cntVec.shape)

### Models analysis and Evaluation

In [19]:
# Import models to try
from sklearn.svm import LinearSVC # Support vector machine
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [20]:
def build_svm(random_state=42, tol=1e-3, class_weight='balanced'):

    return LinearSVC(random_state=random_state, tol=tol, class_weight=class_weight)

def train_svm(_model, X_train, Y_train, X_val, Y_val, X_test, Y_test):

    _model.fit(X_train,Y_train)

    Y_train_pred = _model.predict(X_train)

    train_acc = accuracy_score(Y_train,Y_train_pred)

    Y_val_pred = _model.predict(X_val)

    val_acc = accuracy_score(Y_val,Y_val_pred)

    Y_test_pred = _model.predict(X_test)

    test_acc = accuracy_score(Y_test,Y_test_pred)

    print(f"train_acc: {train_acc}")
    print(f"val_acc: {val_acc}")
    print(f"test_acc: {test_acc}")

    return _model, val_acc, test_acc    

In [21]:
# Save model and metrics
def save_model_metrics(data_path, _model, model_type, model_params, X_train, Y_train, X_val, Y_val, X_test, Y_test):

    report_name = get_fname(model_type, model_params)

    Y_train_pred = _model.predict(X_train)

    Y_val_pred = _model.predict(X_val)

    Y_test_pred = _model.predict(X_test)


    train_acc = accuracy_score(Y_train,Y_train_pred)

    val_acc = accuracy_score(Y_val,Y_val_pred)

    test_acc = accuracy_score(Y_test,Y_test_pred)

    report_train = classification_report(Y_train,Y_train_pred, output_dict=True)

    report_val = classification_report(Y_val,Y_val_pred, output_dict=True)

    report_test = classification_report(Y_test,Y_test_pred, output_dict=True)

    #df_val = pd.DataFrame(report_val)#.transpose()

    #df_test = pd.DataFrame(report_test)#.transpose()

    reports_path = data_path + report_name + "/"

    os.makedirs(reports_path,exist_ok=True)

    report_path_train = reports_path + 'report_val_' + report_name + '.pkl'
    report_path_val = reports_path + 'report_val_' + report_name + '.pkl'
    report_path_test = reports_path + 'report_test_' + report_name + '.pkl'

    with open(report_path_train, 'wb') as f:
        pickle.dump(report_train, f, protocol=pickle.HIGHEST_PROTOCOL)

    with open(report_path_val, 'wb') as f:
        pickle.dump(report_val, f, protocol=pickle.HIGHEST_PROTOCOL)

    with open(report_path_test, 'wb') as f:
        pickle.dump(report_test, f, protocol=pickle.HIGHEST_PROTOCOL)

    #df_val = pd.DataFrame.from_dict(report_val)
    #df_val.to_csv(report_path_val, index = False)

    #df_test = pd.DataFrame.from_dict(report_val)
    #df_test.to_csv(report_path_test, index = False)

    print("Train dset metrics:")
    print(classification_report(Y_train,Y_train_pred))    
    print()
    print("Validation dset metrics:")
    print(classification_report(Y_val,Y_val_pred))    
    print()
    print("Test dset metrics:")
    print(classification_report(Y_test,Y_test_pred))    
    print()

    confusion_m = confusion_matrix(Y_test, Y_test_pred)

    print("Test confusion matrix")
    print(confusion_m)

    with open(reports_path + 'confusion_matrix_test.pkl', 'wb') as f:
        pickle.dump(confusion_m, f, protocol=pickle.HIGHEST_PROTOCOL)

    return

In [22]:
tfidf_params = {"random_state":42, "tol":1e-4, "class_weight":'balanced'}

model_tfidf = build_svm(**tfidf_params)

model_tfidf, val_acc, test_acc = train_svm(model_tfidf, X_train_tfidf, Y_train, X_val_tfidf, Y_val, X_test_tfidf, Y_test)



train_acc: 0.9988067518261412
val_acc: 0.9066179455948031
test_acc: 0.9049809402795426


In [23]:
data_path = '../data/bin_model/'

save_model_metrics(data_path, model_tfidf, "model_tfidf", tfidf_params, X_train_tfidf, Y_train, X_val_tfidf, Y_val, X_test_tfidf, Y_test)

TypeError: save_model_metrics() missing 2 required positional arguments: 'X_test' and 'Y_test'

In [None]:
model_cntVec = build_svm(random_state=42, tol=1e-3, class_weight='balanced')

model_cntVec, val_acc_2, test_acc_2 = train_svm(model_cntVec, X_train_cntVec, Y_train, X_val_cntVec, Y_val, X_test_cntVec, Y_test)

In [None]:
val_acc_2, test_acc_2

In [None]:
model_doc2vec = build_svm(random_state=42, tol=1e-3, class_weight='balanced')

model_doc2vec, val_acc_3, test_acc_3 = train_svm(model_doc2vec, X_train_doc2vec, Y_train, X_val_doc2vec, Y_val, X_test_doc2vec, Y_test)

In [None]:
val_acc_3, test_acc_3

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [None]:
gb_model = GradientBoostingClassifier(
    n_estimators=10, max_depth=3, random_state=10
)

In [None]:
gb = gb_model.fit(X_train_tfidf, Y_train)

In [None]:
Y_val_pred = gb.predict(X_val_tfidf)

Y_test_pred = gb.predict(X_test_tfidf)


val_acc = accuracy_score(Y_val,Y_val_pred)

test_acc = accuracy_score(Y_test,Y_test_pred)

print(val_acc, test_acc)


In [None]:
rf_model = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=0)

In [None]:
rf = rf_model.fit(X_train_tfidf, Y_train)