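# Gradient Boosting review-score classifier

Predicts the review score (1–5) from the lemmatized review text using scikit-learn's `GradientBoostingClassifier`.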

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import utils

%matplotlib inline

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from collections import Counter

### Load data from CSV file

In [5]:
# Load the preprocessed reviews from disk into a DataFrame.
# NOTE(review): the preview of this frame shows 'Unnamed: 0' and 'Unnamed: 0.1'
# columns, which looks like an index written to the CSV upstream - confirm.
path = './review-analysis-teamc/data/processed_reviews.csv'
proc_reviews = pd.read_csv(filepath_or_buffer=path)

In [6]:
# Preview the first five rows (5 is also the pandas default for head()).
proc_reviews.head(5)

Unnamed: 0.1,Unnamed: 0,id,product_id,user_id,helpfulness_numerator,helpfulness_denominator,score,time,summary,text,date,year,sentiment,helpfulness_ratio,word_count,duplicated,preprocessed_text,lemmatized_text
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,2011-04-27,2011,positive,1.0,48,False,bought several vitality canned dog food produc...,bought several vitality canned dog food produc...
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,2012-09-07,2012,negative,,31,False,product arrived labeled jumbo salted peanuts ....,product arrived labeled jumbo salted peanut .....
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,2008-08-18,2008,positive,1.0,94,False,confection around centuries light pillowy citr...,confection around century light pillowy citrus...
3,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,2012-10-21,2012,positive,,27,False,great taffy great price wide assortment yummy ...,great taffy great price wide assortment yummy ...
4,5,6,B006K2ZZ7K,ADT0SRK1MGOEU,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...,2012-07-12,2012,positive,,72,False,got wild hair taffy ordered five pound bag taf...,got wild hair taffy ordered five pound bag taf...


### Train / validation / test split (≈ 80% / 13.3% / 6.7%, stratified on score)

In [7]:
# First split: hold out 20% of the reviews, stratified on the score.
# The 'score' column travels with the text so that the hold-out part can be
# stratified again when it is divided into validation and test sets.
score_labels = proc_reviews['score']
data_train, data_val_test, Y_train, Y_val_test = train_test_split(
    proc_reviews[['lemmatized_text', 'score']],
    score_labels,
    test_size=0.2,
    random_state=42,
    stratify=score_labels,
)

In [8]:
# Second split: divide the 20% hold-out into validation and test sets.
# test_size=0.333 sends roughly one third of the hold-out to the test set,
# i.e. about 13.3% of all reviews for validation and 6.7% for testing.
holdout_scores = data_val_test['score']
data_val, data_test, Y_val, Y_test = train_test_split(
    data_val_test['lemmatized_text'],
    Y_val_test,
    test_size=0.333,
    random_state=42,
    stratify=holdout_scores,
)

In [9]:
# Keep only the lemmatized text of the training split; the 'score' column was
# carried through the first split solely to stratify the second one.
# Guarded so that re-running this cell is a no-op: after the first run
# data_train is already a Series and indexing it by column name would raise
# a KeyError.
if isinstance(data_train, pd.DataFrame):
    data_train = data_train['lemmatized_text']

In [10]:
# Report the container type and the shape of every split, texts next to labels.
split_texts = (data_train, data_val, data_test)
split_labels = (Y_train, Y_val, Y_test)

print(*(type(part) for part in split_texts))
print(*(type(part) for part in split_labels))

for text_part, label_part in zip(split_texts, split_labels):
    print(text_part.shape, label_part.shape)

<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
(315091,) (315091,)
(52541,) (52541,)
(26232,) (26232,)


In [11]:
# Sanity check: each split should reproduce the overall score distribution.

def _format_score_fractions(split_name, score_counts, n_reviews):
    """Build the '<split_name> fractions = ...' line for one split.

    score_counts maps each score to its number of occurrences and n_reviews is
    the total number of reviews in the split. Scores are listed from 5 down to 1,
    each as a percentage with two decimals.
    """
    shares = " - ".join(
        f"{score_counts[score]/n_reviews*100:.2f}% {score}" for score in (5, 4, 3, 2, 1)
    )
    return f"{split_name} fractions = {shares}"


# Occurrences of each score, overall and per split.
all_counter = Counter(proc_reviews['score'].tolist())
train_counter = Counter(Y_train.tolist())
val_counter = Counter(Y_val.tolist())
test_counter = Counter(Y_test.tolist())

# Number of reviews, overall and per split.
num_all = len(proc_reviews['score'])
num_train = len(Y_train)
num_val = len(Y_val)
num_test = len(Y_test)

split_summaries = (
    ("overall", all_counter, num_all),
    ("train", train_counter, num_train),
    ("val", val_counter, num_val),
    ("test", test_counter, num_test),
)

# Raw counts first, then the percentage breakdown of every split.
for summary in split_summaries:
    print(summary[1])

for split_name, split_counter, split_size in split_summaries:
    print(_format_score_fractions(split_name, split_counter, split_size))

Counter({5: 250902, 4: 56089, 1: 36299, 3: 29771, 2: 20803})
Counter({5: 200722, 4: 44871, 1: 29039, 3: 23817, 2: 16642})
Counter({5: 33470, 4: 7482, 1: 4843, 3: 3971, 2: 2775})
Counter({5: 16710, 4: 3736, 1: 2417, 3: 1983, 2: 1386})
overall fractions = 63.70% 5 - 14.24% 4 - 7.56% 3 - 5.28% 2 - 9.22% 1
train fractions = 63.70% 5 - 14.24% 4 - 7.56% 3 - 5.28% 2 - 9.22% 1
val fractions = 63.70% 5 - 14.24% 4 - 7.56% 3 - 5.28% 2 - 9.22% 1
test fractions = 63.70% 5 - 14.24% 4 - 7.56% 3 - 5.28% 2 - 9.21% 1


### Text vectorization strategy

* TfidfVectorizer
* Doc2Vec (via `utils.doc_vectorizer`, the strategy used in the cell below)

In [12]:
# Alternative strategy (TF-IDF over 1- to 3-grams), currently disabled:
# X_train, X_val, X_test = utils.series2tfidf_vecs(data_train, data_val, data_test, min_df=1, ngram_range=(1,3))

# Active strategy: "doc2vec" vectors produced by the project's utils.doc_vectorizer helper.
# NOTE(review): these look like gensim Doc2Vec hyperparameters - confirm in utils.
doc2vec_params = {
    'vector_size': 200,
    'window': 5,
    'min_count': 1,
    'workers': 4,
    'epochs': 20,
}
X_train, X_val, X_test = utils.doc_vectorizer(
    data_train, data_val, data_test, "doc2vec", doc2vec_params
)

: 

### Model analysis and evaluation

In [4]:
# Import models to try
import xgboost
import sklearn 
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [None]:
# NOTE: despite the `xgb_` prefix this is scikit-learn's GradientBoostingClassifier,
# not an XGBoost model; the name is kept because the following cells refer to it.
gboost_params = {
    'n_estimators': 10,  # number of boosting stages
    'max_depth': 3,      # depth limit of each individual tree
    'random_state': 10,  # fixed seed for reproducible fits
}
xgb_model = GradientBoostingClassifier(**gboost_params)


In [None]:
# Fit on the training vectors; the returned estimator is bound to `_` so the
# cell does not echo its repr.
_ = xgb_model.fit(X=X_train, y=Y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Predict a score for every review in the validation split.
Y_pred = xgb_model.predict(X=X_val)

In [None]:
# Overall accuracy on the validation split.
print(accuracy_score(Y_val,Y_pred))

# Accuracy alone is misleading here: about 63.7% of the reviews have score 5,
# so a model that always predicts 5 already reaches ~0.637 accuracy on the
# validation split (33470 of 52541 reviews). The per-class precision/recall
# report shows whether the minority scores are being predicted at all.
# zero_division=0 reports 0 (without a warning) for classes that are never predicted.
print(classification_report(Y_val, Y_pred, zero_division=0))

0.6370263223006795
