In [1]:
# basic lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import missingno as msn
import warnings
import random
import math

# setting
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 20)

In [2]:
from sklearn.model_selection import train_test_split
import re

In [3]:
dataset = pd.read_csv('preprocessedtrain_deep.csv',
                      index_col = 0,
                      converters = {'reviewTexttokenized': eval,
                                    'summarytokenized': eval}
                     )

In [4]:
dataset.sample(3)

Unnamed: 0,rating,reviewText,summary,reviewTextCharCount,overlapScore,summaryCharCount,reviewTextUpperCount,summaryUpperCount,reviewTexttokenized,summarytokenized
1791,1,i know this book got glowing reviews and i app...,did we read the same book ?,473,0.166667,26,8,6,"[know, book, get, glow, review, applaud, autho...","[read, book, ?]"
1474,5,"this book was such a great bit of enjoyment , ...",improbably witty,614,0.0,16,13,1,"[book, great, bit, enjoyment, immediately, go,...","[improbably, witty]"
7893,5,i read this book a few years ago and thoroughl...,reaction time,352,0.0,13,8,2,"[read, book, year, ago, thoroughly, enjoy, !, ...","[reaction, time]"


In [5]:
dataset['reviewCapitalPer'] = dataset['reviewTextUpperCount'] / dataset['reviewTextCharCount']
dataset['summaryCapitalPer'] = dataset['summaryUpperCount'] / dataset['summaryCharCount']

## TF-iDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
dataset['concattoken'] = dataset.reviewTexttokenized.apply(lambda x:x) + dataset.summarytokenized.apply(lambda x:x)

In [23]:
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV

allX = dataset['concattoken']# ,'reviewCapitalPer','summaryCapitalPer'
ally = dataset['rating']
# split into train and test set
X_tr_va, X_test, y_tr_va, y_test = train_test_split(allX, ally,
                                                    test_size=1/6,
                                                    random_state = 12, 
                                                    stratify = ally)

#X_tr_va.reset_index(drop = True, inplace = True)
#X_test.reset_index(drop = True, inplace = True)
#y_tr_va.reset_index(drop = True, inplace = True)
#y_test.reset_index(drop = True, inplace = True)

SKF = StratifiedKFold(n_splits = 5, random_state = 12, shuffle  = True)
TFVec = TfidfVectorizer(tokenizer=lambda x:x,
                        lowercase = False,
                        max_df = 0.95,
                        min_df = 10,
                        max_features = 1000,
                        ngram_range = (1,1))

my_pipeline = Pipeline([('vectorizer', TFVec),
                        ('GBR', GradientBoostingRegressor())
                       ])

searching_params = {
    'GBR__learning_rate': [0.01, 0.1, 0.2],
    'GBR__n_estimators' : [20, 50, 100, 200],
    'GBR__max_depth' : [3, 5, 8],
    'GBR__subsample' : [0.3, 0.5, 0.8]
}

grid_search = GridSearchCV(my_pipeline, param_grid=searching_params,
                           cv=SKF, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_tr_va, y_tr_va)
print(grid_search.best_params_)

{'GBR__learning_rate': 0.1, 'GBR__max_depth': 5, 'GBR__n_estimators': 200, 'GBR__subsample': 0.8}


In [24]:
pred_y = grid_search.best_estimator_.predict(X_test)
pred_y = pd.DataFrame(data=pred_y)
pred_y.columns=['pred_y']
dataset['rating'].value_counts()
w1 = 1700/9000
w2 = 1500/9000
w3 = 1200/9000
w4 = 2400/9000
w5 = 2200/9000

# calculate the percentile as threshold
threshold1 = w1
threshold2 = w1+w2
threshold3 = w1+w2+w3
threshold4 = w1+w2+w3+w4
print(threshold1,threshold2,threshold3,threshold4)

0.18888888888888888 0.3555555555555555 0.4888888888888888 0.7555555555555555


In [25]:
pred_y_sort = pred_y.sort_values(by='pred_y')
t1 = np.percentile(pred_y_sort,threshold1*100)
t2 = np.percentile(pred_y_sort,threshold2*100)
t3 = np.percentile(pred_y_sort,threshold3*100)
t4 = np.percentile(pred_y_sort,threshold4*100)
print(t1,t2,t3,t4)

def cate(x):
    if x <= t1:
        return 1
    elif x > t1 and x <= t2:
        return 2
    elif x > t2 and x <= t3:
        return 3
    elif x > t3 and x <= t4:
        return 4
    elif x > t4:
        return 5 

pred_y['pred_y_cate'] = pred_y['pred_y'].apply(cate)

2.2578587650309165 2.855154103788307 3.28347581046089 4.032337779804453


Unnamed: 0,pred_y,pred_y_cate
0,3.391115,4
1,3.733731,4
2,4.298083,5
3,2.946072,3
4,4.406705,5
...,...,...
1495,3.935506,4
1496,2.073924,1
1497,1.496146,1
1498,2.671115,2


In [27]:
y_test.reset_index(drop = True, inplace = True)
test_y = pd.DataFrame(data=y_test)
test_y.columns =['test_y']

est_reg = pd.concat([pred_y,test_y],axis=1)
est_reg

Unnamed: 0,pred_y,pred_y_cate,test_y
0,3.391115,4,4
1,3.733731,4,5
2,4.298083,5,4
3,2.946072,3,2
4,4.406705,5,4
...,...,...,...
1495,3.935506,4,4
1496,2.073924,1,1
1497,1.496146,1,1
1498,2.671115,2,4


## classification

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

In [30]:
my_pipeline_cla = Pipeline([('vectorizer', TFVec),
                            ('GBC', GradientBoostingClassifier())
                           ])

searching_params = {
    'GBC__learning_rate': [0.05, 0.1, 0.2],
    'GBC__n_estimators' : [20, 50, 100, 200, 400],
    'GBC__max_depth' : [3, 5, 8],
    'GBC__subsample' : [0.3, 0.5, 0.8]
}

grid_search_cla = GridSearchCV(my_pipeline_cla, param_grid=searching_params,
                               cv=SKF, n_jobs=-1, scoring='f1_weighted')
grid_search_cla.fit(X_tr_va, y_tr_va)
print(grid_search.best_params_)

{'GBR__learning_rate': 0.1, 'GBR__max_depth': 5, 'GBR__n_estimators': 200, 'GBR__subsample': 0.8}


In [32]:
pred_y= grid_search_cla.best_estimator_.predict(X_test)
pred_y = pd.DataFrame(data=pred_y)
pred_y.columns=['pred_y']
est_cla = pd.concat([pred_y,test_y],axis=1)
est_cla

Unnamed: 0,pred_y,test_y
0,3,4
1,2,5
2,4,4
3,2,2
4,5,4
...,...,...
1495,4,4
1496,2,1
1497,1,1
1498,4,4


## Model Evaluation

In [33]:
from sklearn.metrics import accuracy_score,f1_score

columns=[ 'Accuracy', 'F1 socre']
rows=['Regression', 'Classification']
results=pd.DataFrame(0.0, columns=columns, index=rows)

results.iloc[0,0] = accuracy_score(est_reg['test_y'], est_reg['pred_y_cate'])
results.iloc[0,1] = f1_score(est_reg['test_y'], est_reg['pred_y_cate'], average = 'macro')
results.iloc[1,0] = accuracy_score(est_cla['test_y'], est_cla['pred_y'])
results.iloc[1,1] = f1_score(est_cla['test_y'], est_cla['pred_y'], average = 'macro')

results.round(4)

Unnamed: 0,Accuracy,F1 socre
Regression,0.4267,0.4266
Classification,0.468,0.4603


In [71]:
subdataset = pd.read_csv('preprocessedtest_deep.csv',
                         index_col = 0,
                         converters = {'reviewTexttokenized': eval,
                                       'summarytokenized': eval}
                     )
subdataset['concattoken'] = subdataset.reviewTexttokenized.apply(lambda x:x) + subdataset.summarytokenized.apply(lambda x:x)

sub_X = subdataset['concattoken']# ,'reviewCapitalPer','summaryCapitalPer'

X_train_tran = TFVec.fit_transform(allX)
X_sub_tran = TFVec.transform(sub_X)
X_sub_tran = X_sub_tran.toarray()
#X_sub_tran = pd.DataFrame(data = X_sub_tran, columns = rs_names)

FinalModel = GradientBoostingClassifier(learning_rate=0.05, max_depth=8,
                                        n_estimators=400, subsample=0.3)

FinalModel.fit(X_train_tran, ally)

y_sub = FinalModel.predict(X_sub_tran)
outputdf = pd.DataFrame(data=y_sub)
outputdf.columns=['Prediction']
outputdf.to_csv('submission_GBreg.csv')