### Ensemble

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_features = pd.read_csv("final_model_df_wTextualCol.csv")
print(data_features.shape)
data_features.head()

(821313, 99)


Unnamed: 0,overall,helpful_label,wordcount_summary,wordcount_reviewText,avg_sent_length,ADJ_count,VERB_count,readability_summary,readability_reviewText,summary_reviewText,...,useful,defective,honest,late,personally,replace,responsive,return,review,Sub_Cat
0,5.0,1,4,22,7.33,1,4,21.6,10.19,HDMI Nook adapter cable I am using this with a...,...,0,0,0,0,0,0,0,0,0,0
1,2.0,0,3,22,22.0,2,4,14.53,16.07,Cheap proprietary scam The cable is very wobbl...,...,0,0,0,0,0,0,0,0,0,0
2,5.0,0,6,120,20.0,11,21,2.4,12.63,A Perfdect Nook HD+ hook up This adaptor is re...,...,0,0,0,0,0,0,0,0,0,1
3,4.0,0,6,84,14.0,11,12,9.07,22.51,A nice easy to use accessory. This adapter eas...,...,0,0,0,0,0,0,0,0,0,0
4,5.0,1,7,185,37.0,14,38,8.51,19.77,This works great but read the details... This ...,...,0,0,0,0,0,0,0,0,0,0


In [3]:
data_features = data_features.dropna()
data_features.shape

(821313, 99)

In [4]:
X = data_features.drop('helpful_label', axis=1).drop('summary_reviewText', axis=1)
print(X.shape)
y = data_features['helpful_label']
print(y.shape)

(821313, 97)
(821313,)


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5153)

### Model 1 --- Numerical Features derived from Review Text

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
features_logreg_pred_class = logreg.predict(X_test)
features_prob = logreg.predict_proba(X_test)
features_logreg_pred_prob = features_prob[:,1]

In [13]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [14]:
logreg_accuracy = accuracy_score(y_test, features_logreg_pred_class)
print('LogReg_feature Test Accuracy:', logreg_accuracy)
auc = metrics.roc_auc_score(y_test, features_logreg_pred_prob)
print('LogReg_feature Test AUC: ', auc)

LogReg_feature Test Accuracy: 0.7089305456121638
LogReg_feature Test AUC:  0.7084229342921956


### Model 2 --- CountVectorizer

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english', min_df=2)

In [35]:
X = data_features.summary_reviewText
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5153)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression()
logreg.fit(X_train_dtm, y_train)
cvect_logreg_pred_class = logreg.predict(X_test_dtm)
cvect_prob = logreg.predict_proba(X_test_dtm)
cvect_logreg_pred_prob = cvect_prob[:,1]

In [36]:
logreg_accuracy = accuracy_score(y_test, cvect_logreg_pred_class)
print('LogReg_countvect Test Accuracy:', logreg_accuracy)
auc = metrics.roc_auc_score(y_test, cvect_logreg_pred_prob)
print('LogReg_countvect Test AUC: ', auc)

LogReg_countvect Test Accuracy: 0.7220655630719479
LogReg_countvect Test AUC:  0.7252610221004843


### Model 3 --- TFIDF

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(max_features=5000, stop_words='english')

In [21]:
X = data_features.summary_reviewText
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5153)
y_train.shape
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [22]:
logreg = LogisticRegression()
logreg.fit(X_train_dtm, y_train)
tfidfvect_logreg_pred_class = logreg.predict(X_test_dtm)
tfidfvect_prob = logreg.predict_proba(X_test_dtm)
tfidfvect_logreg_pred_prob = tfidfvect_prob[:,1]

In [23]:
logreg_accuracy = accuracy_score(y_test, cvect_logreg_pred_class)
print('LogReg_tfidf Test Accuracy:', logreg_accuracy)
auc = metrics.roc_auc_score(y_test, tfidfvect_logreg_pred_prob)
print('LogReg_tfidf Test AUC: ', auc)

LogReg_tfidf Test Accuracy: 0.7076058423310881
LogReg_tfidf Test AUC:  0.7352403189692109


### Ensembling models 1, 2 and 3

In [37]:
print(features_prob[0, :])
print(cvect_prob[0, :])
print(tfidfvect_prob[0, :])

[0.75650107 0.24349893]
[0.73361028 0.26638972]
[0.68225411 0.31774589]


In [38]:
(features_prob[0, :] + cvect_prob[0, :] + tfidfvect_prob[0, :]) / 3

array([0.72412182, 0.27587818])

In [39]:
new_pred_prob = pd.DataFrame((features_prob + cvect_prob + tfidfvect_prob) / 3, columns=logreg.classes_)
new_pred_prob1 = (features_logreg_pred_prob + cvect_logreg_pred_prob + tfidfvect_logreg_pred_prob) / 3
new_pred_prob.head()

Unnamed: 0,0,1
0,0.724122,0.275878
1,0.803021,0.196979
2,0.301975,0.698025
3,0.273505,0.726495
4,0.743084,0.256916


In [40]:
new_pred_class = new_pred_prob.apply(np.argmax, axis=1)
new_pred_class.head()

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return getattr(obj, method)(*args, **kwds)


0    0
1    0
2    1
3    1
4    0
dtype: int64

In [41]:
logreg_accuracy = accuracy_score(y_test, new_pred_class)
print('Ensemble Test Accuracy:', logreg_accuracy)

auc = metrics.roc_auc_score(y_test, new_pred_prob1)
print('Ensemble Test AUC: ', auc)

Ensemble Test Accuracy: 0.7229860370429896
Ensemble Test AUC:  0.7395847851190251
