<h3>Libraries</h3>

In [1]:
from dataCleaning import *
from dataPreprocessing import *

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline   
from sklearn.ensemble import StackingClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
df = dataCleaning('./CSV/iphone_dataset.csv')
mydf = df.copy(deep = True)

In [3]:
mydf.drop(mydf[(mydf['review_helpful_vote'] == 0) & (mydf['review_text'].apply(lambda x:len(x.split())) < 10)].index,inplace = True)

In [4]:
avg_rating = mydf['review_rating'].mean()
mydf['rating_diff'] = abs(mydf['review_rating'] - avg_rating)

<h3>VADER</h3>

In [5]:
vds = SentimentIntensityAnalyzer()

In [6]:
mydf['scores'] = mydf['review_text'].apply(lambda review:vds.polarity_scores(review))
mydf['compound'] = mydf['scores'].apply(lambda x:x['compound'])
mydf['vader_score'] = mydf['compound'].apply(lambda x:1 if x >= 0  else 0)
mydf = mydf.drop(['scores','compound'],axis = 1)

<h5>Average Word length</h5>

In [7]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

mydf['avg_word_length'] = mydf['review_text'].apply(lambda x:avg_word(x))

<h5>No. of Stop Words</h5>

In [8]:
stop = stopwords.words('english')
mydf['stopwords_count'] = mydf['review_text'].apply(lambda x:len([x for x in x.split() if x in stop]))

<h5>No. of Uppercase Words</h5>

In [9]:
mydf['upper'] = mydf['review_text'].apply(lambda x:len([x for x in x.split() if x.isupper()]))

<h5>Data Preprocessing</h5>

In [10]:
mydf = dataPreprocessing(mydf)
def categorize(row):
    if row['review_helpful_vote'] == 0:
        return 0
    elif row['review_helpful_vote'] > 0 and row['review_helpful_vote'] <= 5:
        return 1
    elif row['review_helpful_vote'] > 5:
        return 2


mydf['helpfulness'] = mydf.apply(lambda row :categorize(row), axis = 1)

In [11]:
mydf['review_helpful_vote'].value_counts()
mydf.head()
# tempdf = mydf['review_helpful_vote', '']

Unnamed: 0,review_rating,review_helpful_vote,rating_diff,vader_score,avg_word_length,stopwords_count,upper,review,lemmatized,helpfulness
0,3,5087,1.102387,1,5.0,0,1,note,[note],2
1,1,2822,3.102387,0,4.952381,16,2,very bad experience with this iphone xr phone ...,"[bad, experience, iphone, xr, phone, back, cam...",2
2,5,1798,0.897613,1,5.142857,3,0,amazing phone with amazing camera coming from ...,"[amazing, phone, amazing, camera, coming, ipho...",2
3,1,1366,3.102387,1,4.8,56,4,so i got the iphone xr just today the product...,"[got, iphone, xr, today, product, look, amazin...",2
4,5,536,0.897613,1,4.063291,30,4,i have been an android user all my life until ...,"[android, user, life, decided, try, iphone, xr...",2


In [25]:
X = mydf[['lemmatized','vader_score','rating_diff','stopwords_count','avg_word_length']]
y = mydf['helpfulness']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [27]:
# X_train_merged = X_train.merge(pd.DataFrame(X_train_vect_avg, index = X_train.index), right_index = True, left_index = True)
# X_test_merged = X_test.merge(pd.DataFrame(X_test_vect_avg, index = X_test.index), right_index = True, left_index = True)
# X_train_final = X_train_merged.drop('lemmatized', axis =1)
# X_test_final = X_test_merged.drop('lemmatized', axis=1)

In [28]:
model = Word2Vec(X_train['lemmatized'], vector_size=100, window=5, min_count=2,sg=1)
words = set(model.wv.index_to_key)
X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_train.lemmatized])
X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_test.lemmatized])

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100,dtype=float))

X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100,dtype=float))

# X_train_merged = X.merge(pd.DataFrame(X_train_vect_avg, index = X.index), right_index = True, left_index = True)
# X_train_final = X_train_merged.drop('lemmatized', axis =1)
X_train_merged = X_train.merge(pd.DataFrame(X_train_vect_avg, index = X_train.index), right_index = True, left_index = True)
X_test_merged = X_test.merge(pd.DataFrame(X_test_vect_avg, index = X_test.index), right_index = True, left_index = True)
X_train_final = X_train_merged.drop('lemmatized', axis =1)
X_test_final = X_test_merged.drop('lemmatized', axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [29]:
X_train_final

Unnamed: 0,vader_score,rating_diff,stopwords_count,avg_word_length,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
943,1,0.897613,3,5.900000,-0.127899,0.257563,-0.013530,-0.093427,0.072138,-0.250549,...,0.255343,0.125192,0.085901,0.016735,0.310131,0.147246,-0.001028,-0.093880,0.054205,-0.045240
5087,1,0.897613,2,3.303030,-0.144607,0.276041,0.004978,-0.060245,0.070993,-0.253433,...,0.254593,0.089863,0.043169,0.015506,0.288576,0.159818,-0.021959,-0.126666,0.053404,-0.025929
6407,1,0.897613,7,4.720000,-0.132909,0.271432,-0.013383,-0.093031,0.070974,-0.253801,...,0.260344,0.108958,0.090662,0.015894,0.306795,0.141915,-0.015620,-0.090366,0.044041,-0.039113
5920,1,0.897613,21,4.510204,-0.143999,0.257285,0.002420,-0.057865,0.064854,-0.239838,...,0.246970,0.063350,0.034148,0.008477,0.259435,0.151857,-0.019188,-0.132066,0.048002,-0.027288
2610,1,0.897613,0,4.000000,-0.184483,0.261519,0.022993,-0.044318,0.072306,-0.247223,...,0.268914,0.070894,0.042843,0.023537,0.267106,0.180839,-0.025567,-0.161607,0.043461,-0.022725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,0,0.897613,3,3.700000,-0.104514,0.245758,-0.052649,-0.154827,0.078687,-0.253674,...,0.270211,0.179193,0.203938,0.051176,0.378238,0.119881,0.005443,-0.031759,0.027508,-0.064530
6223,1,0.897613,5,4.750000,-0.151888,0.262793,0.015330,-0.031281,0.070935,-0.242325,...,0.251608,0.058651,-0.026401,0.001459,0.240862,0.158772,-0.008887,-0.161411,0.067566,-0.017373
5321,1,0.897613,9,4.714286,-0.133241,0.270361,-0.023627,-0.094761,0.078375,-0.253885,...,0.255575,0.126613,0.112680,0.021104,0.315636,0.141618,-0.014338,-0.077782,0.033263,-0.044082
5266,1,0.897613,3,3.571429,-0.154497,0.308364,0.060991,-0.011783,0.080398,-0.260235,...,0.259307,0.059446,-0.097181,0.001696,0.235433,0.169974,-0.000322,-0.182414,0.094649,-0.032204


<h4>GridSearch CV</h4>

In [30]:
clf1 = SVC(probability=True, random_state=42)
clf2 = GradientBoostingClassifier(random_state=42)

param1 = {}
param1['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
param1['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]
param1['classifier'] = [clf1]

param2 = {}
param2['classifier__n_estimators'] = [10, 50, 100, 250]
param2['classifier__max_depth'] = [5, 10, 20]
param2['classifier'] = [clf2]

pipeline = Pipeline([('classifier', clf1)])
params = [param1, param2]

In [31]:
gs = GridSearchCV(pipeline, params, cv=3, n_jobs=-1, scoring='accuracy').fit(X_train_final.values,y_train)

In [32]:
gs.best_params_

{'classifier': SVC(C=10, probability=True, random_state=42),
 'classifier__C': 10,
 'classifier__class_weight': None}

In [33]:
gs.best_score_

0.6992843659721945

<h5>Stacking Ensemble Model</h5>

In [34]:
#Stacking Models used
level0 = list()
level0.append(('lr', LogisticRegression()))
level0.append(('svm', SVC()))
level0.append(('xgb',XGBClassifier()))
level0.append(('gdb',GradientBoostingClassifier()))
level1 = LogisticRegression()

# define the stacking ensemble
model = StackingClassifier(estimators=level0, final_estimator=level1,cv=5)

In [35]:
scores = cross_val_score(model,X_train_final.values,y_train,n_jobs=-1,scoring='accuracy')

In [36]:
scores.mean()

0.6912047160779966