<h3>Libraries</h3>

In [124]:
from dataCleaning import *
from dataPreprocessing import *

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline   
from sklearn.ensemble import StackingClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [125]:
df = dataCleaning('./CSV/iphone_dataset.csv')
mydf = df.copy(deep = True)

In [126]:
mydf.drop(mydf[(mydf['review_helpful_vote'] == 0) & (mydf['review_text'].apply(lambda x:len(x.split())) < 10)].index,inplace = True)

In [127]:
avg_rating = mydf['review_rating'].mean()
mydf['rating_diff'] = abs(mydf['review_rating'] - avg_rating)

<h3>VADER</h3>

In [128]:
vds = SentimentIntensityAnalyzer()

In [129]:
mydf['scores'] = mydf['review_text'].apply(lambda review:vds.polarity_scores(review))
mydf['compound'] = mydf['scores'].apply(lambda x:x['compound'])
mydf['vader_score'] = mydf['compound'].apply(lambda x:1 if x >= 0  else 0)
mydf = mydf.drop(['scores','compound'],axis = 1)

<h5>Average Word length</h5>

In [130]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

mydf['avg_word_length'] = mydf['review_text'].apply(lambda x:avg_word(x))

<h5>No. of Stop Words</h5>

In [131]:
stop = stopwords.words('english')
mydf['stopwords_count'] = mydf['review_text'].apply(lambda x:len([x for x in x.split() if x in stop]))

<h5>No. of Uppercase Words</h5>

In [132]:
mydf['upper'] = mydf['review_text'].apply(lambda x:len([x for x in x.split() if x.isupper()]))

<h5>Data Preprocessing</h5>

In [133]:
mydf = dataPreprocessing(mydf)
def categorize(row):
    if row['review_helpful_vote'] == 0:
        return 0
    elif row['review_helpful_vote'] > 0 and row['review_helpful_vote'] <= 5:
        return 1
    elif row['review_helpful_vote'] > 5:
        return 2


mydf['helpfulness'] = mydf.apply(lambda row :categorize(row), axis = 1)

In [134]:
mydf['review_helpful_vote'].value_counts()
mydf.head()
# tempdf = mydf['review_helpful_vote', '']

Unnamed: 0,review_rating,review_helpful_vote,rating_diff,vader_score,avg_word_length,stopwords_count,upper,review,lemmatized,helpfulness
0,3,5087,1.102387,1,5.0,0,1,note,[note],2
1,1,2822,3.102387,0,4.952381,16,2,very bad experience with this iphone xr phone ...,"[bad, experience, iphone, xr, phone, back, cam...",2
2,5,1798,0.897613,1,5.142857,3,0,amazing phone with amazing camera coming from ...,"[amazing, phone, amazing, camera, coming, ipho...",2
3,1,1366,3.102387,1,4.8,56,4,so i got the iphone xr just today the product...,"[got, iphone, xr, today, product, look, amazin...",2
4,5,536,0.897613,1,4.063291,30,4,i have been an android user all my life until ...,"[android, user, life, decided, try, iphone, xr...",2


In [135]:
X = mydf[['lemmatized','vader_score','rating_diff','stopwords_count','avg_word_length']]
X1 = mydf[['lemmatized','vader_score']]
X2 = mydf[['vader_score','rating_diff','stopwords_count','avg_word_length']]

y = mydf['helpfulness']
y1 = mydf['helpfulness']
y2 = mydf['helpfulness']


In [136]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)
X1_train,X1_test,y1_train, y1_test = train_test_split(X1,y1,test_size = 0.3)
X2_train,X2_test,y2_train,y2_test = train_test_split(X2,y2,test_size = 0.3)

In [137]:
# X_train_merged = X_train.merge(pd.DataFrame(X_train_vect_avg, index = X_train.index), right_index = True, left_index = True)
# X_test_merged = X_test.merge(pd.DataFrame(X_test_vect_avg, index = X_test.index), right_index = True, left_index = True)
# X_train_final = X_train_merged.drop('lemmatized', axis =1)
# X_test_final = X_test_merged.drop('lemmatized', axis=1)

In [None]:
model = Word2Vec(X_train['lemmatized'], vector_size=100, window=5, min_count=2,sg=1)
words = set(model.wv.index_to_key)
X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_train.lemmatized])
X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_test.lemmatized])

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100,dtype=float))

X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100,dtype=float))

# X_train_merged = X.merge(pd.DataFrame(X_train_vect_avg, index = X.index), right_index = True, left_index = True)
# X_train_final = X_train_merged.drop('lemmatized', axis =1)
X_train_merged = X_train.merge(pd.DataFrame(X_train_vect_avg, index = X_train.index), right_index = True, left_index = True)
X_test_merged = X_test.merge(pd.DataFrame(X_test_vect_avg, index = X_test.index), right_index = True, left_index = True)
X_train_final = X_train_merged.drop('lemmatized', axis =1)
X_test_final = X_test_merged.drop('lemmatized', axis=1)

In [None]:
model1 = Word2Vec(X1_train['lemmatized'], vector_size=100, window=5, min_count=2,sg=1)
words1 = set(model.wv.index_to_key)
X1_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X1_train.lemmatized])
X1_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X1_test.lemmatized])

X1_train_vect_avg = []
for v in X1_train_vect:
    if v.size:
        X1_train_vect_avg.append(v.mean(axis=0))
    else:
        X1_train_vect_avg.append(np.zeros(100,dtype=float))

X1_test_vect_avg = []
for v in X1_test_vect:
    if v.size:
        X1_test_vect_avg.append(v.mean(axis=0))
    else:
        X1_test_vect_avg.append(np.zeros(100,dtype=float))

# X_train_merged = X.merge(pd.DataFrame(X_train_vect_avg, index = X.index), right_index = True, left_index = True)
# X_train_final = X_train_merged.drop('lemmatized', axis =1)
X1_train_merged = X1_train.merge(pd.DataFrame(X1_train_vect_avg, index = X1_train.index), right_index = True, left_index = True)
X1_test_merged = X1_test.merge(pd.DataFrame(X1_test_vect_avg, index = X1_test.index), right_index = True, left_index = True)
X1_train_final = X1_train_merged.drop('lemmatized', axis =1)
X1_test_final = X1_test_merged.drop('lemmatized', axis=1)

<h4>GridSearch CV</h4>

In [140]:
# clf1 = SVC(probability=True, random_state=42)
# clf2 = GradientBoostingClassifier(random_state=42)

# param1 = {}
# param1['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
# param1['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]
# param1['classifier'] = [clf1]

# param2 = {}
# param2['classifier__n_estimators'] = [10, 50, 100, 250]
# param2['classifier__max_depth'] = [5, 10, 20]
# param2['classifier'] = [clf2]

# pipeline = Pipeline([('classifier', clf1)])
# params = [param1, param2]

In [141]:
# gs = GridSearchCV(pipeline, params, cv=3, n_jobs=-1, scoring='accuracy').fit(X_train_final.values,y_train)
# gs.best_params_, gs.best_score_

Logistic Regression

In [None]:
lr = LogisticRegression(random_state =42)
lr_model = lr.fit(X_train_final.values,y_train)
lr_pred = lr_model.predict(X_test_final.values)
print(accuracy_score(y_test,lr_pred))

In [143]:
lr = LogisticRegression(random_state =42)
lr1_model = lr.fit(X1_train_final.values,y1_train)
lr1_pred = lr1_model.predict(X1_test_final.values)
print(accuracy_score(y1_test,lr1_pred))

0.602510460251046


In [144]:
lr = LogisticRegression(random_state =42)
lr2_model = lr.fit(X2_train.values,y2_train)
lr2_pred = lr2_model.predict(X2_test.values)
print(accuracy_score(y2_test,lr2_pred))

0.6244769874476988


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Support Vector Machines

In [145]:
svc = SVC(random_state = 42)
svc_model = svc.fit(X_train_final.values,y_train)
svc_pred = svc_model.predict(X_test_final.values)
print(accuracy_score(y_test,svc_pred))

0.6882845188284519


In [146]:
svc = SVC(random_state = 42)
svc1_model = svc.fit(X1_train_final.values,y1_train)
svc1_pred = svc1_model.predict(X1_test_final.values)
print(accuracy_score(y1_test,svc1_pred))

0.5930962343096234


In [147]:
svc = SVC(random_state = 42)
svc2_model = svc.fit(X2_train.values,y2_train)
svc2_pred = svc2_model.predict(X2_test.values)
print(accuracy_score(y2_test,svc2_pred))

0.6914225941422594


KNN

In [148]:
knn = KNeighborsClassifier()
knn_model = knn.fit(X_train_final.values,y_train)
knn_pred = knn_model.predict(X_test_final.values)
print(accuracy_score(y_test,knn_pred))

0.643305439330544


In [149]:
knn = KNeighborsClassifier()
knn1_model = knn.fit(X1_train_final.values,y1_train)
knn1_pred = knn1_model.predict(X1_test_final.values)
print(accuracy_score(y1_test,knn1_pred))

0.5889121338912134


In [150]:
# knn = KNeighborsClassifier()
# knn2_model = knn.fit(X2_train.values,y2_train)
# knn2_pred = knn_model.predict(X2_test.values)
# print(accuracy_score(y2_test,knn2_pred))

Decision Tree Classifier

In [151]:
dtc = DecisionTreeClassifier(random_state = 42)
dtc_model = dtc.fit(X_train_final.values,y_train)
dtc_pred = dtc_model.predict(X_test_final.values)
print(accuracy_score(y_test,dtc_pred))

0.5993723849372385


In [152]:
dtc1 = DecisionTreeClassifier(random_state = 42)
dtc1_model = dtc.fit(X1_train_final.values,y1_train)
dtc1_pred = dtc_model.predict(X1_test_final.values)
print(accuracy_score(y1_test,dtc1_pred))

0.5209205020920502


In [153]:
dtc2 = DecisionTreeClassifier(random_state = 42)
dtc2_model = dtc.fit(X2_train.values,y2_train)
dtc2_pred = dtc2_model.predict(X2_test.values)
print(accuracy_score(y2_test,dtc2_pred))

0.6119246861924686


Stacking 

In [154]:
#Stacking Models used
level0 = list()
level0.append(('lr', LogisticRegression()))
level0.append(('svm', SVC()))
level0.append(('knn',KNeighborsClassifier()))
level0.append(('dtc',DecisionTreeClassifier()))
level1 = SVC()

# define the stacking ensemble

In [155]:
model = StackingClassifier(estimators=level0, final_estimator=level1,cv=5)
scores = cross_val_score(model,X_train_final.values,y_train,n_jobs=-1,scoring='accuracy')
scores.mean()

0.6768418400765859

In [156]:
model1 = StackingClassifier(estimators=level0, final_estimator=level1,cv=5)
scores1 = cross_val_score(model,X1_train_final.values,y1_train,n_jobs=-1,scoring='accuracy')
scores1.mean()

0.6009825162493072

In [157]:
model2 = StackingClassifier(estimators=level0, final_estimator=level1,cv=5)
scores2 = cross_val_score(model,X2_train.values,y2_train,n_jobs=-1,scoring='accuracy')
scores2.mean()

0.6714586587393561