<h3>Libraries</h3>

In [1]:
from dataCleaning import *
from dataPreprocessing import *

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline   

from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from nltk.probability import FreqDist
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
df = dataCleaning('./CSV/iphone_dataset.csv')
mydf = df.copy(deep = True)

In [3]:
mydf.drop(mydf[(mydf['review_helpful_vote'] == 0) & (mydf['review_text'].apply(lambda x:len(x.split())) < 10)].index,inplace = True)

In [4]:
avg_rating = mydf['review_rating'].mean()
mydf['rating_diff'] = abs(mydf['review_rating'] - avg_rating)

<h3>VADER</h3>

In [5]:
vds = SentimentIntensityAnalyzer()

In [6]:
mydf['scores'] = mydf['review_text'].apply(lambda review:vds.polarity_scores(review))
mydf['compound'] = mydf['scores'].apply(lambda x:x['compound'])
mydf['vader_score'] = mydf['compound'].apply(lambda x:1 if x >= 0  else 0)
mydf = mydf.drop(['scores','compound'],axis = 1)

<h5>Average Word length</h5>

In [7]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

mydf['avg_word_length'] = mydf['review_text'].apply(lambda x:avg_word(x))

<h5>No. of Stop Words</h5>

In [8]:
stop = stopwords.words('english')
mydf['stopwords_count'] = mydf['review_text'].apply(lambda x:len([x for x in x.split() if x in stop]))

<h5>No. of Uppercase Words</h5>

In [9]:
mydf['upper'] = mydf['review_text'].apply(lambda x:len([x for x in x.split() if x.isupper()]))

<h5>Data Preprocessing</h5>

In [10]:
mydf = dataPreprocessing(mydf)
def categorize(row):
    if row['review_helpful_vote'] == 0:
        return 0
    elif row['review_helpful_vote'] > 0 and row['review_helpful_vote'] <= 5:
        return 1
    elif row['review_helpful_vote'] > 5:
        return 2


mydf['helpfulness'] = mydf.apply(lambda row :categorize(row), axis = 1)

In [11]:
mydf['review_helpful_vote'].value_counts()
mydf.head()
# tempdf = mydf['review_helpful_vote', '']

Unnamed: 0,review_rating,review_helpful_vote,rating_diff,vader_score,avg_word_length,stopwords_count,upper,review,lemmatized,helpfulness
0,3,5087,1.102387,1,5.0,0,1,note,[note],2
1,1,2822,3.102387,0,4.952381,16,2,very bad experience with this iphone xr phone ...,"[bad, experience, iphone, xr, phone, back, cam...",2
2,5,1798,0.897613,1,5.142857,3,0,amazing phone with amazing camera coming from ...,"[amazing, phone, amazing, camera, coming, ipho...",2
3,1,1366,3.102387,1,4.8,56,4,so i got the iphone xr just today the product...,"[got, iphone, xr, today, product, look, amazin...",2
4,5,536,0.897613,1,4.063291,30,4,i have been an android user all my life until ...,"[android, user, life, decided, try, iphone, xr...",2


In [12]:
X = mydf[['lemmatized','vader_score','rating_diff','stopwords_count','avg_word_length']]
y = mydf['helpfulness']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [14]:
# X_train_merged = X_train.merge(pd.DataFrame(X_train_vect_avg, index = X_train.index), right_index = True, left_index = True)
# X_test_merged = X_test.merge(pd.DataFrame(X_test_vect_avg, index = X_test.index), right_index = True, left_index = True)
# X_train_final = X_train_merged.drop('lemmatized', axis =1)
# X_test_final = X_test_merged.drop('lemmatized', axis=1)

In [None]:
model = Word2Vec(X_train['lemmatized'], vector_size=100, window=5, min_count=2,sg=1)
words = set(model.wv.index_to_key)
X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_train.lemmatized])
X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_test.lemmatized])

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100,dtype=float))

X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100,dtype=float))

# X_train_merged = X.merge(pd.DataFrame(X_train_vect_avg, index = X.index), right_index = True, left_index = True)
# X_train_final = X_train_merged.drop('lemmatized', axis =1)
X_train_merged = X_train.merge(pd.DataFrame(X_train_vect_avg, index = X_train.index), right_index = True, left_index = True)
X_test_merged = X_test.merge(pd.DataFrame(X_test_vect_avg, index = X_test.index), right_index = True, left_index = True)
X_train_final = X_train_merged.drop('lemmatized', axis =1)
X_test_final = X_test_merged.drop('lemmatized', axis=1)

In [34]:
X_train_final

Unnamed: 0,vader_score,rating_diff,stopwords_count,avg_word_length,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
6814,1,0.897613,0,5.666667,-0.098139,0.152825,0.010380,-0.003528,0.130861,-0.346093,...,0.218150,0.184435,0.020700,0.032897,0.327030,0.207096,0.119866,-0.229111,0.032852,-0.151260
6312,1,0.897613,3,4.750000,-0.097399,0.153031,0.019838,-0.010965,0.086028,-0.309920,...,0.214124,0.120933,0.023088,0.019250,0.309029,0.206420,0.093781,-0.250889,0.042317,-0.146586
6161,1,0.897613,12,4.933333,-0.123946,0.191784,0.019944,0.002134,0.084514,-0.337393,...,0.219150,0.083186,0.009489,-0.003069,0.294291,0.196719,0.093461,-0.239879,0.052548,-0.115513
608,1,1.102387,6,6.083333,-0.123100,0.207674,0.034953,0.016403,0.069708,-0.320826,...,0.217045,0.047380,0.008706,-0.020137,0.283210,0.192724,0.082889,-0.228280,0.055099,-0.096364
1357,0,3.102387,0,7.333333,-0.119863,0.242909,0.054113,0.023635,0.038049,-0.307935,...,0.225634,0.002670,-0.012974,-0.076026,0.280811,0.148998,0.098216,-0.206389,0.084380,-0.059611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3253,1,0.897613,5,3.700000,-0.094996,0.168139,0.020153,0.005222,0.094476,-0.321732,...,0.219121,0.143297,0.020190,0.017934,0.345088,0.205627,0.109687,-0.224914,0.063924,-0.134672
6384,1,0.897613,9,4.900000,-0.132182,0.194317,0.029936,0.017549,0.083538,-0.318054,...,0.194744,0.049528,-0.001147,-0.022238,0.267147,0.175728,0.094429,-0.215122,0.070633,-0.089100
6445,1,0.897613,5,5.312500,-0.100542,0.218453,0.050636,0.017553,0.047535,-0.298541,...,0.224091,0.035951,-0.007329,-0.063152,0.288743,0.156948,0.104384,-0.201595,0.084097,-0.074337
1131,1,0.102387,9,4.818182,-0.119823,0.196505,0.033548,0.028405,0.074375,-0.323796,...,0.241129,0.062222,0.012872,0.002089,0.299427,0.215115,0.064501,-0.255742,0.033074,-0.116380


In [35]:
y_train

6814    1
6312    1
6161    1
608     1
1357    1
       ..
3253    0
6384    0
6445    1
1131    0
650     0
Name: helpfulness, Length: 2228, dtype: int64

<h4>GridSearch CV</h4>

In [17]:
clf1 = SVC(probability=True, random_state=42)
clf2 = GradientBoostingClassifier(random_state=42)

param1 = {}
param1['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
param1['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]
param1['classifier'] = [clf1]

param2 = {}
param2['classifier__n_estimators'] = [10, 50, 100, 250]
param2['classifier__max_depth'] = [5, 10, 20]
param2['classifier'] = [clf2]

pipeline = Pipeline([('classifier', clf1)])
params = [param1, param2]

In [40]:
gs = GridSearchCV(pipeline, params, cv=3, n_jobs=-1, scoring='f1_micro').fit(X_train_final.values,y_train)

In [41]:
gs.best_params_

{'classifier': SVC(C=10, probability=True, random_state=42),
 'classifier__C': 10,
 'classifier__class_weight': None}

In [42]:
gs.best_score_

0.6858018353993366