In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from pylab import *
import warnings
warnings.filterwarnings('ignore')

#read the json data
review_data=pd.read_json('Musical_Instruments_5.json.zip',lines=True)
review_data[['reviewText','overall']].head()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Unnamed: 0,reviewText,overall
0,"Not much to write about here, but it does exac...",5
1,The product does exactly as it should and is q...,5
2,The primary job of this device is to block the...,5
3,Nice windscreen protects my MXL mic and preven...,5
4,This pop filter is great. It looks and perform...,5


In [None]:
lemmatizer=WordNetLemmatizer()
review_data['cleaned_review_text']=review_data['reviewText'].apply(lambda x :''.join([lemmatizer.lemmatize(word.lower()) for word in word_tokenize(re.sub(r'([^\s\w]|_)+','',str(x)))]))

#print the new cleaned data
review_data[['cleaned_review_text','reviewText','overall']].head()



#create the TF-IDF frequency
tfidf_model=TfidfVectorizer(max_features=500)
tfidf_df=pd.DataFrame(tfidf_model.fit_transform(review_data['cleaned_review_text']).todense())
tfidf_df.columns=sorted(tfidf_model.vocabulary_)
tfidf_df.head()

#set the target data
review_data['target']=review_data['overall'].apply(lambda x:0 if x<=4 else 1)
review_data['target'].value_counts()



1    6938
0    3323
Name: target, dtype: int64

In [None]:
#implement Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()
logreg.fit(tfidf_df,review_data['target'])
predicted_labels=logreg.predict(tfidf_df)
logreg.predict_proba(tfidf_df)[:,1]

#compare the result of classification using crosstab
review_data['predicted_labels']=predicted_labels
pd.crosstab(review_data['target'],review_data['predicted_labels'])



predicted_labels,1
target,Unnamed: 1_level_1
0,3323
1,6938


In [None]:
#implement NaiveBayes
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
nb.fit(tfidf_df,review_data['target'])
predicted_labels=nb.predict(tfidf_df)
nb.predict_proba(tfidf_df)[:,1]

#compare the result of classification using crosstab
review_data['predicted_labels_nb']=predicted_labels
pd.crosstab(review_data['target'],review_data['predicted_labels_nb'])



predicted_labels_nb,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3323,0
1,6585,353


In [None]:
#implement KNN and compare result
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(tfidf_df,review_data['target'])
review_data['predicted_labels_knn']=knn.predict(tfidf_df)
pd.crosstab(review_data['target'],review_data['predicted_labels_knn'])


predicted_labels_knn,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,84,3239
1,40,6898


In [None]:
from sklearn.linear_model import LinearRegression
linreg=LinearRegression()
linreg.fit(tfidf_df,review_data['overall'])
linreg.coef_


array([ 0.51081497,  0.51081497,  0.51081497,  0.51081497,  0.51081497,
        0.51081497,  0.51081497,  0.51081497,  0.51081497, -1.48918503,
        0.51081497,  0.51081497,  0.51081497, -1.48918503,  0.51081497,
        0.51081497,  0.51081497,  0.51081497,  0.51081497,  0.51081497,
       -2.48918503,  0.51081497,  0.51081497, -3.48918503, -3.48918503,
       -3.48918503, -0.48918503, -1.48918503, -1.48918503,  0.51081497,
        0.51081497,  0.51081497,  0.51081497,  0.51081497, -0.48918503,
       -3.48918503,  0.51081497,  0.51081497,  0.51081497,  0.51081497,
        0.51081497,  0.51081497,  0.51081497,  0.51081497,  0.51081497,
        0.51081497, -0.48918503,  0.51081497,  0.51081497, -1.48918503,
        0.51081497,  0.51081497,  0.51081497,  0.51081497, -3.48918503,
        0.51081497,  0.51081497,  0.51081497,  0.51081497,  0.51081497,
        0.51081497,  0.51081497,  0.51081497,  0.51081497,  0.51081497,
        0.51081497,  0.51081497,  0.51081497,  0.51081497, -0.48

In [None]:
#find interception
linreg.intercept_

#prediction
linreg.predict(tfidf_df)

#predict overall score
review_data['predicted_score_from_linear_regression']=linreg.predict(tfidf_df)
review_data[['overall','predicted_score_from_linear_regression']].head(10)




Unnamed: 0,overall,predicted_score_from_linear_regression
0,5,4.489185
1,5,4.489185
2,5,4.489185
3,5,4.489185
4,5,4.489185
5,5,5.0
6,5,4.489185
7,3,4.489185
8,5,5.0
9,5,4.489185


In [None]:
#implement decision tree classifier
from sklearn import tree
dtc=tree.DecisionTreeClassifier()
dtc=dtc.fit(tfidf_df,review_data['target'])
review_data['predicted_labels_dtc']=dtc.predict(tfidf_df)

pd.crosstab(review_data['target'],review_data['predicted_labels_dtc'])



predicted_labels_dtc,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,153,3170
1,0,6938


In [None]:
#implement decision tree regression
from sklearn import tree
dtr=tree.DecisionTreeRegressor()
dtr=dtr.fit(tfidf_df,review_data['overall'])
review_data['predicted_values_dtr']=dtr.predict(tfidf_df)
review_data[['predicted_values_dtr','overall']].head(10)



Unnamed: 0,predicted_values_dtr,overall
0,4.489185,5
1,4.489185,5
2,4.489185,5
3,4.489185,5
4,4.489185,5
5,5.0,5
6,4.489185,5
7,4.489185,3
8,5.0,5
9,4.489185,5


In [None]:
#generic function for regression models
def reg_model(model_type,X_train,y):
  model=model_type.fit(X_train,y)
  predicted_values=model.predict(tfidf_df)
  return predicted_values




In [None]:


#implement Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
rfg=RandomForestRegressor(n_estimators=20,max_depth=4,max_features='sqrt',random_state=1)
review_data['predicted_values_rfg']=reg_model(rfg,tfidf_df,review_data['overall'])
review_data[['overall','predicted_values_rfg']].head(10)




Unnamed: 0,overall,predicted_values_rfg
0,5,4.492019
1,5,4.492019
2,5,4.492019
3,5,4.492019
4,5,4.492019
5,5,4.492019
6,5,4.492019
7,3,4.492019
8,5,4.492019
9,5,4.492019


In [None]:
#implement GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
gbr=GradientBoostingRegressor(n_estimators=20,max_depth=4,max_features='sqrt',random_state=1)
review_data['predicted_values_gbr']=reg_model(gbr,tfidf_df,review_data['overall'])
review_data[['overall','predicted_values_rfg']].head(10)




Unnamed: 0,overall,predicted_values_rfg
0,5,4.492019
1,5,4.492019
2,5,4.492019
3,5,4.492019
4,5,4.492019
5,5,4.492019
6,5,4.492019
7,3,4.492019
8,5,4.492019
9,5,4.492019


In [None]:

#implement XGBRegressor
from xgboost import XGBRegressor
xgbr=XGBRegressor(n_estimators=20,learning_rate=0.03,max_depth=5,subsample=0.6,colsample_bytree=0.6,reg_alpha=10,seed=42)
review_data['predicted_values_xgbr']=reg_model(xgbr,tfidf_df,review_data['overall'])
review_data[['overall','predicted_values_xgbr']].head(10)





Unnamed: 0,overall,predicted_values_xgbr
0,5,2.321243
1,5,2.321243
2,5,2.321243
3,5,2.321243
4,5,2.321243
5,5,2.321243
6,5,2.321243
7,3,2.321243
8,5,2.321243
9,5,2.321243


In [None]:
#generic function for classier models
def clf_model(model_type,X_train,y):
  model=model_type.fit(X_train,y)
  predicted_labels=model.predict(tfidf_df)
  return predicted_labels

#implement random forest classifier
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=20,max_depth=4,max_features='sqrt',random_state=1)
review_data['predicted_labels_rfc']=clf_model(rfc,tfidf_df,review_data['target'])
pd.crosstab(review_data['target'],review_data['predicted_labels_rfc'])



predicted_labels_rfc,1
target,Unnamed: 1_level_1
0,3323
1,6938


In [None]:
#implement GBM classifier
from sklearn.ensemble import GradientBoostingClassifier
gbc=GradientBoostingClassifier(n_estimators=2,max_depth=3,max_features='sqrt',random_state=1)
review_data['predicted_labels_gbc']=clf_model(gbc,tfidf_df,review_data['target'])
pd.crosstab(review_data['target'],review_data['predicted_labels_gbc'])


predicted_labels_gbc,1
target,Unnamed: 1_level_1
0,3323
1,6938


In [None]:
#implement XGBoost classifier
from xgboost import XGBClassifier
xgb_clf=XGBClassifier(n_estimators=20,learning_rate=0.03,max_depth=5,subsample=0.6,colsample_bytree=0.6,reg_alpha=10,seed=42)
review_data['predicted_labels_xgbc']=clf_model(xgb_clf,tfidf_df,review_data['target'])
pd.crosstab(review_data['target'],review_data['predicted_labels_xgbc'])

predicted_labels_xgbc,1
target,Unnamed: 1_level_1
0,3323
1,6938
