# Train a sentiment prediction model on Amazon food reviews
This dataset consists of reviews of fine foods from Amazon. The data span a period of more than 10 years, including all ~500,000 reviews up to October 2012. Each review includes product and user information, a rating, and the plain-text review. It also includes reviews from all other Amazon categories.

https://www.kaggle.com/snap/amazon-fine-food-reviews

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# scikit-learn 0.23 removed the vendored sklearn.externals.joblib module, so
# prefer the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#=====================================================#
# Build the train/test split from the Kaggle review data
#=====================================================#
df = pd.read_csv('Amazon_food.csv')  # raw review table

# A score of 3 is treated as neutral feedback, so those rows are excluded.
is_polar = df['Score'] != 3
amazon = df[is_polar]

X = amazon['Text']  # review text used as model input
# Collapse the remaining scores into a binary label:
# 1 and 2 -> 0 (negative), 4 and 5 -> 1 (positive).
y = amazon['Score'].map({1: 0, 2: 0, 4: 1, 5: 1})

# Keep 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)


#=====================================================#
# Fit the TF-IDF + logistic regression pipeline
#=====================================================#
# Vectorizer: converts raw review text into TF-IDF features, skipping English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# Classifier: logistic regression with the lbfgs solver and a fixed seed.
lr = LogisticRegression(solver='lbfgs', random_state=0)

# Chain both stages so that fitting and scoring accept raw text directly.
pipeline_steps = [('tfidf', tfidf), ('lr', lr)]
clf_pipeline = Pipeline(steps=pipeline_steps)
clf_pipeline.fit(X_train, y_train)

# Evaluate on the held-out split and report the score as accuracy.
test_accuracy = clf_pipeline.score(X_test, y_test)
print('Accuracy: {}'.format(test_accuracy))


#=====================================================#
# Persist the fitted pipeline to disk
#=====================================================#
filename = 'lr_model.sav'  # output path for the serialized pipeline
# The whole pipeline is dumped, so the vectorizer and classifier are stored together.
joblib.dump(value=clf_pipeline, filename=filename)


#=====================================================#
# Extract positive and negative sentiments
#=====================================================#
num = 20 #number of words to list for each polarity

# Read the fitted vectorizer back out of the pipeline so the vocabulary is
# guaranteed to match the coefficients of the classifier it was trained with.
vectorizer = clf_pipeline.named_steps['tfidf']

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    word = list(vectorizer.get_feature_names_out()) #get sentiment words/ features
else:
    word = vectorizer.get_feature_names()

# coef_ is indexed at row 0: the single coefficient row of this binary model,
# with one weight per vocabulary entry.
coef = clf_pipeline.named_steps['lr'].coef_.tolist()[0]
coeff_df = pd.DataFrame({'Word' : word, 'Coefficient' : coef})
coeff_df = coeff_df.sort_values(['Coefficient'], ascending=[False]) #sort descending by coefficient.

print('\n'+'Top {} positive sentiments'.format(num))
print(coeff_df.head(num).to_string(index=False))
# tail() keeps the descending sort order, so the most negative word is printed last.
print('\n'+'Top {} negative sentiments'.format(num))
print(coeff_df.tail(num).to_string(index=False))



Accuracy: 0.935205347888516

Top 20 positive sentiments
       Word  Coefficient
      great    14.056236
  delicious    12.569577
       best    11.917344
    perfect    10.765441
  excellent     9.978509
      loves     9.678879
     highly     8.902305
       love     8.513440
  wonderful     8.197428
    amazing     7.540611
 pleasantly     7.471089
       good     7.316470
     hooked     7.281038
    awesome     7.221646
       nice     7.099952
   favorite     6.921111
    pleased     6.900758
      yummy     6.697903
     smooth     6.697484
       glad     6.421410

Top 20 negative sentiments
           Word  Coefficient
          waste    -5.656552
           yuck    -5.713675
   unacceptable    -5.917392
    undrinkable    -5.942775
          worse    -6.134979
          stale    -6.234432
     disgusting    -6.409609
      tasteless    -6.449217
           weak    -6.489440
          bland    -6.605334
          threw    -6.798688
         return    -6.892879
  unfortunatel