# Train a sentiment prediction model on Amazon food reviews
This dataset consists of reviews of fine foods from Amazon. The data span a period of more than 10 years, including all ~500,000 reviews up to October 2012. Reviews include product and user information, ratings, and a plain text review. It also includes reviews from all other Amazon categories.

https://www.kaggle.com/snap/amazon-fine-food-reviews

### Unigram + Bigram

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# scikit-learn 0.23 removed its vendored copy of joblib, so prefer the
# standalone package and fall back to the vendored module on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#=====================================================#
# Build the train / test splits from the Kaggle CSV
#=====================================================#
df = pd.read_csv('Amazon_food.csv')

# Three-star reviews are treated as neutral and left out, so only
# clearly negative (1-2) and clearly positive (4-5) ratings remain.
amazon = df.loc[df['Score'].ne(3)]

X = amazon['Text'] #review text, used as the model input
# Collapse the star rating into a binary label: 1-2 stars -> 0, 4-5 stars -> 1.
y = amazon['Score'].map({1: 0, 2: 0, 4: 1, 5: 1})

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


#=====================================================#
# Fit the TF-IDF + logistic regression pipeline
#=====================================================#
# Vectorizer: unigram and bigram TF-IDF features, English stop words removed.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
# Classifier: logistic regression with the lbfgs solver and a fixed seed.
lr = LogisticRegression(solver='lbfgs', random_state=0)

# Chain both steps so raw review text can be passed straight to fit/score.
clf_pipeline = Pipeline(steps=[('tfidf', tfidf), ('lr', lr)])
clf_pipeline.fit(X_train, y_train)

# Mean accuracy on the held-out reviews.
accuracy = clf_pipeline.score(X_test, y_test)
print('Accuracy: {}'.format(accuracy))


#=====================================================#
# Persist the fitted pipeline to disk
#=====================================================#
filename = 'lr_model.sav'
# Dump the whole pipeline so the vectorizer and classifier are stored together.
joblib.dump(value=clf_pipeline, filename=filename)


#=====================================================#
# Extract positive and negative sentiments
#=====================================================#
num = 20 #number of features to list for each polarity

# Vocabulary of the vectorizer, one entry per model feature.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() when it exists and fall back on old releases.
if hasattr(tfidf, 'get_feature_names_out'):
    word = tfidf.get_feature_names_out().tolist()
else:
    word = tfidf.get_feature_names()
coef = clf_pipeline.named_steps['lr'].coef_.tolist()[0] #first (only) row of coefficients, one value per feature
coeff_df = pd.DataFrame({'Word' : word, 'Coefficient' : coef})
coeff_df = coeff_df.sort_values(['Coefficient'], ascending=[False]) #largest coefficient first

print('\n'+'Top {} positive sentiments'.format(num))
print(coeff_df.head(num).to_string(index=False))
print('\n'+'Top {} negative sentiments'.format(num))
# tail() keeps the descending order, so the most negative term is printed last.
print(coeff_df.tail(num).to_string(index=False))



Accuracy: 0.9487177048962088

Top 20 positive sentiments
             Word  Coefficient
            great    21.086667
             best    17.959149
        delicious    17.431698
          perfect    14.991942
            loves    13.772703
             love    13.432053
        excellent    13.406393
        wonderful    11.630547
             good    11.433786
             nice    10.750654
         favorite    10.629300
          amazing     9.941800
          awesome     9.549840
          pleased     9.233244
             easy     9.184555
            happy     8.988801
           smooth     8.802296
            yummy     8.801215
            tasty     8.652572
 highly recommend     8.611649

Top 20 negative sentiments
           Word  Coefficient
          maybe    -7.921479
         hoping    -8.101693
          money    -8.208116
      tasteless    -8.289617
        thought    -8.531554
          worse    -8.570159
     disgusting    -9.004960
          bland    -9.366393
   

### Unigram result

In [None]:
# Saved console output for the unigram-only variant of the model
# (presumably the training cell re-run with ngram_range=(1, 1) -- confirm).
# Kept as a bare string literal for reference; nothing here is computed.
"""
Accuracy: 0.935205347888516

Top 20 positive sentiments
       Word  Coefficient
      great    14.056236
  delicious    12.569577
       best    11.917344
    perfect    10.765441
  excellent     9.978509
      loves     9.678879
     highly     8.902305
       love     8.513440
  wonderful     8.197428
    amazing     7.540611
 pleasantly     7.471089
       good     7.316470
     hooked     7.281038
    awesome     7.221646
       nice     7.099952
   favorite     6.921111
    pleased     6.900758
      yummy     6.697903
     smooth     6.697484
       glad     6.421410

Top 20 negative sentiments
           Word  Coefficient
          waste    -5.656552
           yuck    -5.713675
   unacceptable    -5.917392
    undrinkable    -5.942775
          worse    -6.134979
          stale    -6.234432
     disgusting    -6.409609
      tasteless    -6.449217
           weak    -6.489440
          bland    -6.605334
          threw    -6.798688
         return    -6.892879
  unfortunately    -7.706355
       horrible    -7.912724
 disappointment    -8.059012
          awful    -8.314880
   disappointed    -8.523823
       terrible    -9.373132
  disappointing    -9.852226
          worst   -11.512352
"""

### Bigram result

In [None]:
# Saved console output for the bigram-only variant of the model
# (presumably the training cell re-run with ngram_range=(2, 2) -- confirm).
# Kept as a bare string literal for reference; nothing here is computed.
"""
Accuracy: 0.9206279775206109

Top 20 positive sentiments
                 Word  Coefficient
     highly recommend    14.685871
           just right    10.431683
   highly recommended     9.097240
        great product     9.026870
 pleasantly surprised     8.575248
         tastes great     8.531111
         great flavor     7.523886
          really good     7.471172
             far best     7.212022
         best tasting     6.611659
           love stuff     6.536895
        great tasting     6.434235
          taste great     6.413230
            dogs love     6.292232
          great price     6.265570
          great taste     6.044865
            dog loves     6.037769
        thanks amazon     5.905563
        free shipping     5.898499
            just love     5.860198

Top 20 negative sentiments
                 Word  Coefficient
           throw away    -7.044931
         wasted money    -7.187337
          tastes like    -7.212991
            bad batch    -7.230827
      quality control    -7.340720
            way sweet    -7.386776
  really disappointed    -7.561356
          wanted like    -7.856835
           save money    -8.110463
           high hopes    -8.114120
              don buy    -8.188355
             did like    -8.414479
             ll stick    -8.453697
 disappointed product    -8.515903
              won buy    -8.700307
           threw away    -9.161024
         buyer beware    -9.472091
          tasted like   -10.302652
           won buying   -10.492836
          waste money   -16.873954
"""

### Trigram result

In [None]:
# Saved console output for the trigram-only variant of the model
# (presumably the training cell re-run with ngram_range=(3, 3) -- confirm).
# NOTE(review): the many "br" tokens look like leftover HTML line breaks in the
# review text -- confirm before reading meaning into those features.
"""
Accuracy: 0.878474368361496

Top 20 positive sentiments
                     Word  Coefficient
 highly recommend product     7.592842
              br br great     5.794580
             br br highly     5.404922
            goes long way     5.238101
           love love love     4.803285
      br highly recommend     4.463377
    br highly recommended     3.976873
        amazon best price     3.901483
         best gluten free     3.808341
          taste just like     3.767422
            br br overall     3.670306
         great cup coffee     3.616616
      great product great     3.604893
          just right size     3.577530
        hard time finding     3.528331
    amazon subscribe save     3.376714
              br br enjoy     3.290670
               br br love     3.290012
         tastes just like     3.232508
          ve tried brands     3.227960

Top 20 negative sentiments
                    Word  Coefficient
         didn taste good    -4.508626
       wish read reviews    -4.634678
 amazon customer service    -4.674480
             br br sorry    -4.713612
         threw rest away    -4.797813
      br br disappointed    -5.017063
        waste time money    -5.347787
         does taste like    -5.481824
          save money buy    -5.522929
  really looking forward    -5.586859
        doesn taste good    -5.753869
          did taste like    -5.819811
    complete waste money    -5.894788
         didn taste like    -5.903783
     ended throwing away    -6.316694
       total waste money    -7.423270
        doesn taste like    -7.428759
           got bad batch    -8.011932
      really wanted like    -9.301142
         don waste money   -15.864513
"""