<a href="https://colab.research.google.com/github/PearlSikka/language-ninja/blob/master/Sentiment_Analysis_using_LSTM_on_Amazon_food_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This project walks through classifying the sentiment of Amazon Fine Food reviews as positive or negative using a Long Short-Term Memory (LSTM) network.

In [None]:
import os
import pathlib

# Upload the API token.
def get_kaggle():
  """Return the imported ``kaggle`` module, asking for an API token upload if needed.

  The module is imported directly when possible. If that import raises
  ``OSError`` (presumably how the kaggle package reports a missing
  ``kaggle.json`` -- confirm against the installed version), the function
  falls back to Colab's upload widget and stores the uploaded token at
  ``~/.kaggle/kaggle.json`` with owner-only permissions before importing again.

  Returns:
    The ``kaggle`` module object.

  Raises:
    ValueError: If ``google.colab`` is not importable (no way to upload a
      token), or if the upload does not contain a non-empty ``kaggle.json``.
  """
  try:
    import kaggle
  except OSError:
    # Fall through to the token-upload path below.
    pass
  else:
    return kaggle

  credentials_path = pathlib.Path("~/.kaggle/kaggle.json").expanduser()
  credentials_path.parent.mkdir(exist_ok=True, parents=True)

  try:
    from google.colab import files
  except ImportError:
    raise ValueError("Could not find kaggle token.")

  # files.upload() maps each uploaded file name to its bytes.
  token_bytes = files.upload().get('kaggle.json', None)
  if not token_bytes:
    raise ValueError('Need a file named "kaggle.json"')

  credentials_path.write_bytes(token_bytes)
  credentials_path.chmod(0o600)  # readable/writable by the owner only

  import kaggle
  return kaggle


# Import kaggle (uploading a kaggle.json token through Colab when required) and bind the module.
kaggle = get_kaggle()

In [None]:
# Fetch the snap/amazon-fine-food-reviews dataset with the Kaggle CLI.
!kaggle datasets download -d snap/amazon-fine-food-reviews                          #downloading kaggle dataset

In [None]:
!unzip amazon-fine-food-reviews.zip -d train

In [None]:
#loading libraries 

import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

from nltk.stem import PorterStemmer 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from keras.layers import Dense,SpatialDropout1D,LSTM,Embedding


In [None]:
# Read from the relative "train" directory that the unzip cell extracts into, instead of
# Colab's absolute /content path, so the notebook also runs outside Colab.
data=pd.read_csv('train/Reviews.csv')

In [None]:
# Preview the first rows of the freshly loaded reviews table.
data.head()

In [None]:
# Summary statistics of the loaded table. The call parentheses matter: a bare
# `data.describe` only displays the bound-method repr instead of the statistics.
data.describe()

In [None]:
# Keep only the two columns used below: the review text and its score.
data = data.loc[:, ['Text', 'Score']]

In [None]:
# Preview the table after narrowing it to the Text and Score columns.
data.head()

In [None]:
# Drop neutral reviews (Score == 3) so only the scores mapped to negative/positive remain.
data = data.loc[data['Score'].ne(3)]

In [None]:
# Per-column flag: True when the column contains at least one missing value.
data.isna().any()

In [None]:
# Binary label map: scores 1 and 2 -> 0 (negative), scores 4 and 5 -> 1 (positive).
rename_dict = {
    1: 0,
    2: 0,
    4: 1,
    5: 1,
}

In [None]:
# Display the score -> label mapping.
rename_dict

In [None]:
# Replace the raw scores in the Score column with the 0/1 labels from rename_dict.
# NOTE(review): not idempotent -- the map contains 1 -> 0, so running this cell a second
# time on already-relabelled data turns every positive (1) into a negative (0).
data = data.assign(Score=data["Score"].replace(rename_dict))

In [None]:
# Preview the table after the scores were replaced by 0/1 labels.
data.head()

In [None]:
# Shared Porter stemmer instance; clean_word reads it as a module-level name.
stemmer=PorterStemmer()

In [None]:
# Build the stopword lookup once. Calling stopwords.words('english') inside the
# per-token filter rebuilds the list for every word, and list membership is a linear scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def clean_word(text):
  """Remove English stopwords from ``text`` and Porter-stem the remaining tokens.

  Args:
    text: A string; it is split on whitespace.

  Returns:
    The surviving tokens, each passed through the module-level ``stemmer``,
    joined by single spaces. Empty or all-stopword input yields "".

  Note:
    Tokens are compared against the stopword list as-is (no lowercasing and no
    punctuation stripping), so only exact matches are removed.
  """
  # NOTE(review): no cell visible here calls clean_word; the tokenizer is fit on the raw Text column.
  return " ".join(stemmer.stem(token) for token in text.split()
                  if token not in _ENGLISH_STOPWORDS)

In [None]:
# Start the training sample with the first 4000 negative (Score == 0) reviews.
data1 = data.loc[data['Score'] == 0].head(4000)

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported way to stack the first 4000 positive reviews under the negatives.
data1 = pd.concat([
    data1[data1['Score'] == 0],        # negatives only, so re-running does not stack positives twice
    data[data['Score'] == 1][:4000],   # first 4000 positive reviews
])

In [None]:
# Split the sample by label and print each subset's (rows, columns) shape, positives first.
pos_cnt = data1.loc[data1['Score'].eq(1)]
neg_cnt = data1.loc[data1['Score'].eq(0)]

print(pos_cnt.shape, neg_cnt.shape, sep="\n")

In [None]:
# Word-level tokenizer: lowercases text, strips the listed punctuation characters and
# splits on spaces. num_words caps the vocabulary used when texts are turned into
# sequences (see the Keras Tokenizer documentation for the exact cut-off).
tokenizer = Tokenizer(
    num_words=1000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)

In [None]:
# Build the word index from the sampled review texts (the raw Text column; clean_word is not applied here).
tokenizer.fit_on_texts(data1['Text'])

In [None]:
# Show only the first 20 (word, index) pairs; printing the whole word_index would dump
# every distinct word found in the reviews into the cell output.
print(list(tokenizer.word_index.items())[:20])

In [None]:
# Number of entries in the fitted word index.
print(len(tokenizer.word_index))

In [None]:
# Encode every sampled review with the fitted tokenizer (one integer sequence per review).
sequences=tokenizer.texts_to_sequences(data1['Text'].values)

In [None]:
# Peek at the first ten encoded reviews.
print(sequences[0:10])

In [None]:
# Bring every sequence to length 40, padding short ones at the end (padding='post').
# NOTE(review): `truncating` is left at the library default -- confirm which end of
# reviews longer than 40 tokens is kept.
padded=pad_sequences(sequences,maxlen=40,padding='post')                          #post padding sequences

In [None]:
# Peek at the first ten padded rows.
print(padded[0:10])

In [None]:
# Model input matrix: another name for the padded sequences (no copy is made).
X=padded

In [None]:
embed_dim = 10     # size of each learned word vector
lstm_out = 100     # number of LSTM units

# The layers are taken from tf.keras so they match the tf.keras Sequential container;
# the bare layer names imported in this notebook come from the standalone `keras` package.
model = Sequential()
# input_dim follows the tokenizer's num_words so the embedding table always covers
# every word id the tokenizer can emit.
model.add(tf.keras.layers.Embedding(tokenizer.num_words, embed_dim,
                                    input_length=X.shape[1]))
model.add(tf.keras.layers.SpatialDropout1D(0.4))
model.add(tf.keras.layers.LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Binary task with 0/1 labels: a single sigmoid unit outputs P(positive).
# A softmax over one unit is always exactly 1.0 and cannot learn, and
# categorical_crossentropy expects one-hot targets, so sigmoid + binary_crossentropy is used.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target vector: the 0/1 labels of the same data1 rows that X was built from.
Y=data1['Score'].values

In [None]:
# Shape of the label vector.
Y.shape

In [None]:
# Shape of the padded input matrix; its first dimension should equal Y's length.
X.shape

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Print the shape of each split array, one per line (train X, train Y, test X, test Y).
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape, sep="\n")

In [None]:
# Train for 4 epochs with mini-batches of 32 samples.
# NOTE(review): no validation data is passed, so only training metrics are reported per epoch.
model.fit(X_train,Y_train,batch_size=32,epochs=4)                                #training model 

In [None]:
validation_size = 1000

# Peel the last `validation_size` rows off the test split and evaluate on what remains.
# NOTE(review): X_validate / Y_validate are not used in this cell, and because
# X_test / Y_test are overwritten, each re-run of the cell removes another 1000 rows.
X_test, X_validate = X_test[:-validation_size], X_test[-validation_size:]
Y_test, Y_validate = Y_test[:-validation_size], Y_test[-validation_size:]

# evaluate() yields the loss first, then the accuracy metric configured at compile time.
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=32)
print("score: %.2f" % score)
print("acc: %.2f" % acc)

In [None]:
# twt = ['not tasty']
# twt = tokenizer.texts_to_sequences(twt)
# twt = pad_sequences(twt, 40, dtype='int32', value=0)
# sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
# print(int(sentiment))

# #print(np.argmax(sentiment))

# if((sentiment) == 0):
#     print("negative")
# elif ((sentiment) == 1):
#     print("positive")