In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, SimpleRNN

## Load Dataset

In [36]:
# Read the Yelp reviews and keep only the two columns the model needs:
# the review text and its sentiment label.
df = pd.read_csv('../input/yelpdataset/yelp.csv')[['sentence', 'label']]
print(df.shape)
df.head()

## Data Analysis

In [37]:
# Class balance: number of reviews per label, ordered by label value.
df['label'].value_counts().sort_index().plot(kind = 'bar')

In [28]:
# Distribution of review lengths, measured in characters.
df['sentence'].str.len().plot(kind = 'hist')

## Data Pre-processing

In [39]:
# Transform text to lowercase. The vectorised string accessor avoids a
# per-row Python lambda and passes missing values through instead of raising.
df['sentence'] = df['sentence'].str.lower()

In [40]:
# Features are the review texts; targets are their sentiment labels.
X, y = df['sentence'], df['label']

# Peek at one example pair (row label 1) as a sanity check.
print(X[1], y[1], sep = '\n')

In [41]:
# Tokenize the review texts.
# num_words = how many of the most frequent words the tokenizer should keep.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X)

# Size of the learned word index plus one.
vocab_size = 1 + len(tokenizer.word_index)

# Replace each review by its sequence of integer word ids.
X = tokenizer.texts_to_sequences(X)

In [42]:
# Same example as before, now shown as a token-id sequence with its label.
print(X[1], y[1], sep = '\n')

In [43]:
# Identify max length: the number of tokens in the longest tokenized review.
# It is used as the padding length and as the model's timestep count.
# `default = 0` keeps the previous result (0) when there are no reviews.
max_length = max(map(len, X), default = 0)

print(max_length)

In [44]:
# Pad every review at the end ('post') so all sequences share max_length.
X = pad_sequences(X, maxlen=max_length, padding='post')

# Row 1 after padding.
print(X[1])

In [45]:
# One-hot encode the labels.
# Two classes: negative or positive.
num_classes = 2
y = to_categorical(y, num_classes=num_classes)

y.shape

In [46]:
# Append a trailing axis of size 1 so the data is (rows, columns, 1),
# matching the model's input_shape of (max_length, 1).
X = np.array(X)
X = X.reshape(X.shape + (1,))
print(X.shape)

In [47]:
# Split train and test: hold out 20% of the reviews for evaluation.
# The fixed random_state makes the split, and therefore the reported accuracy,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(X_train[1])
print(y_train[1])

## Build, Train and Evaluate Model

In [55]:
def vanilla_rnn():
    """Build and compile a single-layer SimpleRNN classifier.

    Reads the module-level ``max_length`` (number of timesteps per input) and
    ``num_classes`` (width of the softmax output), so both must be defined
    before this function is called. Prints the model summary as a side effect.

    Returns:
        The compiled ``Sequential`` model.
    """
    mymodel = Sequential()
    # One scalar feature per timestep; return_sequences=False means only the
    # final RNN output is passed on to the dense layer.
    mymodel.add(SimpleRNN(50, input_shape = (max_length, 1), return_sequences = False))
    mymodel.add(Dense(num_classes))
    mymodel.add(Activation('softmax'))

    mymodel.summary()

    # `learning_rate` is the current keyword; `lr` is a deprecated alias.
    adam = optimizers.Adam(learning_rate = 0.001)
    # Lowercase 'accuracy' lets Keras pick the accuracy variant that matches
    # the loss. The capitalised 'Accuracy' resolves to the exact-equality
    # metric, which compares softmax probabilities to one-hot targets
    # element by element and reports misleading values.
    mymodel.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])

    return mymodel

In [56]:
# Wrap the model builder in a scikit-learn style estimator and train it
# for 20 epochs with mini-batches of 50 samples.
mymodel = KerasClassifier(
    build_fn=vanilla_rnn,
    epochs=20,
    batch_size=50,
)
mymodel.fit(X_train, y_train)

In [57]:
# Predictions for the held-out reviews.
y_pred = mymodel.predict(X_test)
# y_test is one-hot encoded; collapse it back to class indices for comparison.
yTest = np.argmax(y_test, axis = 1)

# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print(accuracy_score(yTest, y_pred))

In [59]:
# Score an unseen review with the same tokenizer, padding and reshape that
# were applied to the training data.
rev = ["Food is not so good"]
rev = tokenizer.texts_to_sequences(rev)

# Pad the token lists directly: pad_sequences accepts a list of
# variable-length lists, whereas wrapping them in np.array() first produces a
# ragged array as soon as several reviews of different lengths are scored.
rev = pad_sequences( rev, padding = 'post', maxlen =  max_length)
rev = rev.reshape( (rev.shape[0], rev.shape[1], 1) )
print(rev.shape)

# `rev` is already an ndarray here, so no extra np.array() copy is needed.
prediction = mymodel.predict(rev)
print(prediction)