In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)
dataset_path = 'gdrive/My Drive/Projects/Sentiment Analysis/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import pandas as pd
import numpy as np

train_df = pd.read_excel(dataset_path + "train.xlsx", encoding = 'latin-1')
test_df = pd.read_excel(dataset_path + "test.xlsx", encoding = 'latin-1')
X_train, y_train = train_df['Review'], train_df['Sentiment']
X_test, y_test = test_df['Review'], test_df['Sentiment']

# Preprocessing



1.   All punctuation marks are removed from the data and replaced by " "
2.   All special characters are removed from both the train and test reviews
3. All words are converted to their lower case versions
4. Stopwords are removed from the reviews



In [0]:
from nltk.corpus import stopwords
import nltk
import string
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
stop_words = stopwords.words('english')
X_train_mod, y_train_mod = [], []
X_test_mod, y_test_mod = [], []
words = []
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
for ind, review in enumerate(X_train):
  try:
    word_review = review.split(' ')
    word_review = [w.translate(table) for w in word_review]
    word_review = [w for w in word_review if w.isalpha()]
    word_review = [w.lower() for w in word_review if w not in stop_words]
    word_review = [lmtzr.lemmatize(w) for w in word_review]
    X_train_mod.append(' '.join(word_review))
    y_train_mod.append(y_train[ind])
    words += word_review
  except:
    continue
for ind, review in enumerate(X_test):
  try:
    word_review = review.split(' ')
    word_review = [w.translate(table) for w in word_review]
    word_review = [w for w in word_review if w.isalpha()]
    word_review = [w.lower() for w in word_review if w not in stop_words]
    word_review = [lmtzr.lemmatize(w) for w in word_review]
    X_test_mod.append(' '.join(word_review))
    y_test_mod.append(y_test[ind])
    words += word_review
  except:
    continue
    
X_total = X_train_mod + X_test_mod

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Embedding 

The Glove vector embedding that has been downloaded is used to create an embedding vector for each word and stored in the dictionary **embedding**


In [0]:
embedding = {}
f = open('gdrive/My Drive/glove.840B.300d.txt')

for line in f:
    values = line.split()
    word = values[0]
    try:
      coefs = np.asarray(values[1:], dtype='float32')
      embedding[word] = coefs
    except:
      continue
    
f.close()


Keras is used to create sequences for all the data and the maximum length of reviews across both train and test data sets is used to pad the sequences


In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_len = max(len(l.split()) for l in X_total)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_total)
sequences = tokenizer.texts_to_sequences(X_train_mod)

word_index = tokenizer.word_index
print(len(word_index))

review_pad = pad_sequences(sequences, maxlen = max_len)
print(review_pad.shape)


Using TensorFlow backend.


72959
(24999, 1225)


In [0]:
y_train_mod = np.asarray(y_train_mod).reshape(len(y_train_mod), 1)


(24999, 1)


An embedding matrix is created for the words in the sequence

In [0]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 300))
for word, i in word_index.items():
  if i > num_words:
    continue
  embedding_vector = embedding.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
    
print(num_words)

72960


# Model


*   A **Bidirectional GRU of 64 layers with dropouts of 0.2** is followed by a **Dense layer of 1** unit
*  **Sigmoid activation** is used in the last layer and the loss is **binary cross-entropy**
* As we are using pretraine GLoVE model we have kept the embedding layer as untrainable



In [0]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional
from keras.initializers import Constant


model = Sequential()
embedding_layer = Embedding(num_words,
                           300,
                           embeddings_initializer = Constant(embedding_matrix),
                           input_length = max_len,
                           trainable=False)
model.add(embedding_layer)
model.add(Bidirectional(GRU(units=64, dropout=0.2, recurrent_dropout = 0.2)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

W0623 04:10:04.974187 140687264831360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0623 04:10:05.012596 140687264831360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0623 04:10:05.579812 140687264831360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0623 04:10:05.769470 140687264831360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0623 04:10:05.781176 

Validation data is 20% of the train data

In [0]:
val_split = 0.2
indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = y_train_mod[indices]
num_validation_samples = int(val_split*review_pad.shape[0])

X_train_pad = review_pad[:-num_validation_samples]
y_train_pad = sentiment[:-num_validation_samples]
X_test_pad = review_pad[-num_validation_samples:]
y_test_pad = sentiment[-num_validation_samples:]

In [0]:
from keras.callbacks import ModelCheckpoint
checkpoint_path = "cp-{epoch:04d}.ckpt"

cp_callback = ModelCheckpoint(checkpoint_path, verbose=1,
                              save_weights_only=True,
                              period=5)


In [0]:
model.fit(X_train_pad, y_train_pad, batch_size = 128, epochs = 25,
          validation_data = (X_test_pad, y_test_pad),
          callbacks = [cp_callback], verbose = 2)

In [0]:
model.save('trained_model.h5')

In [0]:
sequences = tokenizer.texts_to_sequences(X_test_mod)

word_index = tokenizer.word_index
print(len(word_index))

test_pad = pad_sequences(sequences, maxlen = max_len)
print(test_pad.shape)

72959
(24998, 1225)


In [0]:
from keras.models import load_model
model = load_model('trained_model.h5')
loss, acc = model.evaluate(test_pad, np.asarray(y_test_mod))
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

Restored model, accuracy: 86.70%
