In [5]:
!pip install ipynb
import numpy as np
np.random.seed(100)
from keras import Model
from keras.models import Sequential
from keras.layers import Dense, LSTM, Input
import spacy
import numpy as np
from nltk.corpus import movie_reviews
from random import shuffle
import nltk
!python -m spacy download en_core_web_lg
import en_core_web_lg
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
nltk.download('movie_reviews')
nltk.download('punkt')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Load a spacy model for english
en_model = en_core_web_lg.load()

In [None]:
for token in en_model('I am taking a lecture on sentiment analysis.'):
    print(token, token.pos_, token.lemma_, token.vector.shape)

I PRON -PRON- (300,)
am AUX be (300,)
taking VERB take (300,)
a DET a (300,)
lecture NOUN lecture (300,)
on ADP on (300,)
sentiment NOUN sentiment (300,)
analysis NOUN analysis (300,)
. PUNCT . (300,)


In [10]:
def create_data_label_pairs(n=100):
    """Create data samples and corresponding labels."""
    postive_file_ids = movie_reviews.fileids(categories=["pos"])[: n]
    negative_file_ids = movie_reviews.fileids(categories=["neg"])[: n]
    sentence_lengths_pos, sentence_lengths_neg = list(), list()
    data_samples, labels = list(), list()
    for file_id in postive_file_ids:
        print(file_id)
        text = movie_reviews.raw(file_id)
        sentence_lengths_pos.append(len(nltk.sent_tokenize(text)))
        for sent in nltk.sent_tokenize(text):
            data_samples.append(sent)
            labels.append("pos")
    for file_id in negative_file_ids:
        print(file_id)
        text = movie_reviews.raw(file_id)
        sentence_lengths_neg.append(len(nltk.sent_tokenize(text)))
        for sent in nltk.sent_tokenize(text):
            data_samples.append(sent)
            labels.append("neg")
    print('Avg Length for POS Samples =', np.mean(sentence_lengths_pos))
    print('Avg Length for NEG Samples =', np.mean(sentence_lengths_neg))
    print('Avg Length for All Samples =', np.mean(sentence_lengths_pos + sentence_lengths_neg))
    return data_samples, labels

In [11]:
def shuffle_items(items):
    """Shuffle items in a list."""
    shuffle(items)
    return items

In [12]:
def select_items_based_on_indexes(items, indexes):
    """Select items in a list on specified indexes."""
    return [items[index] for index in indexes]

In [13]:
# create new samples of data
data_samples, labels = create_data_label_pairs(100)
all_indexes = list(range(len(data_samples)))
shuffle_items(all_indexes)
data_samples = select_items_based_on_indexes(data_samples, all_indexes)
labels = select_items_based_on_indexes(labels, all_indexes)
# Split train and test data
split_point = int(0.8 * len(data_samples))
print(split_point)
train_data, train_labels = data_samples[: split_point], labels[: split_point]
test_data, test_labels = data_samples[split_point:], labels[split_point:]
print(len(test_data))

pos/cv000_29590.txt
pos/cv001_18431.txt
pos/cv002_15918.txt
pos/cv003_11664.txt
pos/cv004_11636.txt
pos/cv005_29443.txt
pos/cv006_15448.txt
pos/cv007_4968.txt
pos/cv008_29435.txt
pos/cv009_29592.txt
pos/cv010_29198.txt
pos/cv011_12166.txt
pos/cv012_29576.txt
pos/cv013_10159.txt
pos/cv014_13924.txt
pos/cv015_29439.txt
pos/cv016_4659.txt
pos/cv017_22464.txt
pos/cv018_20137.txt
pos/cv019_14482.txt
pos/cv020_8825.txt
pos/cv021_15838.txt
pos/cv022_12864.txt
pos/cv023_12672.txt
pos/cv024_6778.txt
pos/cv025_3108.txt
pos/cv026_29325.txt
pos/cv027_25219.txt
pos/cv028_26746.txt
pos/cv029_18643.txt
pos/cv030_21593.txt
pos/cv031_18452.txt
pos/cv032_22550.txt
pos/cv033_24444.txt
pos/cv034_29647.txt
pos/cv035_3954.txt
pos/cv036_16831.txt
pos/cv037_18510.txt
pos/cv038_9749.txt
pos/cv039_6170.txt
pos/cv040_8276.txt
pos/cv041_21113.txt
pos/cv042_10982.txt
pos/cv043_15013.txt
pos/cv044_16969.txt
pos/cv045_23923.txt
pos/cv046_10188.txt
pos/cv047_1754.txt
pos/cv048_16828.txt
pos/cv049_20471.txt
pos/cv050_

In [14]:
index_to_label_dict = {0: "neg", 1: "pos"}
label_to_index_dict = {"neg": 0, "pos": 1}

In [15]:
def create_sentence_vectors(sentences, maxlen=50):
    """Create sentence vector for each sentence."""
    sentence_vectors = list()
    for i, sentence in enumerate(sentences):
        parsed = en_model(sentence)
        temp_vector = list()
        for token in parsed:
            temp_vector.append(token.vector.tolist())
        sentence_vectors.append(temp_vector)
    return pad_sequences(sentence_vectors, maxlen=maxlen, padding='post')

In [16]:
# maximum sentence length is assumed = 50
train_sentence_vectors = create_sentence_vectors(train_data, 50)
print(len(train_data))
print(train_sentence_vectors.shape)

5569
(5569, 50, 300)


In [34]:
train_labels_indexed = np.array([label_to_index_dict[label] for label in train_labels])
train_labels_indexed = train_labels_indexed.reshape((train_labels_indexed.shape[0], -1))
print(train_labels_indexed.shape)

(5569, 1)


In [37]:
# Define a model and train
input_layer = Input(shape=(50, 300), name='input')
lstm_layer = LSTM(300, name='lstm')(input_layer)
output_layer = Dense(2, activation='softmax', name='output')(lstm_layer)
model = Model(input_layer, output_layer)
print(model.summary())

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 50, 300)]         0         
_________________________________________________________________
lstm (LSTM)                  (None, 300)               721200    
_________________________________________________________________
output (Dense)               (None, 2)                 602       
Total params: 721,802
Trainable params: 721,802
Non-trainable params: 0
_________________________________________________________________
None


In [38]:
# Compile the model
model.compile(optimizer='Adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [44]:
# Train on the data
train_labels_indexed = train_labels_indexed.reshape((train_labels_indexed.shape[0], 1))
model.fit(train_sentence_vectors, train_labels_indexed, batch_size=8, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f12eb3bda50>

In [45]:
# predict on test data
test_sentence_vectors = create_sentence_vectors(test_data, 50)
predicted_sentiment_vectors = model.predict(test_sentence_vectors)
print(predicted_sentiment_vectors.shape)

(1393, 2)


In [46]:
# Evaluation
predicted_sentiments = list()
for pred in predicted_sentiment_vectors:
  predicted_sentiments.append(index_to_label_dict[np.argmax(pred)])
print(classification_report(test_labels, predicted_sentiments))

              precision    recall  f1-score   support

         neg       0.57      0.76      0.66       700
         pos       0.64      0.43      0.51       693

    accuracy                           0.60      1393
   macro avg       0.61      0.60      0.58      1393
weighted avg       0.61      0.60      0.58      1393

