Import Statements

In [1]:
import csv
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, LSTM
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Load the data set

In [2]:
data = [] 
with open("./reviews.csv", mode="r", errors="ignore") as csv_file: 
  reader = csv.DictReader(csv_file) 
  for row in reader: 
    data.append(row) 
df = pd.DataFrame(data)

Split the data into sentences and labels

In [3]:
sentences = df["review"].values;labels = df["sentiment"].values

Define the vocabulary size and the maximum length of sentences

In [4]:
vocab_size = 1000
maxlen = 100

Create a tokenizer and fit it on the sentences

In [5]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(sentences)

Convert the sentences into sequences of integers

In [6]:
sequences = tokenizer.texts_to_sequences(sentences)

Pad the sequences to have the same length

In [7]:
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

Split the data into training and testing sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [9]:
# calculate the maximum sequence length
max_sequence_length = max(len(sequence) for sequence in padded_sequences)

vocabulary_size = len(tokenizer.word_index) + 1

Define the neural network model

In [10]:
model = Sequential()
model.add(Embedding(vocabulary_size, 128, input_length=max_sequence_length))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Compile the model with binary crossentropy loss and adam optimizer

In [11]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Print the summary of the model

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          15905152  
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 16,036,865
Trainable params: 16,036,865
Non-trainable params: 0
_________________________________________________________________


Conversion of Incoming Data 

In [13]:
# define a function to convert the labels
def convert_labels(labels):
    return np.array([0 if label == 'negative' else 1 for label in labels], dtype=np.float32)

# convert the training labels
y_train = convert_labels(y_train)

# convert the test labels
y_test = convert_labels(y_test)

Train the model for 10 epochs with a batch size of 32

In [14]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ef000efe50>

Evaluate the model on the test set and print the accuracy score

In [15]:
y_pred = model.predict(X_test)
y_pred = np.round(y_pred).flatten()
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc}")

Accuracy: 0.8538
