# Train Model
In this notebook, we will create and train a model to predict the sentiment of a movie review.

In [1]:
import pandas as pd
import numpy as np
import re
from keras_nlp.models import Tokenizer

Using TensorFlow backend


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Flatten
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.losses import CategoricalCrossentropy
from tqdm import tqdm

In [3]:
# Load the data
df = pd.read_csv('data/preprocessed_dataset.csv')

In [4]:
df.drop(columns=['text']).head(5)

Unnamed: 0,anger,boredom,empty,excitement,amusement,joy,disgust,love,neutral,relief,...,disappointment,gratitude,grief,pride,curiosity,optimism,annoyance,approval,remorse,admiration
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df_new = df.drop(columns=['text'])

In [6]:
# Find frequency of easch sentiment
df_new.sum()

anger              8194
boredom             179
empty               827
excitement         6388
amusement         11021
joy               13192
disgust            6624
love              12033
neutral           63936
relief             2815
sadness           11923
surprise           7701
nervousness       10269
confusion          7359
desire             3817
disapproval       11424
embarrassment      2476
caring             5999
realization        8785
fear               3197
disappointment     8469
gratitude         11625
grief               673
pride              1302
curiosity          9692
optimism           8715
annoyance         13618
approval          17620
remorse            2525
admiration        17131
dtype: int64

In [7]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df_new, test_size=0.2, random_state=42)

# Initialize TextVectorization layer
max_features = 5000  # Maximum number of words to consider
sequence_length = 28  # Maximum length of a sequence

# Ensure X_train and X_test are of type str
X_train = X_train.astype(str)
X_test = X_test.astype(str)

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int', # float?
    output_sequence_length=sequence_length
)

# Adapt the vectorize layer to the training data
vectorize_layer.adapt(X_train.values)

# Vectorize the training and testing data
X_train_vectorized = vectorize_layer(X_train.values)
X_test_vectorized = vectorize_layer(X_test.values)

# Convert y_train and y_test to numpy arrays
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [8]:
# Find longest review
max_len = 0
for review in tqdm(X_train.values):
    max_len = max(max_len, len(review.split()))
max_len

100%|██████████| 199154/199154 [00:00<00:00, 1095583.65it/s]


28

In [9]:
#llok up x_train vector
X_train_vectorized[125236]

<tf.Tensor: shape=(28,), dtype=int64, numpy=
array([   2,   16,    6, 1183,   87,  522,  259,    1,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int64)>

In [14]:
# Build the model
model = Sequential()
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss=CategoricalCrossentropy(), metrics=['accuracy'])

# Train the model
model.fit(X_train_vectorized, y_train, epochs=4, batch_size=32, validation_data=(X_test_vectorized, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_vectorized, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test Loss: 3.4787793159484863
Test Accuracy: 0.2559601664543152


In [18]:
# Train with whole dataset

# Ensure x is of type string
df['text'] = df['text'].astype(str)

# Vectorize the whole dataset
X_vectorized = vectorize_layer(df['text'].values)


# Convert y to numpy arrays
y = df_new.to_numpy()

# Compile the model
model.compile(optimizer='adam', loss=CategoricalCrossentropy(), metrics=['accuracy'])

# Train the model
model.fit(X_vectorized, y, epochs=4, batch_size=32)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x20b2bb67880>

# Save the model


In [19]:
from tensorflow.keras.models import save_model

# Save the model
save_model(model, 'models/sentiment_model')

INFO:tensorflow:Assets written to: models/sentiment_model\assets


In [20]:
# Print versions 
import tensorflow as tf
import keras_nlp
print(f'Tensorflow version: {tf.__version__}')
print(f'Keras version: {tf.keras.__version__}')
print(f'Keras NLP version: {keras_nlp.__version__}')
print(f'Numpy version: {np.__version__}')
print(f'Pandas version: {pd.__version__}')


Tensorflow version: 2.10.1
Keras version: 2.10.0
Keras NLP version: 0.13.0.dev2024061303
Numpy version: 1.26.4
Pandas version: 2.2.1
