# Importing Dataset

In [1]:
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle as pk
import json

In [2]:
# Read the labelled review dataset from the project's data directory into a polars DataFrame.
df = pl.read_csv('./data/final_dataset.csv')
# Preview the first rows; the rendered output shows an integer `sentiment` column and a string `review` column.
df.head()

sentiment,review
i64,str
1,"""I'm reading a lot of reviews s…"
1,"""This soundtrack is my favorite…"
1,"""I truly like this soundtrack a…"
1,"""If you've played the game, you…"
1,"""I am quite sure any of you act…"


# Preprocessing

In [3]:
import re
from nltk.corpus import stopwords

In [4]:
# Pattern for anything shaped like a markup tag: '<', one or more characters other than '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every TAG_RE match deleted (nothing is inserted in its place)."""
    without_tags = re.sub(TAG_RE, '', text)
    return without_tags

In [5]:
# Lazily-populated cache for the English stop-word set (see _english_stop_words).
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading it from the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalise one raw review into a space-separated string of lowercase words.

    Steps, in order: strip tags via ``remove_tags``, lowercase, replace every
    character that is not a-z or whitespace with a space, drop single-letter
    words, collapse whitespace, and remove NLTK English stop words.

    Args:
        text: The raw review string.

    Returns:
        The cleaned review as a single string (may be empty).
    """
    # remove tags and convert to lowercase
    sentence = remove_tags(text).lower()

    # remove punctuation, numbers, and single characters, and collapse multiple spaces
    sentence = re.sub(r'[^a-z\s]', ' ', sentence)  # remove non-alphabetic characters
    sentence = re.sub(r'\b[a-z]\b', '', sentence)  # remove single characters
    sentence = re.sub(r'\s+', ' ', sentence).strip()  # collapse multiple spaces

    # remove stopwords; the set is loop-invariant, so it is fetched from the cache
    # instead of being rebuilt from the corpus for every review
    stop_words = _english_stop_words()
    return ' '.join(word for word in sentence.split() if word not in stop_words)


In [6]:
# X = []
# sentences = list(df['review'])

# for sen in sentences:
#     X.append(preprocess_text(sen))
    
# with open('./data/preprocessed_text.txt', 'wb') as f:
#     pk.dump(X, f)

In [7]:
PREPROCESSED_PATH = './data/preprocessed_text.txt'

try:
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only load a
    # cache that this notebook wrote itself.
    with open(PREPROCESSED_PATH, 'rb') as f:
        X = pk.load(f)
except FileNotFoundError:
    # No cache yet (e.g. fresh checkout followed by Restart & Run All): preprocess every
    # review once and store the result so later runs take the fast path above.
    X = [preprocess_text(sen) for sen in df['review']]
    with open(PREPROCESSED_PATH, 'wb') as f:
        pk.dump(X, f)

In [8]:
# Spot-check a single preprocessed review from the loaded list.
X[2]

'truly like soundtrack enjoy video game music played game music enjoy truly relaxing peaceful disk one favorites scars time life death forest illusion fortress ancient dragons lost fragment drowned valley disk two draggons galdorb home chronomantique prisoners fate gale girlfriend likes zelbessdisk three best three garden god chronopolis fates jellyfish sea burning orphange dragon prayer tower stars dragon god radical dreamers unstealable jewel overall excellent soundtrack brought like video game music xander cross'

In [9]:
from sklearn.model_selection import train_test_split

# Target labels: the `sentiment` column of df.
# NOTE(review): presumably row-aligned with X, which holds only if the cached X was built from this same file — confirm.
y = df['sentiment']

In [10]:
# Hold out 20% of the reviews as the test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preparing Embedding Layer

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-09-15 20:34:15.172245: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-15 20:34:15.251288: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-15 20:34:15.277299: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-15 20:34:15.433572: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# Build the word index from the training texts only, so the test texts do not contribute to the vocabulary.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

# Convert both splits with the tokenizer fitted above.
# NOTE(review): X_train / X_test are overwritten in place, so this cell is not idempotent —
# re-running it would pass the already-converted sequences back in instead of the texts.
X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [13]:
# tokenised_data = {
#     'X_train' : X_train,
#     'y_train' : y_train.to_list(),
#     'X_test' : X_test,
#     'y_test' : y_test.to_list()
# }

# with open('./data/tokenized_data.json', 'w') as f:
#     json.dump(tokenised_data, f)

In [14]:
# with open('./data/tokenized_data.json', 'r') as f:
#     tokenised_data = json.load(f)
    
# X_train = tokenised_data['X_train']
# X_test = tokenised_data['X_test']
# y_train = tokenised_data['y_train']
# y_test = tokenised_data['y_test']

In [15]:
# Derive the vocabulary size from the fitted tokenizer rather than pinning the number
# observed on an earlier run (779656): a pinned value goes stale as soon as the data or
# preprocessing changes, and the embedding matrix indexed by word_index would then be
# mis-sized. The +1 reserves row 0, the value pad_sequences pads with by default.
vocab_length = len(word_tokenizer.word_index) + 1

In [16]:
# Fixed sequence length fed to the models; also passed as input_length to the Embedding layers.
maxlen = 100

# Bring every sequence to `maxlen` entries, padding at the end (padding='post').
# The truncation side is not specified here, so the library default applies.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [17]:
# Carve a validation set (20%) out of the padded training data; the trailing-underscore names hold the remaining 80%.
X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [18]:
# Maps each GloVe token to its vector (float32 numpy array).
embeddings_dict = {}

# Each line of the file is a token followed by its whitespace-separated vector components.
# The encoding is given explicitly so the read does not depend on the platform's default
# codec (the GloVe vocabulary contains non-ASCII tokens).
with open('./data/glove.6B.100d.txt', 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Tolerate blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        word, components = records[0], records[1:]
        embeddings_dict[word] = np.asarray(components, dtype='float32')

In [19]:
# embeddings_dict

In [20]:
# with open('./data/word_tokenizer_items.txt', 'w') as f:
#     pk.dump(word_tokenizer.word_index.items(), f)

In [21]:
# One row per tokenizer index, 100 columns to match the 100-dimensional GloVe file.
# Allocated as float32 (the dtype the GloVe vectors are parsed with) instead of numpy's
# default float64: the stored values are identical and the matrix takes half the memory.
embedding_matrix = np.zeros((vocab_length, 100), dtype='float32')
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dict.get(word)
    # Words without a GloVe vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [22]:
# Sanity check: expect (vocab_length, 100).
embedding_matrix.shape

(779656, 100)

# LSTM Model

In [23]:
# !pip install keras-tuner

In [24]:
import keras_tuner as kt
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, Dropout, Embedding, LSTM, Flatten
from keras.models import Sequential

In [25]:
def build_lstm_model(hp):
    """Build and compile one candidate LSTM classifier for a tuner trial.

    Two hyperparameters are registered on ``hp``: ``lstm_units`` (integer,
    64 to 256 in steps of 32) and ``learning_rate`` (float, 1e-4 to 1e-2,
    log sampling).

    Args:
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        A compiled Sequential model: frozen embedding -> LSTM -> sigmoid unit.
    """
    # Embedding initialised from `embedding_matrix` and excluded from training.
    embedding = Embedding(
        vocab_length,
        100,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,
    )

    # Recurrent layer whose width is chosen by the tuner.
    lstm_units = hp.Int('lstm_units', min_value=64, max_value=256, step=32)
    recurrent = LSTM(lstm_units)

    # Single sigmoid output for the binary target.
    classifier = Dense(1, activation='sigmoid')

    model = Sequential([embedding, recurrent, classifier])

    # Adam step size, also chosen by the tuner.
    optimizer = Adam(
        learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')
    )
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [26]:
# Random search over the hyperparameters registered in build_lstm_model.
# A fixed seed makes the sampled hyperparameter combinations repeatable, in line with the
# random_state=42 used for the data splits. Results are stored under
# ./tuning_dir/lstm_tuning, and an existing project there is reloaded (see the
# "Reloading Tuner" message in this cell's output).
tuner = kt.RandomSearch(
    build_lstm_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    directory='./tuning_dir',
    project_name='lstm_tuning',
)

Reloading Tuner from ./tuning_dir/lstm_tuning/tuner0.json


I0000 00:00:1726412818.904421   24847 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726412819.062916   24847 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726412819.063436   24847 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726412819.064892   24847 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [27]:
# tuner.search(X_train_, y_train_, 
#              epochs=5, 
#              validation_data=(X_val, y_val),
#              batch_size=32)

In [28]:
# Take the top-ranked model from the tuner's recorded trials.
# NOTE(review): the tuner.search(...) cell is commented out, so this relies on trials
# already saved under ./tuning_dir/lstm_tuning from an earlier session.
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

2024-09-15 20:37:01.172534: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 311862400 exceeds 10% of free system memory.
2024-09-15 20:37:02.286104: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 311862400 exceeds 10% of free system memory.


In [29]:
# Hand-designed deeper alternative to the tuned model: frozen GloVe embedding, two stacked
# LSTMs, then three shrinking ReLU layers, with dropout after every hidden layer.
lstm_layers = [
    Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(256, return_sequences=True),
    Dropout(0.3),
    LSTM(512),
    Dropout(0.3),
]

# Dense head: (units, dropout rate) for each hidden layer, in order.
for units, drop_rate in ((256, 0.4), (128, 0.4), (64, 0.3)):
    lstm_layers.append(Dense(units, activation='relu'))
    lstm_layers.append(Dropout(drop_rate))

# Single sigmoid output for the binary target.
lstm_layers.append(Dense(1, activation='sigmoid'))

lstm_model = Sequential(lstm_layers)

2024-09-15 20:37:03.496272: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 311862400 exceeds 10% of free system memory.


In [30]:
# Compile the hand-designed model with the same loss as build_lstm_model.
# NOTE(review): the metric is named 'acc' here but 'accuracy' in build_lstm_model, so the
# two models report their accuracy under different names in logs and history.
lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc']
)

In [31]:
# Train on the explicit train/validation split created earlier (X_train_ / X_val) rather
# than letting fit() carve off its own validation_split: that split was otherwise unused
# by the live code, and using it means this model is validated on the same rows as the
# tuner's candidates (see the validation_data passed in the tuner.search cell).
lstm_model_history = lstm_model.fit(
    X_train_,
    y_train_,
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_data=(X_val, y_val),
)

Epoch 1/10


2024-09-15 20:37:04.464577: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1023999200 exceeds 10% of free system memory.
2024-09-15 20:37:07.825929: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8902


[1m 2983/20000[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m13:24[0m 47ms/step - acc: 0.5335 - loss: 0.6848

KeyboardInterrupt: 

In [33]:
# Score the tuner-selected model on the held-out test split; the result is indexed below
# as [loss, accuracy].
score = best_model.evaluate(
    x=X_test,
    y=y_test,
    verbose=1,
)

print(f'Test Score : {score[0]}\nTest Accuracy: {score[1]}')

2024-09-15 19:21:23.520660: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 320000000 exceeds 10% of free system memory.
2024-09-15 19:21:24.454370: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8902


[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 2ms/step - accuracy: 0.9044 - loss: 0.2341
Test Score : 0.23308660089969635
Test Accuracy: 0.9048100113868713


In [38]:
# Score the hand-designed model on the same held-out test split; the result is indexed
# below as [loss, accuracy].
score = lstm_model.evaluate(
    x=X_test,
    y=y_test,
    verbose=1,
)

print(f'Test Score : {score[0]}\nTest Accuracy: {score[1]}')

[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2ms/step - acc: 0.9032 - loss: 0.2392
Test Score : 0.23849792778491974
Test Accuracy: 0.9034562706947327


In [32]:
# best_model.save('./models/Best_LSTM_model.h5')

2024-09-15 20:44:59.829545: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 311862400 exceeds 10% of free system memory.


In [33]:
# !pip install tensorflowjs

Collecting tensorflowjs
  Downloading tensorflowjs-4.21.0-py3-none-any.whl.metadata (3.2 kB)
Collecting flax>=0.7.2 (from tensorflowjs)
  Downloading flax-0.9.0-py3-none-any.whl.metadata (11 kB)
Collecting importlib_resources>=5.9.0 (from tensorflowjs)
  Downloading importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)
Collecting jax>=0.4.13 (from tensorflowjs)
  Downloading jax-0.4.31-py3-none-any.whl.metadata (22 kB)
Collecting jaxlib>=0.4.13 (from tensorflowjs)
  Downloading jaxlib-0.4.31-cp310-cp310-manylinux2014_x86_64.whl.metadata (983 bytes)
Collecting tf-keras>=2.13.0 (from tensorflowjs)
  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Collecting tensorflow-decision-forests>=1.5.0 (from tensorflowjs)
  Downloading tensorflow_decision_forests-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.0 kB)
Collecting tensorflow-hub>=0.16.1 (from tensorflowjs)
  Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl.metadata (1.3 kB)
Coll

In [34]:
# !tensorflowjs_converter --input_format keras ./models/Best_LSTM_model.h5 ./models/tf_js_models

2024-09-15 20:47:22.793449: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-15 20:47:22.879356: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-15 20:47:22.902584: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-15 20:47:23.061961: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
failed to lookup keras version from the file,
    thi