In [1]:
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle as pk
import json

In [2]:
df = pl.read_csv('./data/final_dataset.csv')
df.head()

class_index,review_text,sentiment
i64,str,i64
3,"""Gave this to my dad for a gag …",1
5,"""I hope a lot of people hear th…",2
5,"""I'm reading a lot of reviews s…",2
4,"""The music of Yasunori Misuda i…",2
5,"""Probably the greatest soundtra…",2


# Preprocessing

In [3]:
import re
from nltk.corpus import stopwords

In [4]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [5]:
def preprocess_text(text):
    # remove tags and convert to lowercase
    sentence = remove_tags(text).lower()

    # remove punctuation, numbers, and single characters, and collapse multiple spaces
    sentence = re.sub(r'[^a-z\s]', ' ', sentence)  # remove non-alphabetic characters
    sentence = re.sub(r'\b[a-z]\b', '', sentence)  # remove single characters
    sentence = re.sub(r'\s+', ' ', sentence).strip()  # collapse multiple spaces

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    sentence = ' '.join([word for word in sentence.split() if word not in stop_words])
    
    return sentence

In [6]:
# X = []
# sentences = list(df['review_text'])

# for sen in sentences:
#     X.append(preprocess_text(sen))
    
# with open('./data/preprocessed_text.txt', 'wb') as f:
#     pk.dump(X, f)

In [7]:
with open('./data/preprocessed_text.txt', 'rb') as f:
    X = pk.load(f)

In [8]:
X[2]

'reading lot reviews saying best game soundtrack figured write review disagree bit opinino yasunori mitsuda ultimate masterpiece music timeless listening years beauty simply refuses fade price tag pretty staggering must say going buy cd much money one feel would worth every penny'

In [9]:
from sklearn.model_selection import train_test_split

y = df['sentiment']

In [10]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preparing Embedding Layer

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-10-26 12:01:40.480548: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-26 12:01:40.559442: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-26 12:01:40.582977: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-26 12:01:40.738345: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# word_tokenizer = Tokenizer()
# word_tokenizer.fit_on_texts(X_train)

# X_train = word_tokenizer.texts_to_sequences(X_train)
# X_test = word_tokenizer.texts_to_sequences(X_test)

In [13]:
# with open('./data/tokenizer.pkl', 'wb') as f:
#     pk.dump(word_tokenizer, f)

In [14]:
# tokenised_data = {
#     'X_train' : X_train,
#     'y_train' : y_train.to_list(),
#     'X_test' : X_test,
#     'y_test' : y_test.to_list()
# }

# with open('./data/tokenized_data.json', 'w') as f:
#     json.dump(tokenised_data, f)

In [15]:
with open('./data/tokenizer.pkl', 'rb') as f:
    word_tokenizer = pk.load(f)

In [16]:
with open('./data/tokenized_data.json', 'r') as f:
    tokenised_data = json.load(f)
    
X_train = tokenised_data['X_train']
X_test = tokenised_data['X_test']
y_train = tokenised_data['y_train']
y_test = tokenised_data['y_test']

In [17]:
y_train, y_test = np.array(y_train), np.array(y_test)
y_train, y_test

(array([2, 2, 2, ..., 2, 1, 0]), array([0, 0, 1, ..., 0, 0, 2]))

In [18]:
# vocab_length = len(word_tokenizer.word_index) + 1
# vocab_length
vocab_length = 741641

In [19]:
maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [20]:
X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [21]:
# embeddings_dict = dict()

# with open('./data/glove.6B.100d.txt',  'r') as glove_file:
#     for line in glove_file:
#         records = line.split()
#         word = records[0]
#         vector_dimentions = np.asarray(records[1:], dtype='float32')
#         embeddings_dict[word] = vector_dimentions

In [22]:
# embeddings_dict

In [23]:
# embedding_matrix = np.zeros((vocab_length, 100))
# for word, index in word_tokenizer.word_index.items():
#     embedding_vector = embeddings_dict.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[index] = embedding_vector

In [24]:
# type(embedding_matrix), embedding_matrix[1, :]

In [25]:
# with open('./data/embedding_matrix.txt', 'wb') as f:
#     pk.dump(embedding_matrix, f)

In [26]:
with open('./data/embedding_matrix.txt', 'rb') as f:
    embedding_matrix = pk.load(f)

In [27]:
type(X_train_), type(y_train), type(X_val), type(y_val), type(X_test), type(y_test)

(numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray)

In [28]:
X_train_

array([[2071, 3405,    6, ...,    0,    0,    0],
       [   7, 4648,   19, ...,    0,    0,    0],
       [1511, 1697, 1459, ...,    0,    0,    0],
       ...,
       [ 642, 2483, 1861, ...,    0,    0,    0],
       [ 129, 2200,  177, ...,    0,    0,    0],
       [  27, 1865,   54, ...,    0,    0,    0]], dtype=int32)

# LSTM Model

In [29]:
import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU, GlobalMaxPooling1D, BatchNormalization
from keras.models import Sequential
from keras_tuner import RandomSearch

In [30]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

I0000 00:00:1729924330.801825   15917 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729924330.950938   15917 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729924330.951155   15917 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [31]:
gpu

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')

In [32]:
with tf.device('/GPU:0'):  # Force the model to run on the GPU
    lstm_model = Sequential([
        Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False),
        
        LSTM(224, return_sequences=True),  # Adjusted units according to the summary
        Dropout(0.3),
        
        LSTM(192, return_sequences=True),  # Adjusted units according to the summary
        Dropout(0.3),
        
        GlobalMaxPooling1D(),  # Now works as it has a 3D input
        
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(80, activation='relu'),  # Adjusted units according to the summary
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(3, activation='softmax')  # Three units for 'negative', 'neutral', 'positive'
    ])

    # Compile the model with a multi-class loss function
    lstm_model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',  # For integer labels (0, 1, 2)
        metrics=['accuracy']
    )


    # Train the model
    lstm_model_history = lstm_model.fit(
        X_train,
        y_train,
        batch_size=128,
        epochs=10,
        verbose=1,
        validation_data=(X_val, y_val)
    )

I0000 00:00:1729924330.973771   15917 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729924330.974004   15917 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729924330.974166   15917 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729924331.079735   15917 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

Epoch 1/10


2024-10-26 12:02:14.668293: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1168000000 exceeds 10% of free system memory.
2024-10-26 12:02:18.915172: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8902


[1m22813/22813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.6680 - loss: 0.7695

2024-10-26 12:10:08.641957: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 233600000 exceeds 10% of free system memory.


[1m22813/22813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m503s[0m 22ms/step - accuracy: 0.6680 - loss: 0.7695 - val_accuracy: 0.7046 - val_loss: 0.6884
Epoch 2/10
[1m22813/22813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m503s[0m 22ms/step - accuracy: 0.7306 - loss: 0.6356 - val_accuracy: 0.7324 - val_loss: 0.6427
Epoch 3/10
[1m22813/22813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m503s[0m 22ms/step - accuracy: 0.7430 - loss: 0.6086 - val_accuracy: 0.7343 - val_loss: 0.6364
Epoch 4/10
[1m22813/22813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m501s[0m 22ms/step - accuracy: 0.7498 - loss: 0.5933 - val_accuracy: 0.7504 - val_loss: 0.6120
Epoch 5/10
[1m22813/22813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m882s[0m 39ms/step - accuracy: 0.7549 - loss: 0.5822 - val_accuracy: 0.7470 - val_loss: 0.6031
Epoch 6/10
[1m    4/22813[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m21:04:21[0m 3s/step - accuracy: 0.7236 - loss: 0.6621

KeyboardInterrupt: 