# Importing Dataset

In [1]:
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle as pk
import json

In [2]:
df = pl.read_csv('./data/final_dataset.csv')
df.head()

sentiment,review
i64,str
1,"""I'm reading a lot of reviews s…"
1,"""This soundtrack is my favorite…"
1,"""I truly like this soundtrack a…"
1,"""If you've played the game, you…"
1,"""I am quite sure any of you act…"


# Preprocessing

In [3]:
import re
from nltk.corpus import stopwords

In [4]:
# Matches anything that looks like a markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Return ``text`` with every span matched by ``TAG_RE`` deleted."""
    return re.sub(TAG_RE, '', text)

In [5]:
# Holds the English stopword set once it has been loaded, so that
# preprocess_text does not re-read the NLTK corpus for every single review.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stopword set, loading it on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Clean a raw review string for tokenisation.

    Steps, in order: strip ``<...>`` tags via ``remove_tags``, lowercase,
    replace every character that is not a-z or whitespace with a space,
    drop stand-alone single letters, collapse whitespace, and remove
    English stopwords.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text as a single space-separated string (may be empty).
    """
    # remove tags and convert to lowercase
    sentence = remove_tags(text).lower()

    # remove punctuation, numbers, and single characters, and collapse multiple spaces
    sentence = re.sub(r'[^a-z\s]', ' ', sentence)  # remove non-alphabetic characters
    sentence = re.sub(r'\b[a-z]\b', '', sentence)  # remove single characters
    sentence = re.sub(r'\s+', ' ', sentence).strip()  # collapse multiple spaces

    # remove stopwords (the set is loaded once and reused across calls)
    stop_words = _english_stop_words()
    return ' '.join(word for word in sentence.split() if word not in stop_words)


In [6]:
# X = []
# sentences = list(df['review'])

# for sen in sentences:
#     X.append(preprocess_text(sen))
    
# with open('./data/preprocessed_text.txt', 'wb') as f:
#     pk.dump(X, f)

In [7]:
with open('./data/preprocessed_text.txt', 'rb') as f:
    X = pk.load(f)

In [8]:
X[2]

'truly like soundtrack enjoy video game music played game music enjoy truly relaxing peaceful disk one favorites scars time life death forest illusion fortress ancient dragons lost fragment drowned valley disk two draggons galdorb home chronomantique prisoners fate gale girlfriend likes zelbessdisk three best three garden god chronopolis fates jellyfish sea burning orphange dragon prayer tower stars dragon god radical dreamers unstealable jewel overall excellent soundtrack brought like video game music xander cross'

In [9]:
from sklearn.model_selection import train_test_split

y = df['sentiment']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preparing Embedding Layer

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-10-27 15:20:38.873757: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-27 15:20:38.953136: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-27 15:20:38.976872: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-27 15:20:39.135548: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# word_tokenizer = Tokenizer()
# word_tokenizer.fit_on_texts(X_train)

# X_train = word_tokenizer.texts_to_sequences(X_train)
# X_test = word_tokenizer.texts_to_sequences(X_test)

In [13]:
# with open('./data/tokenizer.pkl', 'wb') as f:
#     pk.dump(word_tokenizer, f)

In [14]:
# tokenised_data = {
#     'X_train' : X_train,
#     'y_train' : y_train.to_list(),
#     'X_test' : X_test,
#     'y_test' : y_test.to_list()
# }

# with open('./data/tokenized_data.json', 'w') as f:
#     json.dump(tokenised_data, f)

In [15]:
# Load the tokenised train/test splits stored in tokenized_data.json.
with open('./data/tokenized_data.json', 'r') as f:
    tokenised_data = json.load(f)

# Unpack the four stored entries in one step.
X_train, X_test, y_train, y_test = (
    tokenised_data[split_name]
    for split_name in ('X_train', 'X_test', 'y_train', 'y_test')
)

In [16]:
y_train, y_test = np.array(y_train), np.array(y_test)

In [17]:
# vocab_length = len(word_tokenizer.word_index) + 1
# vocab_length
vocab_length = 779656

In [18]:
maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [19]:
X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [20]:
# embeddings_dict = dict()

# with open('./data/glove.6B.100d.txt',  'r') as glove_file:
#     for line in glove_file:
#         records = line.split()
#         word = records[0]
#         vector_dimentions = np.asarray(records[1:], dtype='float32')
#         embeddings_dict[word] = vector_dimentions

In [21]:
# embeddings_dict

In [22]:
# with open('./data/word_tokenizer_items.txt', 'w') as f:
#     pk.dump(word_tokenizer.word_index.items(), f)

In [23]:
# embedding_matrix = np.zeros((vocab_length, 100))
# for word, index in word_tokenizer.word_index.items():
#     embedding_vector = embeddings_dict.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[index] = embedding_vector

In [24]:
# type(embedding_matrix), embedding_matrix[1, :]

In [25]:
# with open('./data/embedding_matrix.txt', 'wb') as f:
#     pk.dump(embedding_matrix, f)

In [26]:
with open('./data/embedding_matrix.txt', 'rb') as f:
    embedding_matrix = pk.load(f)

# LSTM Model

In [27]:
# !pip install keras-tuner

In [28]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, GlobalMaxPooling1D, Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow_model_optimization.sparsity import keras as sparsity

In [29]:
# Enable memory growth on every GPU TensorFlow can see.
# With no GPUs the loop body simply never runs.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the problem and let the notebook continue.
    print(err)

I0000 00:00:1730022671.538841   17985 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1730022671.768153   17985 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1730022671.768366   17985 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [34]:
from keras import Sequential
from keras.layers import Input, Embedding, LSTM, Dropout, GlobalMaxPooling1D, Dense, BatchNormalization
from keras.optimizers import Adam
from tensorflow_model_optimization.sparsity import keras as sparsity

# Define pruning parameters
pruning_params = {
    'pruning_schedule': sparsity.ConstantSparsity(0.5, begin_step=200, frequency=100)
}

# Define the hypermodel that takes hyperparameters as input
def build_model(hp):
    """Build and compile the pruned two-layer LSTM classifier for one tuner trial.

    Reads the module-level names ``maxlen``, ``vocab_length``,
    ``embedding_matrix`` and ``pruning_params``.

    Args:
        hp: Hyperparameter object supplied by the tuner; only ``hp.Int`` and
            ``hp.Float`` are used here.

    Returns:
        The compiled ``Sequential`` model (Adam optimiser, binary
        cross-entropy loss, accuracy metric).

    NOTE(review): the recorded ``tuner.search`` output in this notebook ends in
    ``FatalTypeError`` reporting a ``tf_keras...Sequential`` instance. That
    presumably means the Keras flavour used to build this model does not match
    the one keras_tuner expects (tfmot pruning wrappers appear to be tied to
    tf_keras) -- confirm the installed package versions before relying on
    this function.
    """
    model = Sequential()
    
    # Define input layer
    model.add(Input(shape=(maxlen,)))

    # Embedding Layer
    # Initialised from embedding_matrix and frozen (trainable=False).
    # The hard-coded 100 is assumed to equal embedding_matrix.shape[1] -- TODO confirm.
    model.add(Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False))

    # First LSTM layer with pruning
    lstm_units_1 = hp.Int('lstm_units_1', min_value=64, max_value=256, step=32)
    pruned_lstm_1 = sparsity.prune_low_magnitude(LSTM(units=lstm_units_1, return_sequences=True), **pruning_params)
    model.add(pruned_lstm_1)
    model.add(Dropout(hp.Float('dropout_1', min_value=0.1, max_value=0.5, step=0.1)))

    # Second LSTM layer with pruning
    # return_sequences=True is kept here so the pooling layer below still
    # receives one vector per timestep.
    lstm_units_2 = hp.Int('lstm_units_2', min_value=64, max_value=256, step=32)
    pruned_lstm_2 = sparsity.prune_low_magnitude(LSTM(units=lstm_units_2, return_sequences=True), **pruning_params)
    model.add(pruned_lstm_2)
    model.add(Dropout(hp.Float('dropout_2', min_value=0.1, max_value=0.5, step=0.1)))

    # GlobalMaxPooling1D
    model.add(GlobalMaxPooling1D())

    # Dense layers with pruning
    dense_units_1 = hp.Int('dense_units_1', min_value=64, max_value=256, step=32)
    pruned_dense_1 = sparsity.prune_low_magnitude(Dense(units=dense_units_1, activation='relu'), **pruning_params)
    model.add(pruned_dense_1)
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_3', min_value=0.1, max_value=0.5, step=0.1)))

    dense_units_2 = hp.Int('dense_units_2', min_value=32, max_value=128, step=16)
    pruned_dense_2 = sparsity.prune_low_magnitude(Dense(units=dense_units_2, activation='relu'), **pruning_params)
    model.add(pruned_dense_2)
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_4', min_value=0.1, max_value=0.5, step=0.1)))

    # Output Layer
    # Single sigmoid unit; this layer is added without a pruning wrapper.
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model with a tunable learning rate
    model.compile(
        optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model


In [35]:
from keras_tuner import RandomSearch

tuner = RandomSearch(
    build_model,
    objective='val_accuracy',  # Note: use 'val_accuracy' instead of 'val_acc'
    max_trials=10,
    executions_per_trial=1,
    directory='tuning_dir',
    project_name='lstm_tuning_enhanced'
)

: 

In [32]:
# Fit on the training portion only. X_val / y_val were split out of
# X_train / y_train, so searching on the full X_train would train on the
# validation samples and inflate the val_accuracy objective.
tuner.search(
    X_train_,
    y_train_,
    epochs=5,  # Reduce epochs for faster tuning
    batch_size=64,
    validation_data=(X_val, y_val),
    verbose=1
)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
160               |160               |lstm_units_1
0.2               |0.2               |dropout_1
64                |64                |lstm_units_2
0.3               |0.3               |dropout_2
192               |192               |dense_units_1
0.3               |0.3               |dropout_3
112               |112               |dense_units_2
0.4               |0.4               |dropout_4
0.00061963        |0.00061963        |learning_rate



2024-10-27 15:19:12.549282: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 311862400 exceeds 10% of free system memory.


FatalTypeError: Expected the model-building function, or HyperModel.build() to return a valid Keras Model instance. Received: <tf_keras.src.engine.sequential.Sequential object at 0x7018ed68bac0> of type <class 'tf_keras.src.engine.sequential.Sequential'>.

In [None]:
# Retrieve the best model
best_model_enhanced = tuner.get_best_models(num_models=1)[0]

# Summary of the best hyperparameters
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:", best_hyperparameters.values)

In [None]:
best_model_enhanced.summary()

In [47]:
# score = best_model_enhanced.evaluate(X_test, y_test, verbose=1)

# print(f'Test Score : {score[0]}\nTest Accuracy: {score[1]}')

In [48]:
# best_model_enhanced.save('./models/lstm_model_enhanced.h5')

In [None]:
# NOTE: this cell previously held a stray non-code token that raised NameError
# and stopped "Run All" before the saved model could be loaded.

In [None]:
model = tf.keras.models.load_model('./models/lstm_model_enhanced.h5')

In [50]:
with open('./data/tokenizer.pkl', 'rb') as f:
    tokenizer = pk.load(f)

In [83]:
def model_predict(text, tokenizer, model, maxlen=100):
    """Classify a raw review as 'Positive' or 'Negative'.

    Args:
        text: Raw review string; it is cleaned with ``preprocess_text`` first.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        model: Trained model exposing ``predict``; expected to produce a
            single score for the single input row.
        maxlen: Length every sequence is padded/truncated to. Must equal the
            length used when the training data was padded (100 in this
            notebook).

    Returns:
        'Negative' when the predicted score is below 0.5, otherwise 'Positive'.
    """
    clean_text = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([clean_text])
    # The training data was padded with padding='post'. The library default
    # ('pre') would put the zeros on the opposite side and feed the model
    # inputs laid out differently from what it was trained on.
    padded_sequence = pad_sequences(sequence, padding='post', maxlen=maxlen)
    # Pull the scalar out explicitly instead of relying on the truthiness of
    # a one-element array comparison.
    score = float(np.ravel(model.predict(padded_sequence))[0])

    return 'Negative' if score < 0.5 else 'Positive'

In [None]:
text = "This DVD will be a disappointment if you get it hoping to see some substantial portion of the acts of the various comics listed on the cover. All you get here are snippets of performance, at best. The rest is just loose-leaf reminiscence about the good old days in Boston, in the early 80's, when a lot of comics were hanging out together and getting their start.It's like a frat house reunion. There's a lot of lame nostalgia. There are quite a few guffaws recalling jokes (practical and otherwise)perpetrated - back then. But you had to have been there to appreciate all the basically good ol' boy camaraderie. If you weren't actually a part of that scene, all this joshing and jostling will fall flat.If you want to actually hear some of these comics' routines - you will have to look elsewhere."

pred = model_predict(text=text, tokenizer=tokenizer, model=model)
print(pred)