# Sentiment Analysis on IMDb reviews using BiLSTM and Electra
Author: Tiberiu Rociu  
Student ID: 2006061

# Introduction

    TODO: Add introduction

# Representation Learning

    TODO: Add representation plan

# Algorithms

    TODO: Add algorithm description

# Evaluation

    TODO: Add evaluation plan

In [5]:
import pandas as pd

# Relative locations of the IMDb parquet splits shipped alongside the notebook
dataset_splits = {
    'train': 'Dataset_IMDB_Sentiment/plain_text/train-00000-of-00001.parquet',
    'test': 'Dataset_IMDB_Sentiment/plain_text/test-00000-of-00001.parquet',
    'unsupervised': 'Dataset_IMDB_Sentiment/plain_text/unsupervised-00000-of-00001.parquet',
}

# Only the labelled train/test splits are loaded here
imdb_train_df = pd.read_parquet(dataset_splits['train'])
imdb_test_df = pd.read_parquet(dataset_splits['test'])

In [6]:
# Quick look at the training split: first rows, size of one review, number of reviews
print(imdb_train_df.head(5))

first_review_text = imdb_train_df['text'][0]
print(len(first_review_text))  # character count of the first review
print(len(imdb_train_df))      # number of training reviews

                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0
1640
25000


## Pre-processing

In [4]:
import nltk

# NLTK resources needed by the pre-processing step below
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')  # corpus backing WordNetLemmatizer; without it lemmatize() raises LookupError



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tiberiu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tiberiu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin # This allows the custom class to work with the Pipeline
from nltk.tokenize import word_tokenize, RegexpTokenizer # Import the tokenize package and regex tokenizer
from nltk.corpus import stopwords # Import stopwords for stopwords removal
from nltk.stem import SnowballStemmer, WordNetLemmatizer # Import the snowball stemmer (also known as Porter2)
import re # Import regex
import numpy as np

class pre_process(BaseEstimator, TransformerMixin): # TODO Further improve on this pre-processing
    """Scikit-learn transformer that turns raw review strings into lists of lemmatised tokens.

    For every input text the transformer removes digit runs, tokenises on
    ``\\w+``, keeps purely alphabetic tokens, lower-cases them, drops English
    stop words and lemmatises what is left with WordNet.

    Parameters
    ----------
    strip_html : bool, default=False
        When True, HTML tags such as ``<br />`` are replaced by a space before
        tokenising. With the default (False) the markup is left in place, so a
        tag like ``<br />`` ends up as a ``br`` token, exactly as before.
    """

    def __init__(self, strip_html=False):
        self.strip_html = strip_html
        # Initialize the NLTK helpers once so transform() does not rebuild them per text
        self.stemmer = SnowballStemmer("english")  # kept available; lemmatisation is what transform() applies
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.stopwords = stopwords.words("english")

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for Pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Pre-process an iterable of strings.

        Parameters
        ----------
        X : iterable of str
            Raw texts.
        y : ignored

        Returns
        -------
        list of list of str
            One token list per input text (a list of words, which is the
            format the FastText vectoriser consumes).
        """
        # Build the lookup set once per call: membership tests on the raw list are O(len(list)) per token
        stopword_set = set(self.stopwords)
        lemmatize = self.lemmatizer.lemmatize

        processed_text = []
        for text in X:
            if self.strip_html:
                text = re.sub(r'<[^>]+>', ' ', text)  # Replace tags by a space so neighbouring words stay separated
            text = re.sub(r'\d+', '', text)  # Remove numbers
            normalised_tokens = (
                token.lower()
                for token in self.tokenizer.tokenize(text)
                if token.isalpha()
            )
            # TODO decide between lemmatization and stemming; lemmatization is applied for now
            processed_text.append(
                [lemmatize(token) for token in normalised_tokens if token not in stopword_set]
            )

        #  Return tokens as a list of words (this ensure compatibility with FastText)
        return processed_text

In [5]:
from keras.preprocessing.sequence import pad_sequences

# The FastText class
class FastTextVectorizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer mapping token lists to padded FastText embedding sequences.

    Parameters
    ----------
    model_path : str
        Path to a FastText ``.vec`` text file (one ``word v1 v2 ...`` entry per line).
    max_sequence_length : int or None
        Number of time steps in the output. Longer documents are cut at the
        end, shorter ones are zero-padded at the end. ``None`` pads to the
        longest document of the batch.

    Attributes
    ----------
    vector_dim : int
        Embedding size expected in the file (300).
    word_vectors : dict
        Maps each word to its ``float32`` vector; loaded eagerly in ``__init__``.
    """

    def __init__(self, model_path, max_sequence_length):
        self.model_path = model_path
        self.max_sequence_length = max_sequence_length
        self.vector_dim = 300
        self.word_vectors = self.load_fasttext_vectors()

    def load_fasttext_vectors(self):
        """Read the ``.vec`` file at ``self.model_path`` into a ``{word: vector}`` dict.

        Lines that do not hold exactly ``vector_dim`` values are skipped. This
        drops the ``<count> <dim>`` header line at the top of ``.vec`` files,
        which would otherwise be stored as a bogus word with a 1-element vector.
        """
        word_vectors = {}
        with open(self.model_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Split on the literal space separator only, so entries whose
                # word contains other Unicode whitespace are not broken apart
                parts = line.rstrip().split(' ')
                if len(parts) != self.vector_dim + 1:
                    continue
                word_vectors[parts[0]] = np.array(parts[1:], dtype=np.float32)
        return word_vectors

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for Pipeline compatibility."""
        return self

    def transform(self, tokens):
        """Embed tokenised documents.

        Parameters
        ----------
        tokens : iterable of list of str
            One token list per document.

        Returns
        -------
        numpy.ndarray
            ``float32`` array of shape ``(n_documents, max_sequence_length, vector_dim)``.
            Out-of-vocabulary words and padding positions are all-zero vectors.
        """
        documents = list(tokens)
        max_len = self.max_sequence_length
        if max_len is None:
            max_len = max((len(document) for document in documents), default=0)

        # Fill the 3D output directly (post-padding / post-truncation). This avoids
        # materialising every un-padded sequence first and keeps the result 3D even
        # when the batch is empty or contains only empty documents.
        padded_data = np.zeros((len(documents), max_len, self.vector_dim), dtype=np.float32)
        lookup = self.word_vectors.get
        for row, document in enumerate(documents):
            for position, word in enumerate(document[:max_len]):
                vector = lookup(word)
                if vector is not None:
                    padded_data[row, position] = vector

        return padded_data

## Algorithm training

In [6]:
# Showcasing the pre processing and vectorization outputs used in the final model training pipelines
from sklearn.pipeline import Pipeline # Adding the pipeline functionality

# Showcasing the pre-process pipeline output
pre_process_pipeline = Pipeline([
    ('prep', pre_process()),  # Custom pre-processing method
])

# Run the pre-processing on the first training review only
pre_process_example = pre_process_pipeline.fit_transform(imdb_train_df['text'][:1])
print("Below is an example of the pre-processed pipeline output:")
print(f"\tThe original text (as tokens): {imdb_train_df['text'][0]}")
print(f"\tThe pre-processed text: {pre_process_example[0]}")

# Showcasing the FastText vectorization pipeline output with 3D shape data
fast_text_pipeline = Pipeline([
    ("prep", pre_process()),  # Custom pre-processing method
    ("fast_text_vectorizer", FastTextVectorizer(
        model_path = "Libraries/FastText/wiki-news-300d-1M.vec",
        max_sequence_length = 250
    )),  # FastText word to vector dictionary
])

# Keep the pipeline and its output under separate names so the pipeline object stays usable
fast_text_example = fast_text_pipeline.fit_transform(imdb_train_df['text'][:1])
print(f"\tThe 3D reshaped data shape: {fast_text_example.shape}")
print(f"\tThe 3D reshaped data representation (sample): {fast_text_example[0][:10]}")


Below is an example of the pre-processed pipeline output:
	The original text (as tokens): I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this w

In [7]:
# Measure how many tokens each IMDb review contains after pre-processing.
# The 90th percentile of these counts guides the max sequence length used for FastText,
# so that most reviews are not truncated and keep their semantic information
pre_process_example = pre_process_pipeline.fit_transform(imdb_train_df['text'].values)

# Token count of every pre-processed review
token_lengths = list(map(len, pre_process_example))

# Summary statistics over the token counts
min_tokens, max_tokens = min(token_lengths), max(token_lengths)
average_tokens = sum(token_lengths) / len(token_lengths)
max_seq_len_90th = int(np.percentile(token_lengths, 90))

print(f"Average number of tokens: {average_tokens}")
print(f"Maximum number of tokens: {max_tokens}")
print(f"Minimum number of tokens: {min_tokens}")
print(f"90th percentile token length: {max_seq_len_90th}")

Average number of tokens: 123.56692
Maximum number of tokens: 1442
Minimum number of tokens: 4
90th percentile token length: 246


In [24]:
# Adding the model wrapped in a custom Keras model to work with the Pipeline
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, GRU, Dense, Dropout
from sklearn.base import BaseEstimator, ClassifierMixin
from keras import Input

class BiLSTM_GRU_Model(BaseEstimator, ClassifierMixin):
    """Scikit-learn style wrapper around a small Keras BiLSTM + GRU classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample as passed to ``keras.Input`` (here: sequence length x vector dim).
    num_classes : int
        Number of units in the sigmoid output layer.
    dropout_rate : float
        Dropout rate applied between the GRU and the output layer.
    epochs : int, default=10
        Number of training epochs used by ``fit``.
    batch_size : int, default=16
        Mini-batch size used by ``fit``.
    """

    def __init__(self, input_shape, num_classes, dropout_rate, epochs=10, batch_size=16):
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the network: BiLSTM(32) -> GRU(16) -> Dropout -> Dense(sigmoid)."""
        model = Sequential()
        model.add(Input(shape=self.input_shape))
        model.add(Bidirectional(LSTM(32, return_sequences=True)))  # keep the full sequence for the GRU
        model.add(GRU(16))
        model.add(Dropout(self.dropout_rate))
        model.add(Dense(self.num_classes, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def fit(self, X, y):
        """Train a freshly initialised network on ``X`` / ``y`` and return ``self``.

        The network is rebuilt on every call so that calling ``fit`` again
        (e.g. re-running the training cell) starts from new weights instead of
        silently continuing to train the previously fitted model.
        """
        self.model = self.build_model()
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=1)
        return self

    def predict(self, X):
        """Return hard 0/1 predictions by thresholding the sigmoid outputs at 0.5.

        The result keeps the network's output shape, ``(n_samples, num_classes)``.
        """
        predictions = self.model.predict(X)
        return (predictions > 0.5).astype(int)

In [25]:
from sklearn.metrics import accuracy_score, classification_report

# Training inputs (raw review strings) and their sentiment labels
X_train = imdb_train_df['text'].values
y_train = imdb_train_df['label'].values

max_sequence_length = 250  # 90th percentile of the review token lengths, rounded
input_shape = (max_sequence_length, 300)  # One sample: max_sequence_length time steps x vector_dim features

# End-to-end pipeline: raw text -> token lists -> FastText embeddings -> BiLSTM GRU classifier
bilstm_gru_NLP_pipeline = Pipeline(steps=[
    ("prep", pre_process()),
    (
        "fast_text_vectorizer",
        FastTextVectorizer(
            model_path="Libraries/FastText/wiki-news-300d-1M.vec",
            max_sequence_length=max_sequence_length,
        ),
    ),
    (
        "model",
        BiLSTM_GRU_Model(input_shape=input_shape, num_classes=1, dropout_rate=0.3),
    ),
])

# Sanity check: inputs and labels must line up
print(f"Len x_train {len(X_train)}")
print(f"Len y_train {len(y_train)}")
print(f"Shape X_train {X_train.shape}")
print(f"Shape y_train {y_train.shape}")

Len x_train 25000
Len y_train 25000
Shape X_train (25000,)
Shape y_train (25000,)


In [26]:
# Fit the whole pipeline on the training split: the reviews are pre-processed and
# vectorised by the earlier steps, then the final "model" step is trained on the result
bilstm_gru_NLP_pipeline.fit(X_train, y_train)

Epoch 1/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 54ms/step - accuracy: 0.5113 - loss: 0.6920
Epoch 2/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 53ms/step - accuracy: 0.5388 - loss: 0.6816
Epoch 3/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 53ms/step - accuracy: 0.6932 - loss: 0.5552
Epoch 4/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 52ms/step - accuracy: 0.8733 - loss: 0.3108
Epoch 5/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 53ms/step - accuracy: 0.8860 - loss: 0.2808
Epoch 6/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 55ms/step - accuracy: 0.8952 - loss: 0.2637
Epoch 7/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 53ms/step - accuracy: 0.9066 - loss: 0.2397
Epoch 8/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 54ms/step - accuracy: 0.9154 - loss: 0.2220
Epoch 9/

In [27]:
# Held-out test split: raw review strings and their true labels
X_test = imdb_test_df['text'].values
y_test = imdb_test_df['label'].values

# Run the fitted pipeline end to end on the test reviews
y_pred = bilstm_gru_NLP_pipeline.predict(X_test)
print("Predictions:", y_pred)

# Score the predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step
Predictions: [[0]
 [0]
 [0]
 ...
 [1]
 [0]
 [1]]
Accuracy: 0.89

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.90      0.89     12500
           1       0.90      0.87      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000



# Second pipeline: fine-tuning Electra

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import ElectraTokenizer, ElectraForSequenceClassification, TrainingArguments, Trainer

# Wrap the text/label columns of each split in a HuggingFace Dataset
hf_columns = ['text', 'label']
imdb_train_hf = Dataset.from_pandas(imdb_train_df[hf_columns])
imdb_test_hf = Dataset.from_pandas(imdb_test_df[hf_columns])

# Tokeniser belonging to the Electra checkpoint that is fine-tuned below
electra_tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-discriminator',
    do_lower_case=True,
)

In [128]:
# Electra tokenising function
def electra_tokenize_function(data, tokenizer=None, max_length=512):
    """Tokenise a batch of reviews for Electra.

    Parameters
    ----------
    data : mapping
        Batch with a ``'text'`` entry holding the review strings
        (the form ``Dataset.map(..., batched=True)`` passes in).
    tokenizer : callable, optional
        Tokeniser to apply. Defaults to the notebook-level ``electra_tokenizer``.
    max_length : int, default=512
        Length every sequence is padded / truncated to.

    Returns
    -------
    Whatever the tokeniser returns for the batch (attention mask requested).
    """
    if tokenizer is None:
        # The original body referenced an undefined global named `tokenizer`;
        # the tokeniser loaded by this notebook is `electra_tokenizer`
        tokenizer = electra_tokenizer
    return tokenizer(
        data['text'],
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        max_length=max_length,
    )

In [129]:
# Tokenise both splits in batches (train first, then test)
imdb_train_hf, imdb_test_hf = (
    split.map(electra_tokenize_function, batched=True)
    for split in (imdb_train_hf, imdb_test_hf)
)

Map: 100%|██████████| 25000/25000 [00:44<00:00, 560.35 examples/s]
Map: 100%|██████████| 25000/25000 [00:43<00:00, 571.64 examples/s]


In [130]:
# Loading the Electra model (with a 2-label classification head) and setting the training parameters
electra_model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=2)

electra_training_arguments = TrainingArguments(
    output_dir = './results',
    eval_strategy = "epoch",  # Evaluate on eval_dataset at the end of every epoch
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,  # Samples per device in each training step
    per_device_eval_batch_size = 32,
    num_train_epochs = 7,
    weight_decay = 0.01,
    warmup_steps = 200,
    logging_dir = './logs',
    logging_steps = 500,
    save_steps = 500,
    do_train = True,
    do_eval = True,
    # Mixed precision is only requested when a CUDA device is present, so the
    # notebook can still be run (slowly) on a machine without a GPU
    fp16 = torch.cuda.is_available()
)

electra_trainer = Trainer(
    model = electra_model,
    args = electra_training_arguments,
    train_dataset = imdb_train_hf,
    eval_dataset = imdb_test_hf,
    processing_class = electra_tokenizer  # was the undefined name `tokenizer`
)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [131]:
# Report whether PyTorch can see a CUDA device, to confirm training will run on the GPU
cuda_available = torch.cuda.is_available()
print("Is CUDA available:", cuda_available)
if cuda_available:
    print("GPU Device Name:", torch.cuda.get_device_name(0))

Is CUDA available: True
GPU Device Name: NVIDIA GeForce RTX 3080


In [None]:
# Fine-tune the Electra model on the IMDB training split using the Trainer configured above.
# NOTE FOR MARKING PURPOSES: the cells below this one load an already fine-tuned model from disk,
# so this cell does not need to be re-run.
# Retraining will probably take a long time if a dedicated GPU is not used.
electra_trainer.train()

  9%|▉         | 500/5474 [01:01<09:47,  8.46it/s]

{'loss': 0.4595, 'grad_norm': 8.129602432250977, 'learning_rate': 1.8866135760333716e-05, 'epoch': 0.64}


 14%|█▍        | 781/5474 [01:34<08:59,  8.69it/s]
 14%|█▍        | 783/5474 [02:10<10:50:44,  8.32s/it]

{'eval_loss': 0.22111102938652039, 'eval_runtime': 35.6266, 'eval_samples_per_second': 701.723, 'eval_steps_per_second': 21.95, 'epoch': 1.0}


 18%|█▊        | 1000/5474 [02:36<08:47,  8.48it/s]  

{'loss': 0.2467, 'grad_norm': 3.4743406772613525, 'learning_rate': 1.697004171406902e-05, 'epoch': 1.28}


 27%|██▋       | 1500/5474 [03:35<07:48,  8.49it/s]

{'loss': 0.2097, 'grad_norm': 7.339658260345459, 'learning_rate': 1.5073947667804325e-05, 'epoch': 1.92}


 29%|██▊       | 1563/5474 [03:43<07:30,  8.69it/s]
 29%|██▊       | 1565/5474 [04:19<9:10:34,  8.45s/it]

{'eval_loss': 0.2104008048772812, 'eval_runtime': 36.1802, 'eval_samples_per_second': 690.986, 'eval_steps_per_second': 21.614, 'epoch': 2.0}


 37%|███▋      | 2000/5474 [05:11<06:59,  8.28it/s]  

{'loss': 0.1768, 'grad_norm': 10.491612434387207, 'learning_rate': 1.3181645809632159e-05, 'epoch': 2.56}


 43%|████▎     | 2345/5474 [05:52<05:59,  8.69it/s]
 43%|████▎     | 2347/5474 [06:28<7:08:43,  8.23s/it]

{'eval_loss': 0.21426326036453247, 'eval_runtime': 35.2065, 'eval_samples_per_second': 710.096, 'eval_steps_per_second': 22.212, 'epoch': 3.0}


 46%|████▌     | 2500/5474 [06:46<05:50,  8.49it/s]  

{'loss': 0.1644, 'grad_norm': 13.929616928100586, 'learning_rate': 1.1285551763367465e-05, 'epoch': 3.2}


 46%|████▌     | 2528/5474 [06:49<05:47,  8.48it/s]

In [124]:
# Persist the fine-tuned model and its tokeniser side by side in one directory
electra_save_dir = './electra_custom_trained_model'
electra_model.save_pretrained(electra_save_dir)
electra_tokenizer.save_pretrained(electra_save_dir)

('./electra_custom_trained_model\\tokenizer_config.json',
 './electra_custom_trained_model\\special_tokens_map.json',
 './electra_custom_trained_model\\vocab.txt',
 './electra_custom_trained_model\\added_tokens.json')

In [125]:
# Reload the fine-tuned Electra tokenizer and model from the directory they were saved to
electra_custom_dir = './electra_custom_trained_model'
electra_custom_tokenizer = ElectraTokenizer.from_pretrained(electra_custom_dir)
electra_custom_model = ElectraForSequenceClassification.from_pretrained(electra_custom_dir)

# Trainer around the reloaded model; it is only given an evaluation dataset
custom_electra_trainer = Trainer(
    model=electra_custom_model,
    args=electra_training_arguments,
    eval_dataset=imdb_test_hf,
    processing_class=electra_custom_tokenizer,
)

In [97]:
# Evaluate the reloaded model on the trainer's eval_dataset (the tokenised test split)
# and print the metrics dictionary returned by evaluate()
results = custom_electra_trainer.evaluate()
print("Evaluation results:", results)

100%|██████████| 782/782 [00:17<00:00, 43.88it/s]

Evaluation results: {'eval_loss': 0.41617077589035034, 'eval_model_preparation_time': 0.0015, 'eval_runtime': 17.8484, 'eval_samples_per_second': 1400.686, 'eval_steps_per_second': 43.813}





In [126]:
# Generating the predictions on the test dataset
def predict_with_trainer(trainer, dataset):
    """Run ``trainer.predict`` on ``dataset`` and return ``(predicted_labels, true_labels)``.

    The predicted label of each sample is the arg-max over axis 1 of the
    prediction scores; the true labels are the ``label_ids`` of the same output.
    """
    output = trainer.predict(dataset)
    return output.predictions.argmax(axis=1), output.label_ids

# Predicted and true labels for the tokenised test split, using the reloaded model's trainer
predictions, labels = predict_with_trainer(custom_electra_trainer, imdb_test_hf)

# Showing the model accuracy (fraction of matching labels, printed as a percentage)
accuracy = accuracy_score(labels, predictions)
print(f"Custom Electra model accuracy on the test dataset: {accuracy * 100:.2f}%")

100%|██████████| 782/782 [00:35<00:00, 22.12it/s]

Custom Electra model accuracy on the test dataset: 92.82%





In [91]:
# Showing the classification report for the Electra predictions;
# label 0 is displayed as 'Negative' and label 1 as 'Positive'
report = classification_report(labels, predictions, target_names=['Negative', 'Positive'])
print("\nClassification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

    Negative       0.93      0.88      0.91     12500
    Positive       0.89      0.94      0.91     12500

    accuracy                           0.91     25000
   macro avg       0.91      0.91      0.91     25000
weighted avg       0.91      0.91      0.91     25000

