In [1]:
import pandas as pd

dataset_splits = {'train': 'Dataset_IMDB_Sentiment/plain_text/train-00000-of-00001.parquet', 'test': 'Dataset_IMDB_Sentiment/plain_text/test-00000-of-00001.parquet', 'unsupervised': 'Dataset_IMDB_Sentiment/plain_text/unsupervised-00000-of-00001.parquet'}
imdb_train_df = pd.read_parquet(dataset_splits["train"])
imdb_test_df = pd.read_parquet(dataset_splits['test'])

In [2]:
# Visualising the dataset 
print(imdb_train_df.head(5))

print(len(imdb_train_df['text'][0]))
print(len(imdb_train_df))

                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0
1640
25000


## Pre-processing

In [3]:
import nltk

nltk.download('punkt') 
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tiberiu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tiberiu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin # This allows the custom class to work with the Pipeline
from nltk.tokenize import word_tokenize, RegexpTokenizer # Import the tokenize package and regex tokenizer
from nltk.corpus import stopwords # Import stopwords for stopwords removal
from nltk.stem import SnowballStemmer, WordNetLemmatizer # Import the snowball stemmer (also known as Porter2)
import re # Import regex
import numpy as np

class pre_process(BaseEstimator, TransformerMixin): # TODO Further improve on this pre-processing
    def __init__(self):
      # Initialize objects for efficiency
      self.stemmer = SnowballStemmer("english")
      self.lemmatizer = WordNetLemmatizer()
      self.tokenizer = self.tokenizer = RegexpTokenizer(r'\w+')
      self.stopwords = stopwords.words("english")
      return None

    def fit(self, X, y=None):
        return self 

    def transform(self, X, y=None):
      processed_text = []
      for text in X:
        text = re.sub(r'\d+', '', text)  # Remove numbers NOTE: I added this
        token_text = self.tokenizer.tokenize(text) 
        normalised_text = [token.lower() for token in token_text if token.isalpha()]

        no_stopwords_text = [token for token in normalised_text if token not in self.stopwords]

        # Using lemmatization or stemming TODO I should decide on one in the end
        # processed_text += [[self.stemmer.stem(word) for word in swr_text]] # Applying stemmer
        processed_text.append([self.lemmatizer.lemmatize(word) for word in no_stopwords_text]) # Applying the lemmatizer

      #  Return tokens as a list of words (this ensure compatibility with FastText)
      return processed_text

In [None]:
# The FastText class
class FastTextVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model_path, max_sequence_length):
        self.model_path = model_path
        self.max_sequence_length = max_sequence_length
        self.vector_dim = 300
        self.word_vectors = self.load_fasttext_vectors()

    def load_fasttext_vectors(self):
        # Loading the FastText word vectors from the .vec file
        word_vectors = {}
        with open(self.model_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                word = parts[0]
                vector = np.array(parts[1:], dtype=np.float32)
                word_vectors[word] = vector
        return word_vectors

    def fit(self, X, y=None):
        return self

    def transform(self, tokens):
        # Convert each tokenized text into a list of word vectors
        tokenized_vectors = [
            [
                self.word_vectors[word] if word in self.word_vectors else np.zeros(self.vector_dim)
                for word in token
            ]
            for token in tokens
        ]

        # Reshape data to 3D to make it compatible with the BiLSTM GRU model
        # Pad sequences to ensure consistent dimensions: sequence length, vector dim
        padded_data = pad_sequences(
            [np.array(seq) for seq in tokenized_vectors],
            maxlen=self.max_sequence_length,
            dtype='float32',
            padding='post',
            truncating='post',
        )

        return np.array(padded_data)

## Algorithm training

In [50]:
# Showcasing the pre processing and vectorization outputs used in the final model training pipelines
from sklearn.pipeline import Pipeline # Adding the pipeline functionality

# Showcasing the pre-process pipeline output
pre_process_pipeline =  Pipeline([
  ('prep', pre_process()) # Custom pre-processing method
  ])

pre_process_example = pre_process_pipeline.fit_transform(imdb_train_df['text'][:1])
print(f"Below is an example of the pre-processed pipeline output:")
print(f"\tThe original text (as tokens): {imdb_train_df['text'][0]}")
print(f"\tThe pre-processed text: {pre_process_example[0]}")

# Showcasing the FastText vectorization pipeline output with 3D shape data
fast_text_pipeline = Pipeline([
  ("prep", pre_process()), # Custom pre-processing method
  ("fast_text_vectorizer", FastTextVectorizer(
    model_path = "Libraries/FastText/wiki-news-300d-1M.vec",
    max_sequence_length = 250
    )), # FastText word to vector dictionary
  ])

fast_text_pipeline = fast_text_pipeline.fit_transform(imdb_train_df['text'][:1])
print(f"\tThe 3D reshaped data shape: {fast_text_pipeline.shape}")
print(f"\tThe 3D reshaped data representation (sample): {fast_text_pipeline[0][:10]}")


Below is an example of the pre-processed pipeline output:
	The original text (as tokens): I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this w

In [49]:
# Calculate the average tokens per imdb review. 
# This average will be used for the max sequence length in FastText, to avoid truncating and losing potential semantics information
pre_process_example = pre_process_pipeline.fit_transform(imdb_train_df['text'].values)

# Calculate token counts for each pre-processed text
token_lengths = [len(tokens) for tokens in pre_process_example]

# Calculate the statistics
average_tokens = sum(token_lengths) / len(token_lengths)
max_tokens = max(token_lengths)
min_tokens = min(token_lengths)
max_seq_len_90th  = int(np.percentile(token_lengths, 90))

print(f"Average number of tokens: {average_tokens}")
print(f"Maximum number of tokens: {max_tokens}")
print(f"Minimum number of tokens: {min_tokens}")
print(f"90th percentile token length: {max_seq_len_90th}")

Average number of tokens: 123.56692
Maximum number of tokens: 1442
Minimum number of tokens: 4
90th percentile token length: 246


In [58]:
# Adding the model wrapped in a custom Keras model to work with the Pipeline
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, GRU, Dense, Dropout
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline

class BiLSTM_GRU_Model(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, num_classes, dropout_rate):
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate
        self.model = self.build_model()
    
    def build_model(self):
        model = Sequential()
        model.add(Bidirectional(LSTM(32, return_sequences=True), input_shape=self.input_shape))
        model.add(GRU(16))
        model.add(Dropout(self.dropout_rate))
        model.add(Dense(self.num_classes, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=5, batch_size=16, verbose=1)
        return self
    
    def predict(self, X):
        predictions = self.model.predict(X)
        return predictions[:, 1] * 100

In [None]:
from sklearn.metrics import accuracy_score, classification_report

max_sequence_length = 250 # This is the 90th percentile review token length score (rounded)
input_shape = (max_sequence_length, 300)  # Match the reshaped dimensions: max_sequence_length x vector_dim

bilstm_gru_NLP_pipeline = Pipeline([
  ("prep", pre_process()), # Custom pre-processing method
  ("fast_text_vectorizer", FastTextVectorizer(
    model_path = "Libraries/FastText/wiki-news-300d-1M.vec",
    max_sequence_length = max_sequence_length
    )), # FastText word to vector dictionary
    ("model", BiLSTM_GRU_Model(input_shape=input_shape, num_classes=2, dropout_rate=0.3))  # BiLSTM GRU model
  ])

X_train = imdb_train_df['text'].values
y_train = imdb_train_df['label'].values

# Fit the pipeline
first_NLP_pipeline.fit(X_train, y_train)

# Make the predictions
X_test = imdb_test_df['text'].values
y_pred = first_NLP_pipeline.predict(X_test)
y_test = imdb_test_df['label'].values
print("Predictions:", y_pred)

# Compare predictions with the actual ones
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 47ms/step - accuracy: 0.9998 - loss: 9.9003e-04
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 45ms/step - accuracy: 0.9969 - loss: 0.0099
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 43ms/step - accuracy: 0.9985 - loss: 0.0044
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 42ms/step - accuracy: 0.9985 - loss: 0.0049
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 43ms/step - accuracy: 0.9993 - loss: 0.0027
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 43ms/step - accuracy: 0.9995 - loss: 0.0024
Epoch 7/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 40ms/step - accuracy: 0.9992 - loss: 0.0031
Epoch 8/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 39ms/step - accuracy: 0.9968 - loss: 0.0084
Epoch 9/10
[1m505/7

In [None]:
# TODO: This might be my 2nd model to compare BiLSTM with. I might have to use ElectraTokenizer instead of FastText for this one.

from sklearn.pipeline import Pipeline # Adding the pipeline functionality
from transformers import ElectraTokenizer, ElectraModel 

# Load the tokenizer and model
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraModel.from_pretrained('google/electra-small-discriminator')

0        [101, 1045, 12524, 1045, 2572, 8025, 1011, 375...
1        [101, 1000, 1045, 2572, 8025, 1024, 3756, 1000...
2        [101, 2065, 2069, 2000, 4468, 2437, 2023, 2828...
3        [101, 2023, 2143, 2001, 2763, 4427, 2011, 2643...
4        [101, 2821, 1010, 2567, 1012, 1012, 1012, 2044...
                               ...                        
24995    [101, 1037, 2718, 2012, 1996, 2051, 2021, 2085...
24996    [101, 1045, 2293, 2023, 3185, 2066, 2053, 2060...
24997    [101, 2023, 2143, 1998, 2009, 1005, 1055, 8297...
24998    [101, 1005, 1996, 7357, 1997, 6287, 18506, 100...
24999    [101, 1996, 2466, 6401, 2105, 6287, 18506, 204...
Name: tokenized_reviews, Length: 25000, dtype: object
