In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


<div class="alert alert-block alert-info"> <b><span style = "color:#000000">
 Load and Preprocess the Dataset </span> </b>
    
* The IMDb dataset is loaded using pandas.read_csv. <br><br>
    
* HTML tags are removed from the reviews to clean the text data. <br><br>

* The dataset size is adjusted for quicker training and testing, using a small sample (test_size=0.02). <br><br>
    
* This small sample is then split into training and testing sets. <br><br>
    
--------------
-----
    

</div>

In [24]:
# Load the dataset
df = pd.read_csv('IMDB_Dataset.csv')

# Simple preprocessing: replace anything that looks like an HTML tag with a space.
# regex=True is required: since pandas 2.0, Series.str.replace treats the pattern
# as a literal string by default, so '<.*?>' would match nothing and the tags
# would silently stay in the reviews.
df['review'] = df['review'].str.replace('<.*?>', ' ', regex=True)

# Adjust the dataset size for quicker training and testing:
# only the 2% "test" side of this split is kept as the working sample.
_, sample_data, _, sample_labels = train_test_split(df['review'], df['sentiment'], test_size=0.02, random_state=42)

# Split the small sample 50/50 into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(sample_data, sample_labels, test_size=0.5, random_state=42)



<div class="alert alert-block alert-info"> <b><span style = "color:#000000">
 Text Tokenization and Padding </span> </b>
    
* The Tokenizer is configured to only consider the top 10,000 words. <br><br>
    
* Text data (reviews) is converted into sequences of integers, where each integer represents a specific word-token. <br><br>

* Sequences are padded to ensure they have the same length for model input, using pad_sequences. <br>
        
--------------
-----
    

</div>

In [25]:
# Tokenizing the text: the vocabulary is fitted on the training reviews only,
# then the same fitted tokenizer encodes both splits.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data)
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split) for split in (train_data, test_data)
)

# Padding sequences to ensure uniform input size: every review is right-padded
# ('post') up to the length of the longest sequence found in either split.
max_length = max(max(map(len, train_sequences)), max(map(len, test_sequences)))
train_padded, test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (train_sequences, test_sequences)
)


<div class="alert alert-block alert-info"> <b><span style = "color:#000000">
 Building the Transformer Model </span> </b>
    
* Defines a transformer_encoder function that includes a multi-head self-attention mechanism and 
a feed-forward network, fundamental components of a transformer. <br><br>
    
* Constructs the neural network model with an embedding layer, followed by the transformer encoder and global average pooling, culminating in a dense layer for binary classification (positive or negative sentiment). <br><br>

* The model is compiled with the Adam optimizer and binary cross-entropy loss, suitable for binary classification tasks. <br>
        
--------------
-----
    

</div>

In [26]:
import tensorflow as tf

def transformer_encoder(inputs, num_heads, ff_dim, dropout_rate=0.1, key_dim=None):
    """Apply one Transformer encoder block to ``inputs``.

    The block is multi-head self-attention followed by a two-layer
    feed-forward network. Each sub-layer is followed by dropout, a residual
    addition and layer normalization.

    Args:
        inputs: Keras tensor to encode. The feed-forward output is projected
            back to ``inputs.shape[-1]`` so both residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) feed-forward layer.
        dropout_rate: Dropout rate used after the attention and feed-forward
            sub-layers. Defaults to 0.1, the value that used to be hard-coded.
        key_dim: Per-head query/key size for the attention layer. Defaults to
            ``ff_dim``, which is what this function has always passed, so
            existing callers build exactly the same model.

    Returns:
        The encoded Keras tensor, with the same last dimension as ``inputs``.
    """
    if key_dim is None:
        # Backward-compatible default: the attention head size mirrors ff_dim.
        key_dim = ff_dim

    # Multi-head self-attention (query and value are both `inputs`)
    attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(inputs, inputs)
    attention_output = tf.keras.layers.Dropout(dropout_rate)(attention_output)
    attention_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attention_output)

    # Feed-forward layer: expand to ff_dim, then project back to the input width
    ff_output = tf.keras.Sequential([
        tf.keras.layers.Dense(ff_dim, activation='relu'),
        tf.keras.layers.Dense(inputs.shape[-1])
    ])(attention_output)
    ff_output = tf.keras.layers.Dropout(dropout_rate)(ff_output)
    return tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention_output + ff_output)


In [27]:
# Building the model
inputs = tf.keras.layers.Input(shape=(max_length,))
# The embedding table has to cover every index the tokenizer can emit, so its
# size is read from the tokenizer rather than repeating the literal 10000;
# the two values can no longer drift apart if the vocabulary size is changed.
embedding_layer = tf.keras.layers.Embedding(input_dim=tokenizer.num_words, output_dim=64)(inputs)
transformer_block = transformer_encoder(embedding_layer, num_heads=2, ff_dim=32)
# Average over the sequence axis to get one fixed-size vector per review.
global_average_pooling = tf.keras.layers.GlobalAveragePooling1D()(transformer_block)
# Single sigmoid unit: output is the predicted probability of the positive class.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(global_average_pooling)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 1007)]               0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, 1007, 64)             640000    ['input_4[0][0]']             
                                                                                                  
 multi_head_attention_3 (Mu  (None, 1007, 64)             16640     ['embedding_3[0][0]',         
 ltiHeadAttention)                                                   'embedding_3[0][0]']         
                                                                                                  
 dropout_6 (Dropout)         (None, 1007, 64)             0         ['multi_head_attention_3

<div class="alert alert-block alert-info"> <b><span style = "color:#000000">
 Training the Model </span> </b>
    
* Converts sentiment labels (positive/negative) into numeric format (1/0) for model training. <br><br>
    
* Trains the model on the padded training sequences with corresponding labels, using the held-out test split as validation data to monitor loss and accuracy after each epoch. <br><br>
--------------
-----
    

</div>

In [28]:
# Convert labels to numeric (positive -> 1, negative -> 0).
# The explicit astype('int64') matters: replace() on a string column only yields
# an integer Series through an implicit downcast that pandas 2.2 deprecates.
# Casting here keeps the labels numeric regardless of pandas version and raises
# immediately if a label other than 'positive'/'negative' is present.
# Re-running this cell is safe: replace() leaves already-numeric labels untouched.
label_to_id = {'positive': 1, 'negative': 0}
train_labels = train_labels.replace(label_to_id).astype('int64')
test_labels = test_labels.replace(label_to_id).astype('int64')

# Training the model (the test split doubles as validation data for per-epoch monitoring)
history = model.fit(train_padded, train_labels, epochs=10, validation_data=(test_padded, test_labels))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<div class="alert alert-block alert-info"> <b><span style = "color:#000000">
 Evaluating the Model and Making Predictions </span> </b>
    
* Evaluates the trained model on the test set to obtain loss and accuracy metrics, providing insights into its performance. <br><br>
    
* Demonstrates making a prediction with the model on a new sample text, showcasing its practical application. <br><br>
--------------
-----
    

</div>

In [29]:
# Evaluate the model on the test split; the result unpacks into the loss and
# the single 'accuracy' metric the model was compiled with.
test_loss, test_acc = model.evaluate(x=test_padded, y=test_labels)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')

# Making a prediction: the new text goes through the same fitted tokenizer and
# the same post-padding to max_length as the training data.
sample_text = ["This movie was a great journey full of emotion and excitement"]
sample_sequence = tokenizer.texts_to_sequences(sample_text)
sample_padded = pad_sequences(sample_sequence, padding='post', maxlen=max_length)
prediction = model.predict(sample_padded)
# prediction[0] is the sigmoid output for the only sample; above 0.5 counts as positive.
print(f'Sentiment Prediction: {"Positive" if prediction[0] > 0.5 else "Negative"}')


Test Loss: 0.6465569138526917, Test Accuracy: 0.7699999809265137
Sentiment Prediction: Positive
