In [None]:

import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import random
import os
import time
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, Dense, Dropout, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.callbacks import EarlyStopping, CSVLogger

# ---------------------------------------------------------------------------
# Reproducibility: one seed shared by Python, NumPy and TensorFlow.
# ---------------------------------------------------------------------------
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# ---------------------------------------------------------------------------
# Hyperparameters shared by every model/optimizer combination.
# ---------------------------------------------------------------------------
EMBEDDING_DIM = 100        # width of the token (and position) embeddings
RNN_UNITS = 128            # units per SimpleRNN / LSTM layer
TRANSFORMER_HEADS = 4      # attention heads per transformer block
TRANSFORMER_FF_DIM = 512   # hidden width of the block's feed-forward network
TRANSFORMER_BLOCKS = 2     # intended number of stacked transformer blocks
DROPOUT_RATE = 0.2         # dropout rate used by all three architectures
BATCH_SIZE = 128           # mini-batch size passed to model.fit
EPOCHS = 20                # upper bound on training epochs
PATIENCE = 5               # early-stopping patience, in epochs

# ---------------------------------------------------------------------------
# Output locations (created up front so later saves cannot fail on a
# missing directory).
# ---------------------------------------------------------------------------
for _out_dir in ('../models', '../results/tables', '../results/training_logs'):
    os.makedirs(_out_dir, exist_ok=True)

print(" Loading preprcoessed data")
# np.load on an .npz returns an NpzFile that holds the archive open; the
# context manager closes it as soon as every array has been read into memory.
with np.load('../data/processed/ready_data.npz') as data:
    X_train, y_train = data['X_train'], data['y_train']
    X_val, y_val = data['X_val'], data['y_val']

    # Scalar metadata stored next to the arrays: the full sequence length and
    # the vocabulary size used to size the embedding and output layers.
    max_sequence_len = int(data['max_sequence_len'])
    total_words = int(data['total_words'])
print(f"Data has been loaded successfully. Vocab: {total_words}, SeqLen: {max_sequence_len}")


# Simple RNN baseline: two SimpleRNN layers stacked on top of each other.
def build_rnn(vocab_size, seq_length):
    """Build the two-layer SimpleRNN classifier over the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            softmax output classes.
        seq_length: Full sequence length; the model takes
            ``seq_length - 1`` token ids per sample (presumably the last
            token is the prediction target -- confirm against preprocessing).

    Returns:
        An uncompiled ``Sequential`` model named ``"RNN"``.
    """
    model = Sequential([
        # An explicit Input layer fixes the sequence length. It replaces
        # Embedding(input_length=...), which Keras 3 deprecates and ignores.
        Input(shape=(seq_length - 1,)),
        Embedding(vocab_size, EMBEDDING_DIM),
        # First layer returns the whole sequence so the second can consume it.
        SimpleRNN(RNN_UNITS, return_sequences=True, dropout=DROPOUT_RATE),
        # Second layer keeps only the final state for classification.
        SimpleRNN(RNN_UNITS, return_sequences=False, dropout=DROPOUT_RATE),
        Dense(vocab_size, activation='softmax'),
    ], name="RNN")
    return model

# LSTM baseline: two LSTM layers stacked on top of each other.
def build_lstm(vocab_size, seq_length):
    """Build the two-layer LSTM classifier over the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            softmax output classes.
        seq_length: Full sequence length; the model takes
            ``seq_length - 1`` token ids per sample (presumably the last
            token is the prediction target -- confirm against preprocessing).

    Returns:
        An uncompiled ``Sequential`` model named ``"LSTM"``.
    """
    model = Sequential([
        # An explicit Input layer fixes the sequence length. It replaces
        # Embedding(input_length=...), which Keras 3 deprecates and ignores.
        Input(shape=(seq_length - 1,)),
        Embedding(vocab_size, EMBEDDING_DIM),
        # Layer 1: returns the whole sequence so layer 2 can consume it.
        LSTM(RNN_UNITS, return_sequences=True, dropout=DROPOUT_RATE),
        # Layer 2: keeps only the final state for classification.
        LSTM(RNN_UNITS, return_sequences=False, dropout=DROPOUT_RATE),
        Dense(vocab_size, activation='softmax'),
    ], name="LSTM")
    return model

# Transformer block: self-attention and a feed-forward network, each wrapped
# in dropout + residual connection + layer normalization.
class TransformerBlock(tf.keras.layers.Layer):
    """Post-norm transformer block (self-attention followed by an FFN).

    Args:
        embed_dim: Feature width of the block's input and output. Also used
            as ``key_dim`` of every attention head.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied to the attention and FFN outputs.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``). Accepting them is what allows the layer to be
            rebuilt from its saved config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Self-attention across the tokens of the sequence.
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Position-wise feed-forward network; projects back to embed_dim so
        # the residual addition in call() is shape-compatible.
        self.ffn = Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim),])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply attention and the FFN, each with dropout, residual and norm.

        Args:
            inputs: Tensor whose last dimension is ``embed_dim``.
            training: Forwarded to the dropout layers.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        attn_output = self.att(inputs, inputs)  # query = value = inputs
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm

    def get_config(self):
        """Return the constructor arguments so a saved model can be reloaded.

        Loaders still need to make the class known to Keras, e.g. through
        ``custom_objects={"TransformerBlock": TransformerBlock}``.
        """
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table; inputs
            must not be longer than this.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``). Accepting them is what allows the layer to be
            rebuilt from its saved config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Token ids -> dense vectors.
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        # Position index -> dense vector, added to the token embedding.
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids and add the embedding of each token's position.

        Args:
            x: Tensor of token ids; the last axis is the sequence axis.

        Returns:
            Token embeddings plus position embeddings (broadcast over the
            leading axes).
        """
        seq_len = tf.shape(x)[-1]  # actual length of the incoming sequences
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so a saved model can be reloaded.

        Loaders still need to make the class known to Keras, e.g. through
        ``custom_objects={"TokenAndPositionEmbedding": TokenAndPositionEmbedding}``.
        """
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

# Transformer baseline: TRANSFORMER_BLOCKS stacked blocks over token + position
# embeddings, mean-pooled into a softmax over the vocabulary.
def build_transformer(vocab_size, seq_length):
    """Build the transformer classifier over the vocabulary.

    Args:
        vocab_size: Number of rows in the token-embedding table and number
            of softmax output classes.
        seq_length: Full sequence length; the model takes
            ``seq_length - 1`` token ids per sample.

    Returns:
        An uncompiled functional ``Model`` named ``"Transformer"``.
    """
    inputs = Input(shape=(seq_length - 1,))
    x = TokenAndPositionEmbedding(seq_length - 1, vocab_size, EMBEDDING_DIM)(inputs)

    # Depth follows the TRANSFORMER_BLOCKS constant instead of being
    # hard-coded; at its current value (2) the architecture is unchanged.
    for _ in range(TRANSFORMER_BLOCKS):
        x = TransformerBlock(
            EMBEDDING_DIM, TRANSFORMER_HEADS, TRANSFORMER_FF_DIM, rate=DROPOUT_RATE
        )(x)

    # Average over the sequence axis to get one vector per sample, then
    # regularize with dropout before the output layer.
    x = GlobalAveragePooling1D()(x)
    x = Dropout(DROPOUT_RATE)(x)
    outputs = Dense(vocab_size, activation="softmax")(x)
    model = Model(inputs=inputs, outputs=outputs, name="Transformer")
    return model


# Baseline sweep: train every (architecture, optimizer) pair, cache the results
# on disk, and write one comparison table.
models_to_test = ["RNN", "LSTM", "Transformer"]
optimizers_to_test = ["Adam", "RMSprop", "SGD"]

# Dispatch tables instead of if/elif chains: an unknown name fails loudly with
# a KeyError rather than silently reusing the previous iteration's object.
model_builders = {
    "RNN": build_rnn,
    "LSTM": build_lstm,
    "Transformer": build_transformer,
}
# Factories (not instances) so every run gets a fresh optimizer state.
optimizer_factories = {
    "Adam": lambda: Adam(learning_rate=0.001),
    "RMSprop": lambda: RMSprop(learning_rate=0.001),
    "SGD": lambda: SGD(learning_rate=0.01),  # Baseline SGD 0.01
}


def _best_epoch_index(val_losses):
    """Return the index of the first epoch with the lowest non-NaN val_loss.

    Falls back to the last epoch when every value is NaN (diverged run).
    """
    losses = list(val_losses)
    finite = [i for i, loss in enumerate(losses) if not np.isnan(loss)]
    if not finite:
        return len(losses) - 1
    return min(finite, key=losses.__getitem__)


results_list = []

print(f"\n Starting Experiments: {len(models_to_test) * len(optimizers_to_test)} Combinations")

for model_name in models_to_test:
    for opt_name in optimizers_to_test:

        log_path = f"../results/training_logs/{model_name}_{opt_name}_log.csv"
        model_path = f"../models/{model_name}_{opt_name}.keras"

        # A combination counts as done only when BOTH artifacts exist: the
        # CSV log is written during training, the model only after it ends.
        if os.path.exists(log_path) and os.path.exists(model_path):
            print(f"⏩ Skipping {model_name} + {opt_name} (Already Done)")

            # Report the best epoch, not the last one: EarlyStopping uses
            # restore_best_weights=True, so the saved model corresponds to
            # the lowest val_loss (assuming the Keras version restores at
            # the end of training, as Keras 3 does).
            df_log = pd.read_csv(log_path)
            best_row = df_log.iloc[_best_epoch_index(df_log['val_loss'])]

            results_list.append({
                "Model": model_name,
                "Optimizer": opt_name,
                "Accuracy": round(best_row['val_accuracy'], 4),
                "Loss": round(best_row['val_loss'], 4),
                "Perplexity": round(np.exp(best_row['val_loss']), 2),
                "Training Time (s)": "Cached",  # wall-clock time is not persisted
                "Layers": 2
            })
            continue

        print(f"TRAINING BASELINE: {model_name} (2 Layers) + {opt_name}")

        model = model_builders[model_name](total_words, max_sequence_len)
        opt = optimizer_factories[opt_name]()

        # Compile
        model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

        # Train
        start_time = time.time()
        history = model.fit(
            X_train, y_train,
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            validation_data=(X_val, y_val),
            callbacks=[
                EarlyStopping(patience=PATIENCE, monitor='val_loss', restore_best_weights=True),
                CSVLogger(log_path)
            ],
            verbose=1
        )
        train_time = time.time() - start_time

        # Save the model (holds the restored best weights)
        model.save(model_path)

        # Record the metrics of the epoch whose weights were kept, so the
        # table describes the model that was actually saved.
        best_epoch = _best_epoch_index(history.history['val_loss'])
        final_val_acc = history.history['val_accuracy'][best_epoch]
        final_val_loss = history.history['val_loss'][best_epoch]
        perplexity = np.exp(final_val_loss)

        results_list.append({
            "Model": model_name,
            "Optimizer": opt_name,
            "Accuracy": round(final_val_acc, 4),
            "Loss": round(final_val_loss, 4),
            "Perplexity": round(perplexity, 2),
            "Training Time (s)": round(train_time, 2),
            "Layers": 2
        })

        print(f" Perplexity: {round(perplexity, 2)}")

results_df = pd.DataFrame(results_list)
results_df.to_csv('../results/tables/optimizer_comparison_baseline.csv', index=False)

print("\nAll experiments completed for baseline.")
print(results_df)

 Loading preprcoessed data
Data has been loaded successfully. Vocab: 10225, SeqLen: 20

 Starting Experiments: 9 Combinations
⏩ Skipping RNN + Adam (Already Done)
⏩ Skipping RNN + RMSprop (Already Done)
⏩ Skipping RNN + SGD (Already Done)
⏩ Skipping LSTM + Adam (Already Done)
⏩ Skipping LSTM + RMSprop (Already Done)
⏩ Skipping LSTM + SGD (Already Done)
TRAINING BASELINE: Transformer (2 Layers) + Adam
Epoch 1/20
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 146ms/step - accuracy: 0.0389 - loss: 7.1355 - val_accuracy: 0.0382 - val_loss: 6.9674
Epoch 2/20
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 163ms/step - accuracy: 0.0399 - loss: 6.7553 - val_accuracy: 0.0382 - val_loss: 7.0253
Epoch 3/20
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 169ms/step - accuracy: 0.0399 - loss: 6.7044 - val_accuracy: 0.0382 - val_loss: 7.0313
Epoch 4/20
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 153ms/step 