<a href="https://colab.research.google.com/github/One-78/Deep_Learning_Lab_Project/blob/main/DL_Lab_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

llm_classification_finetuning_path = kagglehub.competition_download('llm-classification-finetuning')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/competitions/data/download-all/llm-classification-finetuning...


100%|██████████| 57.0M/57.0M [00:00<00:00, 80.9MB/s]

Extracting files...





Data source import complete.


In [5]:
import pandas as pd
import json
import os
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

In [2]:
class CFG:
    seeds = [42, 119, 2020]
    vocab_size = 20000
    max_length = 256
    batch_size = 128
    fine_tune_epochs = 40
    learning_rate = 1e-3  # High LR
    warmup_epochs = 2  # Add warmup
    weight_decay = 0.01  # Add regularization

In [5]:
train_df = pd.read_csv(os.path.join(llm_classification_finetuning_path, 'train.csv'))
train_df.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [17]:
prompt_list = []
targets = []
for i in tqdm(range(len(train_df))):
    prompts = json.loads(train_df.iloc[i]["prompt"])
    response_a = json.loads(train_df.iloc[i]["response_a"])
    response_b = json.loads(train_df.iloc[i]["response_b"])
    conversation_a = ""
    conversation_b = ""
    for j in range(len(prompts)):
        if response_a[j] is None:
            response_a[j] = "None"
        if response_b[j] is None:
            response_b[j] = "None"
        conversation_a += prompts[j] + "\n"
        conversation_a += response_a[j] + "\n"
        conversation_b += prompts[j] + "\n"
        conversation_b += response_b[j] + "\n"
    prompt_list.append((conversation_a, conversation_b))
    if train_df.iloc[i]["winner_tie"] == 1:
        targets.append(0)
    if train_df.iloc[i]["winner_model_a"] == 1:
        targets.append(1)
    if train_df.iloc[i]["winner_model_b"] == 1:
        targets.append(2)
len(prompt_list)

  0%|          | 0/57477 [00:00<?, ?it/s]

57477

In [18]:
# Step 2: Define TextVectorization layer
text_vectorizer = TextVectorization(max_tokens=CFG.vocab_size, output_mode='int', output_sequence_length=CFG.max_length)
text_vectorizer.adapt([item[0] for item in prompt_list] + [item[1] for item in prompt_list])

In [3]:
def get_dataset(prompt_list, targets, shuffle=True, batch_size=128):
    part1 = [item[0] for item in prompt_list]
    part2 = [item[1] for item in prompt_list]
    dataset = tf.data.Dataset.from_tensor_slices(((part1, part2), targets))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=2048)
    dataset = dataset.batch(CFG.batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [9]:
def get_base_model(inputs, embedding):
    x = text_vectorizer(inputs)
    x = embedding(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
    return x
def get_model():
    inputs1 = tf.keras.Input(shape=(1,), dtype=tf.string)
    inputs2 = tf.keras.Input(shape=(1,), dtype=tf.string)
    embedding = tf.keras.layers.Embedding(input_dim=CFG.vocab_size, output_dim=64, mask_zero=True)
    x1 = get_base_model(inputs1, embedding)
    x2 = get_base_model(inputs2, embedding)
    x = tf.keras.layers.Concatenate()([x1, x2])
    x = tf.keras.layers.Conv1D(32, 3, activation="relu")(x)
    x = tf.keras.layers.Conv1D(32, 3, activation="relu")(x)
    x = tf.keras.layers.SpatialDropout1D(0.2)(x)
    x = tf.keras.layers.MaxPooling1D()(x)
    x = tf.keras.layers.Conv1D(64, 3, activation="relu")(x)
    x = tf.keras.layers.Conv1D(64, 3, activation="relu")(x)
    x = tf.keras.layers.SpatialDropout1D(0.2)(x)
    x = tf.keras.layers.MaxPooling1D()(x)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.Dense(128, activation="swish")(x)
    outputs = tf.keras.layers.Dense(3, activation="softmax")(x)
    model = tf.keras.Model(inputs=[inputs1, inputs2], outputs=outputs)

    ## model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Train the model with
    vocab_size = 20000
    max_length = 1024
    batch_size = 64  
    fine_tune_epochs = 15  
    learning_rate = 2e-5  
    warmup_epochs = 2
    weight_decay = 0.01  

In [15]:
# Fine tuning and training the model
models = []
for seed in CFG.seeds:
    model_name = f"model_{seed}.keras"
    model_name_path = os.path.join(llm_classification_finetuning_path, model_name)

    train_texts, valid_texts, train_labels, valid_labels = train_test_split(
        prompt_list, targets, test_size=0.2, random_state=seed, stratify=targets  # Add stratify
    )
    valid_ds = get_dataset(valid_texts, valid_labels, shuffle=False)

    if not os.path.exists(model_name_path):
        train_ds = get_dataset(train_texts, train_labels)
        model = get_model()

        # Use AdamW with weight decay
        optimizer = tf.keras.optimizers.AdamW(
            learning_rate=CFG.learning_rate,
            weight_decay=CFG.weight_decay
        )

        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',  # or your loss
            metrics=['accuracy']
        )

        # Callbacks
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=model_name_path,
            monitor='val_loss',
            mode='min',
            save_best_only=True,
            save_weights_only=False,
            verbose=1
        )

        early_stopping_callback = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,  # Increased patience
            verbose=1,
            restore_best_weights=True,
            min_delta=1e-4  # Add minimum improvement threshold
        )

        reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.3,  # More aggressive reduction
            patience=3,  # Increased patience
            min_lr=1e-7,
            verbose=1,
            min_delta=1e-4
        )

        # Optional: Add learning rate warmup
        def lr_schedule(epoch):
            if epoch < CFG.warmup_epochs:
                return CFG.learning_rate * (epoch + 1) / CFG.warmup_epochs
            return CFG.learning_rate

        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_schedule, verbose=1)

        # Fine-tune
        history = model.fit(
            train_ds,
            epochs=CFG.fine_tune_epochs,
            validation_data=valid_ds,
            callbacks=[checkpoint_callback, early_stopping_callback, reduce_lr_callback, lr_scheduler],
            verbose=1
        )

        # Load best weights
        model = tf.keras.models.load_model(model_name_path)
    else:
        model = tf.keras.models.load_model(model_name_path)

    loss, acc = model.evaluate(valid_ds, verbose=0)
    print(f"Seed {seed} - Validation Loss: {loss:.4f} | Validation Accuracy: {acc * 100:.2f}%")
    if 'history' in locals():
        print(f"Best epoch: {np.argmin(history.history['val_loss']) + 1}")
    models.append(model)



Seed 42 - Validation Loss: 0.9853 | Validation Accuracy: 53.52%
Best epoch: 2

Epoch 1: LearningRateScheduler setting learning rate to 1e-05.
Epoch 1/15




[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 0.3515 - loss: 1.0985
Epoch 1: val_loss improved from inf to 1.09758, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_119.keras
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 146ms/step - accuracy: 0.3515 - loss: 1.0985 - val_accuracy: 0.3930 - val_loss: 1.0976 - learning_rate: 1.0000e-05

Epoch 2: LearningRateScheduler setting learning rate to 2e-05.
Epoch 2/15
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - accuracy: 0.3815 - loss: 1.0946
Epoch 2: val_loss improved from 1.09758 to 1.07515, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_119.keras
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 146ms/step - accuracy: 0.3816 - loss: 1.0946 - val_accuracy: 0.4229 - val_loss: 1.0751 - learning_rate: 2.0000e-05

Epoch 3: LearningRateScheduler setting l



[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - accuracy: 0.3549 - loss: 1.0983
Epoch 1: val_loss improved from inf to 1.09733, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_2020.keras
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 147ms/step - accuracy: 0.3549 - loss: 1.0983 - val_accuracy: 0.3491 - val_loss: 1.0973 - learning_rate: 1.0000e-05

Epoch 2: LearningRateScheduler setting learning rate to 2e-05.
Epoch 2/15
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - accuracy: 0.3730 - loss: 1.0945
Epoch 2: val_loss improved from 1.09733 to 1.07653, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_2020.keras
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 146ms/step - accuracy: 0.3730 - loss: 1.0945 - val_accuracy: 0.4272 - val_loss: 1.0765 - learning_rate: 2.0000e-05

Epoch 3: LearningRateScheduler setting



[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.3408 - loss: 1.0984
Epoch 1: val_loss improved from inf to 1.09751, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_2024.keras
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 141ms/step - accuracy: 0.3408 - loss: 1.0984 - val_accuracy: 0.3491 - val_loss: 1.0975 - learning_rate: 1.0000e-05

Epoch 2: LearningRateScheduler setting learning rate to 2e-05.
Epoch 2/15
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - accuracy: 0.3504 - loss: 1.0961
Epoch 2: val_loss improved from 1.09751 to 1.08964, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_2024.keras
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 140ms/step - accuracy: 0.3504 - loss: 1.0961 - val_accuracy: 0.4173 - val_loss: 1.0896 - learning_rate: 2.0000e-05

Epoch 3: LearningRateScheduler setting



[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - accuracy: 0.3398 - loss: 1.0985
Epoch 1: val_loss improved from inf to 1.09780, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_2028.keras
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 142ms/step - accuracy: 0.3398 - loss: 1.0985 - val_accuracy: 0.3520 - val_loss: 1.0978 - learning_rate: 1.0000e-05

Epoch 2: LearningRateScheduler setting learning rate to 2e-05.
Epoch 2/15
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.3594 - loss: 1.0969
Epoch 2: val_loss improved from 1.09780 to 1.08795, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_2028.keras
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 140ms/step - accuracy: 0.3594 - loss: 1.0969 - val_accuracy: 0.4122 - val_loss: 1.0879 - learning_rate: 2.0000e-05

Epoch 3: LearningRateScheduler setting

Fine Tuning the model with  
vocab_size = 20000
max_length = 1024
batch_size = 32
fine_tune_epochs = 20
learning_rate = 5e-6  
warmup_epochs = 2  
weight_decay = 0.01  

In [20]:
# Continue training from best saved model
for seed in CFG.seeds:
    model_name = f"model_{seed}.keras"
    model_name_path = os.path.join(llm_classification_finetuning_path, model_name)

    train_texts, valid_texts, train_labels, valid_labels = train_test_split(
        prompt_list, targets, test_size=0.2, random_state=seed, stratify=targets
    )
    train_ds = get_dataset(train_texts, train_labels)
    valid_ds = get_dataset(valid_texts, valid_labels, shuffle=False)

    # Load existing model
    if os.path.exists(model_name_path):
        print(f"Loading existing model for seed {seed}...")
        model = tf.keras.models.load_model(model_name_path)
        initial_loss, initial_acc = model.evaluate(valid_ds, verbose=0)
        print(f"Initial - Loss: {initial_loss:.4f} | Accuracy: {initial_acc*100:.2f}%")
    else:
        print(f"No existing model found for seed {seed}, creating new one...")
        model = get_model()
        initial_loss = float('inf')

    # Reduce learning rate for continued training
    current_lr = CFG.learning_rate * 0.1  # Use 10x smaller LR
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate=current_lr,
        weight_decay=CFG.weight_decay
    )
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Callbacks
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=model_name_path,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        verbose=1
    )

    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=1,
        restore_best_weights=True,
        min_delta=1e-5
    )

    reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-8,
        verbose=1
    )

    # Continue training
    history = model.fit(
        train_ds,
        epochs=CFG.fine_tune_epochs,
        validation_data=valid_ds,
        callbacks=[checkpoint_callback, early_stopping_callback, reduce_lr_callback],
        verbose=1
    )

    # Load best weights and evaluate
    model = tf.keras.models.load_model(model_name_path)
    final_loss, final_acc = model.evaluate(valid_ds, verbose=0)
    print(f"Final - Loss: {final_loss:.4f} | Accuracy: {final_acc*100:.2f}%")
    print(f"Improvement: {(final_loss - initial_loss):.4f}")


Loading existing model for seed 42...




Initial - Loss: 0.9853 | Accuracy: 53.52%
Epoch 1/20
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.5163 - loss: 0.9988
Epoch 1: val_loss improved from inf to 0.98159, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_42.keras
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 117ms/step - accuracy: 0.5163 - loss: 0.9988 - val_accuracy: 0.5352 - val_loss: 0.9816 - learning_rate: 5.0000e-07
Epoch 2/20
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.5164 - loss: 0.9957
Epoch 2: val_loss improved from 0.98159 to 0.97881, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_42.keras
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 118ms/step - accuracy: 0.5164 - loss: 0.9957 - val_accuracy: 0.5353 - val_loss: 0.9788 - learning_rate: 5.0000e-07
Epoch 3/20
[1m1437/1437[0m [32m━━━━━━━━━━━━



Initial - Loss: 1.0615 | Accuracy: 45.15%
Epoch 1/20
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.4718 - loss: 1.0433
Epoch 1: val_loss improved from inf to 1.06156, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_119.keras
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 119ms/step - accuracy: 0.4718 - loss: 1.0433 - val_accuracy: 0.4510 - val_loss: 1.0616 - learning_rate: 5.0000e-07
Epoch 2/20
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.4728 - loss: 1.0413
Epoch 2: val_loss did not improve from 1.06156
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 116ms/step - accuracy: 0.4728 - loss: 1.0413 - val_accuracy: 0.4509 - val_loss: 1.0617 - learning_rate: 5.0000e-07
Epoch 3/20
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.4722 - loss: 1.0422
Epoch 3: val_loss did n



Initial - Loss: 1.0611 | Accuracy: 44.97%
Epoch 1/20
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.4602 - loss: 1.0539
Epoch 1: val_loss improved from inf to 1.06096, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_2020.keras
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 116ms/step - accuracy: 0.4602 - loss: 1.0539 - val_accuracy: 0.4501 - val_loss: 1.0610 - learning_rate: 5.0000e-07
Epoch 2/20
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.4601 - loss: 1.0550
Epoch 2: val_loss improved from 1.06096 to 1.06094, saving model to /root/.cache/kagglehub/competitions/llm-classification-finetuning/model_2020.keras
[1m1437/1437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 115ms/step - accuracy: 0.4601 - loss: 1.0550 - val_accuracy: 0.4497 - val_loss: 1.0609 - learning_rate: 5.0000e-07
Epoch 3/20
[1m1437/1437[0m [32m━━━━━━━━

In [14]:
test_df = pd.read_csv(os.path.join(llm_classification_finetuning_path, 'test.csv'))

In [15]:
test_prompt_list = []
for i in tqdm(range(len(test_df))):
    prompts = json.loads(test_df.iloc[i]["prompt"])
    response_a = json.loads(test_df.iloc[i]["response_a"])
    response_b = json.loads(test_df.iloc[i]["response_b"])
    conversation_a = ""
    conversation_b = ""
    for j in range(len(prompts)):
        if response_a[j] is None:
            response_a[j] = "None"
        if response_b[j] is None:
            response_b[j] = "None"
        conversation_a += prompts[j] + "\n"
        conversation_a += response_a[j] + "\n"
        conversation_b += prompts[j] + "\n"
        conversation_b += response_b[j] + "\n"
    test_prompt_list.append((conversation_a, conversation_b))
len(test_prompt_list)

  0%|          | 0/3 [00:00<?, ?it/s]

3

In [16]:
def get_test_dataset(prompt_list, batch_size=128):
    part1 = [item[0] for item in prompt_list]
    part2 = [item[1] for item in prompt_list]
    dataset = tf.data.Dataset.from_tensor_slices(((part1, part2), [0] * len(prompt_list)))
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [17]:
test_ds = get_test_dataset(test_prompt_list)