In [None]:
import pandas as pd
import json
import os
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

In [None]:
class CFG:
    """Notebook-wide configuration."""

    # One entry per ensemble member: each seed is used as the random_state of
    # that member's train/validation split and appears in its saved file name.
    seeds = [
        42,
        119,
        2020,
        2024,
        2028,
    ]

In [None]:
# Load the competition's training table and preview its first rows.
train_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/llm-classification-finetuning/train.csv"
)
train_df.head()

In [None]:
def _build_conversation(prompts, responses):
    """Interleave prompts with one model's responses into a single text.

    Each turn contributes ``prompt + "\\n" + response + "\\n"``. A response that
    parsed to ``None`` (JSON ``null``) is rendered as the literal text "None"
    so the turn is kept rather than raising on string concatenation.

    Raises:
        IndexError: if ``responses`` has fewer entries than ``prompts``.
    """
    turns = []
    for turn_idx in range(len(prompts)):
        response = responses[turn_idx]
        if response is None:
            response = "None"
        turns.append(prompts[turn_idx] + "\n" + response + "\n")
    return "".join(turns)


# Position in this list is the class id: 0 = tie, 1 = model A wins, 2 = model B wins.
label_columns = ["winner_tie", "winner_model_a", "winner_model_b"]
text_columns = ["prompt", "response_a", "response_b"]

prompt_list = []  # one (conversation_a, conversation_b) pair per training row
targets = []      # one class id per training row, aligned with prompt_list

# Iterate plain tuples of just the needed columns instead of calling
# `train_df.iloc[i]` several times per row, which rebuilds a Series each time.
rows = train_df[text_columns + label_columns].itertuples(index=False, name=None)
for row_idx, (prompt_json, a_json, b_json, *winner_flags) in enumerate(
    tqdm(rows, total=len(train_df))
):
    # Exactly one winner flag must be set. Independent `if`s would append zero
    # or several labels for a malformed row and silently shift every later
    # label relative to its conversation pair.
    winning_classes = [
        class_id for class_id, flag in enumerate(winner_flags) if flag == 1
    ]
    if len(winning_classes) != 1:
        raise ValueError(
            f"Row {row_idx}: expected exactly one of {label_columns} to be 1, "
            f"got {winner_flags}"
        )

    prompts = json.loads(prompt_json)
    prompt_list.append(
        (
            _build_conversation(prompts, json.loads(a_json)),
            _build_conversation(prompts, json.loads(b_json)),
        )
    )
    targets.append(winning_classes[0])
len(prompt_list)

In [None]:
# Text vectorization shared by every model: raw strings -> fixed-length integer token ids.
vocab_size = 20000  # upper bound on vocabulary size (tunable)
max_length = 1024   # output sequence length in tokens (tunable)

text_vectorizer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=max_length,
)

# Learn the vocabulary from every conversation: all A-side texts first, then all B-side texts.
text_vectorizer.adapt([pair[side] for side in (0, 1) for pair in prompt_list])

In [None]:
def get_dataset(prompt_list, targets, shuffle=True, batch_size=128, shuffle_buffer_size=2048):
    """Build a batched ``tf.data.Dataset`` of ``((text_a, text_b), label)`` examples.

    Args:
        prompt_list: sequence of ``(conversation_a, conversation_b)`` string pairs.
        targets: sequence of labels, one per entry of ``prompt_list``.
        shuffle: if true, shuffle the examples before batching.
        batch_size: number of examples per batch.
        shuffle_buffer_size: size of the shuffle buffer. The default keeps the
            previous hard-coded value; pass ``len(prompt_list)`` for a uniform
            shuffle over the whole dataset.

    Returns:
        A batched, prefetched ``tf.data.Dataset``.

    Raises:
        ValueError: if ``prompt_list`` and ``targets`` differ in length.
    """
    # Fail early with a readable message instead of an opaque tensor-shape
    # error (or, worse, silently misaligned features and labels upstream).
    if len(prompt_list) != len(targets):
        raise ValueError(
            f"prompt_list has {len(prompt_list)} items but targets has {len(targets)}"
        )
    part1 = [item[0] for item in prompt_list]
    part2 = [item[1] for item in prompt_list]
    dataset = tf.data.Dataset.from_tensor_slices(((part1, part2), targets))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [None]:
def get_base_model(inputs, embedding):
    """Tokenize a string input and embed the resulting token ids.

    Uses the module-level ``text_vectorizer`` to turn ``inputs`` into integer
    token ids, then applies the given ``embedding`` layer to them.

    Args:
        inputs: string tensor (a Keras input) holding raw conversation text.
        embedding: embedding layer to apply; callers can pass the same layer
            for several inputs to share its weights.

    Returns:
        The embedded token sequence.
    """
    return embedding(text_vectorizer(inputs))
def get_model():
    """Create and compile the two-input CNN classifier.

    Both conversations go through the shared ``text_vectorizer`` and a single
    shared embedding layer. The two embedded sequences are concatenated with
    Keras' default ``Concatenate`` axis, passed through two convolutional
    stages, average-pooled over the sequence and classified into 3 classes
    with a softmax head.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[text_a, text_b]`` string inputs.
    """
    layers = tf.keras.layers

    def conv_stage(tensor, filters):
        # Two width-3 ReLU convolutions, then spatial dropout and max pooling
        # (default pool size).
        tensor = layers.Conv1D(filters, 3, activation="relu")(tensor)
        tensor = layers.Conv1D(filters, 3, activation="relu")(tensor)
        tensor = layers.SpatialDropout1D(0.2)(tensor)
        return layers.MaxPooling1D()(tensor)

    text_a = tf.keras.Input(shape=(1,), dtype=tf.string)
    text_b = tf.keras.Input(shape=(1,), dtype=tf.string)

    # One embedding table serves both branches so A and B share token representations.
    shared_embedding = layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True)
    embedded_a = get_base_model(text_a, shared_embedding)
    embedded_b = get_base_model(text_b, shared_embedding)

    features = layers.Concatenate()([embedded_a, embedded_b])
    for filters in (32, 64):
        features = conv_stage(features, filters)

    features = layers.GlobalAveragePooling1D()(features)
    features = layers.Dropout(0.3)(features)
    features = layers.Dense(128, activation="swish")(features)
    class_probs = layers.Dense(3, activation="softmax")(features)

    classifier = tf.keras.Model(inputs=[text_a, text_b], outputs=class_probs)
    # Integer class labels -> sparse categorical cross-entropy.
    classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return classifier

In [None]:
# Train (or load) one model per seed; the list is later averaged as an ensemble.
models = []
for seed in CFG.seeds:
    # Seed Python's `random`, NumPy and TensorFlow for this ensemble member.
    # Without this the seed only controls the train/validation split below,
    # while weight initialisation, dropout and dataset shuffling stay
    # unseeded and the run is not reproducible.
    tf.keras.utils.set_random_seed(seed)

    model_name = f"model_{seed}.keras"
    # Seed-specific 80/20 split of the conversation pairs and their labels.
    train_texts, valid_texts, train_labels, valid_labels = train_test_split(
        prompt_list, targets, test_size=0.2, random_state=seed
    )
    valid_ds = get_dataset(valid_texts, valid_labels, shuffle=False)
    model_name_path = f"/kaggle/input/llm-classification-finetuning-with-cnn-model/{model_name}"
    if not os.path.exists(model_name_path):
        # No pre-trained model is available for this seed: train from scratch.
        train_ds = get_dataset(train_texts, train_labels)
        model = get_model()
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=model_name,       # Filepath to save the best model
            monitor='val_loss',        # Metric to monitor
            mode="min",
            save_best_only=True,       # Save only the best model
            verbose=1
        )
        early_stopping_callback = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',        # Metric to monitor
            patience=5,                # Number of epochs with no improvement to wait before stopping
            verbose=1,
            restore_best_weights=True  # Restore weights from the best epoch
        )
        model.fit(train_ds, epochs=30, validation_data=valid_ds, callbacks=[checkpoint_callback, early_stopping_callback])
        # Reload the best checkpoint written by ModelCheckpoint above.
        model.load_weights(model_name)
    else:
        # Reuse the previously trained model from the input dataset and save a
        # copy under the relative path `model_name`.
        model = tf.keras.models.load_model(model_name_path)
        model.save(model_name)
    loss, acc = model.evaluate(valid_ds, verbose=0)
    print(f"Validation Loss: {loss: .4f} Validation Accuracy: {acc * 100: .4f}%")
    models.append(model)

In [None]:
# Load the competition's test table (same text columns as train, no winner labels are read from it).
test_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/llm-classification-finetuning/test.csv"
)

In [None]:
# Build one (conversation_a, conversation_b) text pair per test row, using the
# same "prompt\nresponse\n" layout per turn as the training data.
test_prompt_list = []
for row_idx in tqdm(range(len(test_df))):
    record = test_df.iloc[row_idx]
    prompts = json.loads(record["prompt"])
    answers_a = json.loads(record["response_a"])
    answers_b = json.loads(record["response_b"])

    pieces_a = []
    pieces_b = []
    for turn in range(len(prompts)):
        # A null response is kept as the literal text "None".
        answer_a = "None" if answers_a[turn] is None else answers_a[turn]
        answer_b = "None" if answers_b[turn] is None else answers_b[turn]
        pieces_a.append(prompts[turn] + "\n" + answer_a + "\n")
        pieces_b.append(prompts[turn] + "\n" + answer_b + "\n")

    test_prompt_list.append(("".join(pieces_a), "".join(pieces_b)))
len(test_prompt_list)

In [None]:
def get_test_dataset(prompt_list, batch_size=128):
    """Build an unshuffled, batched ``tf.data.Dataset`` for inference.

    Args:
        prompt_list: sequence of ``(conversation_a, conversation_b)`` string pairs.
        batch_size: number of examples per batch.

    Returns:
        A batched, prefetched dataset of ``((text_a, text_b), 0)`` examples;
        the label is a constant placeholder so the element structure matches
        the training dataset.
    """
    first_texts = []
    second_texts = []
    for pair in prompt_list:
        first_texts.append(pair[0])
        second_texts.append(pair[1])

    placeholder_labels = [0] * len(prompt_list)
    features = (first_texts, second_texts)
    batched = tf.data.Dataset.from_tensor_slices((features, placeholder_labels)).batch(batch_size)
    return batched.prefetch(tf.data.AUTOTUNE)

In [None]:
# Batched inference dataset over the test conversation pairs (default batch size).
test_ds = get_test_dataset(prompt_list=test_prompt_list)

In [None]:
# Ensemble: collect each model's predictions, then average them element-wise across models.
per_model_probs = [member.predict(test_ds, verbose=0) for member in models]
result = np.mean(per_model_probs, axis=0)

In [None]:
# Fill the sample submission with the averaged probabilities and write it out.
submission = pd.read_csv("/kaggle/input/llm-classification-finetuning/sample_submission.csv")

# Column i of `result` is class id i: 0 = tie, 1 = model A wins, 2 = model B wins.
class_columns = ("winner_tie", "winner_model_a", "winner_model_b")
for class_id, column in enumerate(class_columns):
    submission[column] = result[:, class_id]

submission.to_csv("submission.csv", index=False)
submission.head()