In [None]:

import kagglehub
# Fetch the competition files through kagglehub and keep the value it returns.
# NOTE(review): the cells visible below read from hard-coded /kaggle/input/... paths
# rather than from this variable.
llm_classification_finetuning_path = kagglehub.competition_download('llm-classification-finetuning')
print('Data source import complete.')

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
from datasets import Dataset
import torch  # base
import torch.nn.functional as F  
import json
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding

## Downloading the model

In [None]:

# Fetch the model artifact through kagglehub; the returned value is later handed to
# `AutoModelForSequenceClassification.from_pretrained`.
BERT = kagglehub.model_download("lucasmoraes001/bertlarge_competition/pyTorch/335m")

## Import Kaggle Data

In [None]:
# Read the competition's sample submission CSV.
df_submission_sample = pd.read_csv('/kaggle/input/llm-classification-finetuning/sample_submission.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the sample submission.
df_submission_sample.info()

In [None]:
# Read the test set; its 'id' column is reused when the submission frame is built.
df_test_raw = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

In [None]:
# Display the raw test frame as loaded from the CSV.
df_test_raw

## Tokenize


In [None]:

tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/bertlarge_competition/pytorch/335m/1")
# Register a padding token only when the checkpoint does not already define one.
# Overwriting an existing pad token (or first assigning `eos_token`, which may be
# unset) can introduce a token id the model has no embedding for.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Collator built on the tokenizer above.
# NOTE(review): not referenced by the other cells visible here.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
def preprocess_function(batch, column):
    """Tokenize one text column of a batch.

    Args:
        batch: Mapping from column name to that column's values.
        column: Key in ``batch`` whose values are tokenized.

    Returns:
        The result of calling the module-level ``tokenizer`` on those values
        with truncation and padding enabled.
    """
    texts = batch[column]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded


In [None]:
# Work on a copy so df_test_raw (and its 'id' column, used for the submission) stays untouched.
df_test = df_test_raw.copy()

In [None]:
def _first_turn_text(raw):
    """Return the cleaned first turn of a JSON-encoded conversation field.

    Args:
        raw: Cell value from the test frame, expected to be a JSON string
            (a JSON list is reduced to its first element).

    Returns:
        The first turn with leading/trailing square brackets stripped and all
        single and double quote characters removed. An empty string is
        returned when the cell is not a string, the decoded list is empty, or
        the first turn is not text (e.g. JSON ``null``), so the tokenizer
        never receives a missing value.

    Raises:
        json.JSONDecodeError: If ``raw`` is a string that is not valid JSON.
    """
    if not isinstance(raw, str):
        return ""
    decoded = json.loads(raw)
    if isinstance(decoded, list):
        first = decoded[0] if decoded else None
    else:
        first = decoded
    if not isinstance(first, str):
        return ""
    return first.strip('[]').replace("'", "").replace('"', '')


for col in ['prompt', 'response_a', 'response_b']:
    df_test[col] = df_test[col].apply(_first_turn_text)

# 'id' is not a model input; the submission cell takes it from df_test_raw instead.
df_test = df_test.drop(columns='id')


In [None]:
# Convert the pandas frame with `Dataset.from_pandas`; from here on `df_test` is a
# `datasets.Dataset`, not a DataFrame.
df_test = Dataset.from_pandas(df_test)

In [None]:
# Preview a handful of parsed prompts instead of printing the whole column.
df_test['prompt'][:5]

In [None]:
# Tokenize each text column in turn with a batched `Dataset.map`.
# NOTE(review): every pass produces the same tokenizer output columns (the prediction
# cell reads a single `input_ids` / `attention_mask` pair), so it looks like each pass
# replaces the previous one and only the last column's tokens survive — confirm this
# matches how the model was trained.
for text_column in ('prompt', 'response_a', 'response_b'):
    df_test = df_test.map(
        preprocess_function,
        batched=True,
        fn_kwargs={'column': text_column},
    )

In [None]:
# Display the dataset after the tokenization passes.
df_test

In [None]:
# Drop the raw text columns now that they have been tokenized.
df_test = df_test.remove_columns(['prompt', 'response_a', 'response_b'])

In [None]:
# Display the dataset after the text columns were removed.
df_test

## Predicting

In [None]:

# Load the sequence-classification model from the kagglehub download stored in `BERT`
# and move it to the GPU. Assumes a CUDA device is available.
model = AutoModelForSequenceClassification.from_pretrained(BERT)
model.to(device='cuda')

In [None]:
# Request torch-formatted output with device='cuda' when reading from df_test.
df_test = df_test.with_format("torch", device='cuda')

In [None]:
# Rows sent through the model per forward pass. Running the whole test set in a
# single call does not fit in GPU memory at realistic test-set sizes.
PREDICT_BATCH_SIZE = 16


def _as_batch_tensor(rows, pad_value):
    """Return one 2-D tensor for a mini-batch of token rows.

    Args:
        rows: Either an already stacked tensor, or a sequence of 1-D tensors
            whose lengths may differ (tokenization pads per `Dataset.map`
            batch, so a mini-batch spanning two map batches can be ragged).
        pad_value: Value used to right-pad shorter rows.

    Returns:
        A tensor of shape (len(rows), longest_row_length).
    """
    if isinstance(rows, torch.Tensor):
        return rows
    return torch.nn.utils.rnn.pad_sequence(
        list(rows), batch_first=True, padding_value=pad_value
    )


model.eval()

all_preds = []
with torch.no_grad():
    for start in range(0, len(df_test), PREDICT_BATCH_SIZE):
        batch = df_test[start:start + PREDICT_BATCH_SIZE]
        outputs = model(
            input_ids=_as_batch_tensor(batch["input_ids"], tokenizer.pad_token_id or 0),
            attention_mask=_as_batch_tensor(batch["attention_mask"], 0),
        )
        # Class probabilities per row; moved to the CPU so results do not
        # accumulate in GPU memory.
        probs = torch.softmax(outputs['logits'], dim=1)
        all_preds.append(probs.cpu())

all_preds = torch.cat(all_preds).numpy()

TARGETS = ['winner_model_a', 'winner_model_b', 'winner_tie']

df_submission = pd.DataFrame(all_preds, columns=TARGETS)

# Row order is unchanged since loading, so predictions line up with the ids by index.
assert len(df_submission) == len(df_test_raw), "prediction count does not match test rows"
df_submission = pd.concat([df_test_raw['id'], df_submission], axis=1)

df_submission[TARGETS] = df_submission[TARGETS].astype(np.float64)

df_submission

In [None]:
# Print the column names, dtypes and non-null counts of the submission frame.
df_submission.info()

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
df_submission.to_csv("submission.csv", index=False)
