# Inspecting Files

In [None]:
import os
# List the contents of the competition input directory.
print("Available files:", os.listdir("/kaggle/input/llm-classification-finetuning"))

# Install

In [None]:
from importlib import metadata

# Report the installed version of each library this notebook relies on.
packages = ['transformers', 'torch', 'tqdm', 'scikit-learn']
for package in packages:
    try:
        version = metadata.version(package)
        print(f"{package}: {version}")
    except metadata.PackageNotFoundError:
        # Only a missing distribution is expected here; any other error
        # should surface instead of being reported as "not installed".
        print(f"{package} not installed")

In [None]:
# !pip install transformers

# !pip install torch

# !pip install tqdm

# !pip install scikit-learn

In [None]:
import torch
# Report whether CUDA is available and, if so, the device name and GPU count.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name()}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")

In [None]:
import pandas as pd

import numpy as np

from transformers import AutoTokenizer, AutoModel

import torch

from torch import nn

from tqdm.notebook import tqdm

import torch.nn.functional as F

from sklearn.metrics import log_loss


# Reproducibility / precision flags (these do not reduce memory use):
# turn off cuDNN benchmarking and TF32 matmuls, and request deterministic
# cuDNN kernels.

torch.backends.cudnn.benchmark = False

torch.backends.cuda.matmul.allow_tf32 = False

torch.backends.cudnn.deterministic = True



# Verify CUDA is still available after imports

print(f"CUDA available: {torch.cuda.is_available()}")

# Environment Check

In [None]:
# Check GPU availability and memory

!nvidia-smi


import torch

print(f"CUDA available: {torch.cuda.is_available()}")

print(f"Current device: {torch.cuda.current_device()}")

print(f"Device name: {torch.cuda.get_device_name()}")


# Clear any existing cache

torch.cuda.empty_cache()

# Imports and Environment Check

In [None]:
# Step 1: Imports

import os

import pandas as pd

import numpy as np

from transformers import AutoTokenizer, AutoModel

import torch

from torch import nn

from sklearn.model_selection import train_test_split

from tqdm import tqdm

import torch.nn.functional as F

from sklearn.metrics import log_loss



# Clear GPU memory

torch.cuda.empty_cache()

In [None]:
# Step 2: Data Loading

# Build the paths of the competition CSVs, then read each into a DataFrame.
data_path1 = '/kaggle/input/llm-classification-finetuning'
file_name1, file_name2 = 'train.csv', 'test.csv'
file_path1, file_path2 = (
    os.path.join(data_path1, csv_name) for csv_name in (file_name1, file_name2)
)

df_train = pd.read_csv(file_path1)
df_test = pd.read_csv(file_path2)

# df_train
# df_test

# Package Update Fixes

In [None]:
# !pip install --upgrade transformers
# !pip install --upgrade ipywidgets

In [None]:
# !pip uninstall -y transformers tokenizers
# !pip install "transformers>=4.33.1"  # quoted so the shell does not treat >= as a redirect
# import transformers
# print(f"Transformers version: {transformers.__version__}")

In [None]:
# In a notebook with internet access
# save_path = '/kaggle/working/model_files'
# os.makedirs(save_path, exist_ok=True)

# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
# model = AutoModel.from_pretrained("microsoft/deberta-v3-small")

# tokenizer.save_pretrained(f'{save_path}/deberta-tokenizer')
# model.save_pretrained(f'{save_path}/deberta-model')

# Create dataset version in Kaggle

In [None]:
# download cache
# Add this in your development environment before going offline
# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
# model = AutoModel.from_pretrained("microsoft/deberta-v3-small")
# # Save them locally
# tokenizer.save_pretrained("./deberta-tokenizer")
# model.save_pretrained("./deberta-model")

In [None]:
# import os
# print(f"Does directory exist? {os.path.exists('/kaggle/working/model_files')}")
# print("\nDirectory contents:")
# !ls -la /kaggle/working/model_files

In [None]:
# save_path = '/kaggle/working/model_files'
# print(f"1. Creating directory at {save_path}")
# os.makedirs(save_path, exist_ok=True)

# print("\n2. Downloading tokenizer...")
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
# print("Tokenizer downloaded successfully")

# print("\n3. Downloading model...")
# model = AutoModel.from_pretrained("microsoft/deberta-v3-small")
# print("Model downloaded successfully")

# print("\n4. Saving tokenizer...")
# tokenizer.save_pretrained(f'{save_path}/deberta-tokenizer')
# print("Tokenizer saved")

# print("\n5. Saving model...")
# model.save_pretrained(f'{save_path}/deberta-model')
# print("Model saved")

# print("\n6. Final directory structure:")
# !ls -R /kaggle/working/model_files

In [None]:
# # Check file sizes
# !du -h /kaggle/working/model_files/*

# # Check contents of a config file
# !cat /kaggle/working/model_files/deberta-model/config.json

In [None]:
# from transformers import AutoTokenizer, AutoModel

# # Try loading from saved files
# test_tokenizer = AutoTokenizer.from_pretrained('/kaggle/working/model_files/deberta-tokenizer', local_files_only=True)
# test_model = AutoModel.from_pretrained('/kaggle/working/model_files/deberta-model', local_files_only=True)

# print("Successfully loaded saved files!")

# Data Processor

In [None]:
# Step 3: Data Processor
import kagglehub
from transformers import DebertaV2Tokenizer, DebertaV2Model, DebertaV2Config

class DataProcessor:
    """Clean and tokenize prompt / response texts for the preference model.

    Args:
        max_length: Every text is padded or truncated to exactly this many
            tokens.
        model_path: Optional local directory holding the tokenizer files.
            When omitted, the directory is resolved through ``kagglehub``
            using ``DEFAULT_MODEL_HANDLE`` (the original behavior).
    """

    DEFAULT_MODEL_HANDLE = "ziyonaressker/deberta-v3-small/transformers/v1"

    def __init__(self, max_length=128, model_path=None):
        if model_path is None:
            # Get model path from kagglehub
            model_path = kagglehub.model_download(self.DEFAULT_MODEL_HANDLE)

        self.tokenizer = DebertaV2Tokenizer.from_pretrained(
            model_path,
            local_files_only=True,
        )
        self.max_length = max_length

    def clean_text(self, text):
        """Strip list/quote wrappers and backslash escapes from a raw string.

        Leading and trailing ``[``, ``]`` and ``"`` characters are removed,
        each literal backslash-``n`` pair becomes a single space, and every
        remaining backslash is dropped.
        """
        return text.strip('[]"').replace('\\n', ' ').replace('\\', '')

    def _tokenize(self, texts):
        """Clean ``texts`` and tokenize them into fixed-length ``pt`` tensors."""
        cleaned = [self.clean_text(str(t)) for t in texts]
        return self.tokenizer(
            cleaned,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

    def prepare_features(self, prompts, responses_a, responses_b):
        """Tokenize the three text columns independently.

        Each element is converted with ``str()`` before cleaning, so
        non-string values are accepted.

        Returns:
            dict with keys ``prompt_ids``, ``prompt_mask``, ``resp_a_ids``,
            ``resp_a_mask``, ``resp_b_ids``, ``resp_b_mask`` holding the
            tokenizer's ``input_ids`` / ``attention_mask`` for each column.
        """
        features = {}
        columns = (
            ('prompt', prompts),
            ('resp_a', responses_a),
            ('resp_b', responses_b),
        )
        for prefix, texts in columns:
            encoding = self._tokenize(texts)
            features[f'{prefix}_ids'] = encoding['input_ids']
            features[f'{prefix}_mask'] = encoding['attention_mask']
        return features

# Model Architecture

In [None]:
# Step 4: Model Architecture

class PreferenceClassifier(nn.Module):
    """DeBERTa-v2 encoder with an MLP head producing 3 preference logits.

    The prompt and both responses are encoded separately by one shared
    encoder. The hidden state at sequence position 0 of each is concatenated
    and passed to the classification head.

    Args:
        model_path: Optional local directory holding the encoder weights and
            config. When omitted, the directory is resolved through
            ``kagglehub`` using ``DEFAULT_MODEL_HANDLE`` (the original
            behavior).
    """

    DEFAULT_MODEL_HANDLE = "ziyonaressker/deberta-v3-small/transformers/v1"

    def __init__(self, model_path=None):
        super().__init__()
        if model_path is None:
            # Get model path from kagglehub
            model_path = kagglehub.model_download(self.DEFAULT_MODEL_HANDLE)

        # Load config and model; both are restricted to local files so the
        # two loads behave the same way.
        config = DebertaV2Config.from_pretrained(
            model_path,
            local_files_only=True,
        )
        self.encoder = DebertaV2Model.from_pretrained(
            model_path,
            config=config,
            local_files_only=True,
        )

        hidden_size = self.encoder.config.hidden_size

        # Head input is three concatenated encodings (prompt, response A,
        # response B); output is one logit per class.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 3, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(hidden_size, 3),
        )

    def _encode(self, input_ids, attention_mask):
        """Return the encoder's hidden state at sequence position 0."""
        encoded = self.encoder(input_ids, attention_mask=attention_mask)
        return encoded.last_hidden_state[:, 0, :]

    def forward(self, prompt_ids, prompt_mask, resp_a_ids, resp_a_mask, resp_b_ids, resp_b_mask):
        """Compute the 3 class logits for a batch.

        Each ``*_ids`` / ``*_mask`` pair is passed straight to the encoder as
        ``input_ids`` / ``attention_mask``.

        Returns:
            Tensor with one row per sample and 3 logit columns.
        """
        prompt_enc = self._encode(prompt_ids, prompt_mask)
        resp_a_enc = self._encode(resp_a_ids, resp_a_mask)
        resp_b_enc = self._encode(resp_b_ids, resp_b_mask)

        combined = torch.cat([prompt_enc, resp_a_enc, resp_b_enc], dim=1)
        return self.classifier(combined)

# Dataset and DataLoader classes

In [None]:
# Step 5: Dataset Class

class PreferenceDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized prompt / response features.

    ``features`` is a mapping that must contain the six keys listed in
    ``_FEATURE_KEYS``, each indexable by sample position. ``labels`` is
    optional; when given, each item also carries a ``'labels'`` entry.
    """

    # The only feature entries copied into each item, in output order.
    _FEATURE_KEYS = (
        'prompt_ids', 'prompt_mask',
        'resp_a_ids', 'resp_a_mask',
        'resp_b_ids', 'resp_b_mask',
    )

    def __init__(self, features, labels=None):
        self.features = features
        self.labels = labels

    def __len__(self):
        # The sample count is taken from the prompt ids.
        return len(self.features['prompt_ids'])

    def __getitem__(self, idx):
        sample = {key: self.features[key][idx] for key in self._FEATURE_KEYS}
        if self.labels is None:
            return sample
        sample['labels'] = self.labels[idx]
        return sample

# Training Functions and Evaluation

In [None]:
# Step 6: Training Functions

def create_data_loader(df, processor, batch_size=16, is_training=True, shuffle=None, chunk_size=1000):
    """Tokenize a DataFrame in chunks and wrap the result in a DataLoader.

    Args:
        df: Frame with ``prompt``, ``response_a`` and ``response_b`` columns.
            When ``is_training`` is true it must also contain
            ``winner_model_a``, ``winner_model_b`` and ``winner_tie``.
        processor: Object exposing
            ``prepare_features(prompts, responses_a, responses_b)``.
        batch_size: Batch size handed to the DataLoader.
        is_training: Whether label columns are read and attached.
        shuffle: Whether the loader shuffles. ``None`` (default) keeps the
            original behavior of shuffling exactly when ``is_training``.
        chunk_size: Number of rows tokenized per ``prepare_features`` call.

    Returns:
        ``torch.utils.data.DataLoader`` over a ``PreferenceDataset``.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        # Without this guard the failure is an opaque IndexError further down.
        raise ValueError("create_data_loader received an empty DataFrame")
    if shuffle is None:
        shuffle = is_training

    label_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
    feature_chunks = []
    label_chunks = []

    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        feature_chunks.append(
            processor.prepare_features(
                chunk['prompt'].tolist(),
                chunk['response_a'].tolist(),
                chunk['response_b'].tolist(),
            )
        )
        if is_training:
            label_chunks.append(
                torch.tensor(chunk[label_columns].values, dtype=torch.float32)
            )

    # Stitch the per-chunk tensors back together along the sample dimension.
    combined_features = {
        key: torch.cat([chunk_features[key] for chunk_features in feature_chunks])
        for key in feature_chunks[0].keys()
    }
    labels = torch.cat(label_chunks) if is_training else None
    dataset = PreferenceDataset(combined_features, labels)

    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0)



def compute_log_loss(y_true, y_pred):
    """Return the 3-class log loss for one-hot targets.

    ``y_true`` is reduced to class indices with an argmax over axis 1 and
    scored against ``y_pred`` with ``labels=[0, 1, 2]``.
    """
    class_ids = np.asarray(y_true).argmax(axis=1)
    return log_loss(class_ids, y_pred, labels=[0, 1, 2])



def train_epoch(model, train_loader, optimizer, criterion, device, scaler):
    """Run one mixed-precision training pass over ``train_loader``.

    Args:
        model: Module called as ``model(**batch)`` once ``'labels'`` has been
            removed from the batch.
        train_loader: Sized iterable of dict batches; each must contain a
            one-hot ``'labels'`` tensor.
        optimizer: Optimizer stepped through ``scaler``.
        criterion: Loss called as ``criterion(logits, class_indices)``.
        device: Target device for every tensor in the batch.
        scaler: Gradient scaler exposing ``scale``, ``unscale_``, ``step``
            and ``update``.

    Returns:
        ``(mean_loss, predictions, one_hot_labels)``, where the last two are
        NumPy arrays with one row per sample.
    """
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch.pop('labels')
        labels = torch.argmax(labels, dim=1)  # Convert one-hot to indices

        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda'):
            outputs = model(**batch)
            loss = criterion(outputs, labels)
            probs = F.softmax(outputs, dim=1)

        scaler.scale(loss).backward()

        # Gradients are still multiplied by the loss scale at this point.
        # Unscale them first so max_norm=1.0 applies to the true gradient
        # norm rather than the scaled one.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        all_preds.extend(probs.detach().cpu().numpy())
        all_labels.extend(F.one_hot(labels, num_classes=3).cpu().numpy())

        # Drop per-step tensors before asking the allocator to release its cache.
        del outputs, loss, probs
        torch.cuda.empty_cache()

    return total_loss / len(train_loader), np.array(all_preds), np.array(all_labels)



def evaluate(model, dataloader, device):
    """Score ``model`` on a labeled loader and return its log loss.

    Every batch must carry a one-hot ``'labels'`` entry. Softmax
    probabilities are collected with gradients disabled and handed to
    ``compute_log_loss`` together with the one-hot targets.
    """
    model.eval()
    pred_rows, label_rows = [], []

    with torch.no_grad():
        for raw_batch in dataloader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            # One-hot targets -> class indices; the remaining entries feed the model.
            target_idx = torch.argmax(inputs.pop('labels'), dim=1)

            with torch.amp.autocast(device_type='cuda'):
                probabilities = F.softmax(model(**inputs), dim=1)

            pred_rows.extend(probabilities.cpu().numpy())
            label_rows.extend(F.one_hot(target_idx, num_classes=3).cpu().numpy())

    return compute_log_loss(np.array(label_rows), np.array(pred_rows))



def train_model(model, train_loader, valid_loader, device, num_epochs=3):

   optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

   criterion = nn.CrossEntropyLoss()

   scaler = torch.amp.GradScaler()

   best_logloss = float('inf')

   checkpoint_path = 'model_checkpoint.pt'



   for epoch in range(num_epochs):

       train_loss, train_preds, train_labels = train_epoch(

           model, train_loader, optimizer, criterion, device, scaler

       )



       train_logloss = compute_log_loss(train_labels, train_preds)

       val_logloss = evaluate(model, valid_loader, device)



       print(f"Epoch {epoch+1}:")

       print(f"Train Loss={train_loss:.4f}, Train LogLoss={train_logloss:.4f}")

       print(f"Val LogLoss={val_logloss:.4f}")



       if val_logloss < best_logloss:

           best_logloss = val_logloss

           torch.save(model.state_dict(), 'best_model.pt')

           print("Best model saved!")



       if (epoch + 1) % 1 == 0:

           torch.save({

               'epoch': epoch,

               'model_state_dict': model.state_dict(),

               'optimizer_state_dict': optimizer.state_dict(),

               'best_logloss': best_logloss,

           }, checkpoint_path)



# def generate_submission(model, test_loader, device):

#    model.eval()

#    predictions = []



#    with torch.no_grad():

#        for batch in tqdm(test_loader):

#            batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}

#            with torch.amp.autocast(device_type='cuda'):

#                outputs = model(**batch)

#                probs = F.softmax(outputs, dim=1)

#            predictions.extend(probs.cpu().numpy())



   # predictions = np.array(predictions)

   # submission = pd.DataFrame({

   #     'id': df_test['id'],

   #     'winner_model_a': predictions[:, 0],

   #     'winner_model_b': predictions[:, 1],

   #     'winner_tie': predictions[:, 2]

   # })



   # return submission

def generate_submission(model, test_loader, device, test_df=None, output_path='submission.csv'):
    """Predict class probabilities for the test set and write the submission CSV.

    Args:
        model: Module called as ``model(**batch)``.
        test_loader: Iterable of dict batches in the same row order as
            ``test_df``; a ``'labels'`` entry, if present, is ignored.
        device: Target device for the batch tensors.
        test_df: Frame providing the ``id`` column. Defaults to the
            notebook-level ``df_test`` (the original behavior).
        output_path: Where the CSV is written.

    Returns:
        DataFrame with columns ``id``, ``winner_model_a``,
        ``winner_model_b``, ``winner_tie``.

    Raises:
        ValueError: If the number of predictions differs from ``len(test_df)``.
    """
    if test_df is None:
        # Fall back to the notebook-level test frame.
        test_df = df_test

    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            with torch.amp.autocast(device_type='cuda'):
                outputs = model(**batch)
                probs = F.softmax(outputs, dim=1)
            predictions.extend(probs.cpu().numpy())

    predictions = np.array(predictions)
    if len(predictions) != len(test_df):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(test_df)} test rows"
        )

    # One probability column per class, next to the row id.
    sub_df = test_df[["id"]].copy()
    class_names = ['winner_model_a', 'winner_model_b', 'winner_tie']
    sub_df[class_names] = predictions

    # Save with default pandas precision
    sub_df.to_csv(output_path, index=False)

    # Verify format
    print("\nSubmission Preview:")
    print(sub_df.head())
    print("\nColumn dtypes:", sub_df.dtypes)
    print("\nFile size:", os.path.getsize(output_path), "bytes")

    return sub_df


# Main Training and Prediction Loop

In [None]:
# Step 7: Main Execution

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

processor = DataProcessor()

# Hold out 10% of the training rows for validation (fixed seed for the split).
train_data, valid_data = train_test_split(df_train, test_size=0.1, random_state=42)

train_loader = create_data_loader(train_data, processor, batch_size=16)

valid_loader = create_data_loader(valid_data, processor, batch_size=16)

model = PreferenceClassifier()

model = model.to(device)

train_model(model, train_loader, valid_loader, device)

test_loader = create_data_loader(df_test, processor, batch_size=16, is_training=False)

# Prefer the best validation weights; if they cannot be loaded, keep the
# weights from the final epoch. `except Exception` keeps this best-effort
# without swallowing KeyboardInterrupt / SystemExit, and the reason is
# printed so a failure is visible.
try:

   model.load_state_dict(torch.load('best_model.pt', map_location=device))

except Exception as load_error:

   print("Error loading model. Continuing with current weights.")

   print(f"Reason: {load_error!r}")

submission = generate_submission(model, test_loader, device)

submission.to_csv('submission.csv', index=False)

# Save a Model

In [None]:
# from transformers import AutoTokenizer, AutoModel
# import os

# # Create directory in Kaggle working directory
# LOCAL_MODEL_DIR = '/kaggle/working/deberta-v3-small'
# os.makedirs(LOCAL_MODEL_DIR, exist_ok=True)

# # Download model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
# model = AutoModel.from_pretrained("microsoft/deberta-v3-small")

# # Save locally
# tokenizer.save_pretrained(LOCAL_MODEL_DIR)
# model.save_pretrained(LOCAL_MODEL_DIR)

In [None]:
# # Install kagglehub if not already installed
# !pip install --upgrade kagglehub==0.3.4

# import kagglehub
# kagglehub.login()



In [None]:
# # Set up directories and model info
# LOCAL_MODEL_DIR = '/kaggle/working/deberta-v3-small'
# MODEL_SLUG = 'deberta-v3-small'
# VARIATION_SLUG = 'v1'  # Using v1 to indicate first version

# # Create directory and download model if not already done
# os.makedirs(LOCAL_MODEL_DIR, exist_ok=True)
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
# model = AutoModel.from_pretrained("microsoft/deberta-v3-small")

# # Save locally
# tokenizer.save_pretrained(LOCAL_MODEL_DIR)
# model.save_pretrained(LOCAL_MODEL_DIR)

# # Upload model with version info
# kagglehub.model_upload(
#     handle=f"ziyonaressker/{MODEL_SLUG}/transformers/{VARIATION_SLUG}",
#     local_model_dir=LOCAL_MODEL_DIR,
#     version_notes='DeBERTa-v3-small model v1 for LLM Classification task - 2024-12-04'
# )

In [None]:
# #verification
# # Test downloading the model
# test_path = kagglehub.model_download(f"ziyonaressker/{MODEL_SLUG}/transformers/{VARIATION_SLUG}")
# print(f"Model downloaded to: {test_path}")

# # Verify files exist
# print("\nFiles in downloaded directory:")
# print(os.listdir(test_path))