In [None]:
# Install/upgrade `peft` without network access: `--no-index` disables PyPI and
# `--find-links` points pip at the local directory that holds the package files.
!pip install -q -U peft --no-index --find-links ../input/llm-detect-pip/

In [None]:
import torch
import sklearn
import numpy as np
import pandas as pd
import time

from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from torch.cuda.amp import autocast
from threading import Thread

# Turn off the flash and memory-efficient scaled-dot-product-attention kernels.
# NOTE(review): presumably needed because the target GPU does not support them — confirm.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

# Only warn here; later cells that touch CUDA will fail on a CPU-only machine.
if not torch.cuda.is_available():
    print("Sorry - GPU required!")

In [None]:
# Token length every example is padded/truncated to by the tokenizer call below.
MAX_LENGTH = 1024
# Default number of rows per forward pass in inference().
BATCH_SIZE = 2
# Device used for the tokenized tensors: the first CUDA GPU.
DEVICE = torch.device("cuda:0")  

In [None]:
# Load the competition test set and the submission template from the Kaggle input directory.
test = pd.read_csv('/kaggle/input/c/llm-classification-finetuning/test.csv')
sample_sub = pd.read_csv('/kaggle/input/c/llm-classification-finetuning/sample_submission.csv')

# Flatten a list-encoded string such as '["a","b"]' into a single space-joined string
def process(input_str):
    """Flatten a list-encoded string into one space-separated string.

    Leading/trailing square brackets are stripped, the remainder is split on
    the literal separator ``","``, double quotes are stripped from both ends
    of every piece, and the pieces are joined with a single space.

    The input is not parsed as JSON: escape sequences inside the pieces are
    kept verbatim, and unquoted items (e.g. ``null``) are passed through as text.

    Parameters
    ----------
    input_str : str
        Text such as ``'["first turn","second turn"]'``.

    Returns
    -------
    str
        The pieces joined by spaces, e.g. ``'first turn second turn'``.
    """
    inner = input_str.strip('[]')
    pieces = []
    for chunk in inner.split('","'):
        pieces.append(chunk.strip('"'))
    return ' '.join(pieces)

# Replace each list-encoded text column with its flattened string form.
for column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, column] = test[column].apply(process)

# Show the submission template and a preview of the cleaned test rows.
display(sample_sub)
display(test.head(5))

In [None]:
# Assemble the single text the classifier sees for each row: the prompt,
# then model A's response, a dashed separator, and model B's response.
test['text'] = (
    'User prompt: ' + test['prompt']
    + '\n\nModel A :\n' + test['response_a']
    + '\n\n--------\n\nModel B:\n' + test['response_b']
)
# Print the first assembled example as a sanity check.
print(test['text'][0])

In [None]:
# Load the tokenizer from the local model directory only (no hub download).
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/llama3.2-3b-instruct-hf/transformers/default/1', local_files_only=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right, so real tokens come first in every padded sequence.
tokenizer.padding_side = 'right'
# NOTE(review): intended to append EOS to each encoded sequence; confirm the
# loaded tokenizer class honours this attribute when set after construction.
tokenizer.add_eos_token = True
# Disabled: would save the configured tokenizer to ./tokenizer.
#tokenizer.save_pretrained('tokenizer')

In [None]:
%%time

# Tokenize every assembled text to exactly MAX_LENGTH tokens (padded or truncated).
tokens = tokenizer(
    test['text'].tolist(),
    padding='max_length',
    max_length=MAX_LENGTH,
    truncation=True,
    return_tensors='pt',
)

# Keep the token tensors on the CPU: they are only converted to Python lists
# here, and inference() moves one batch at a time to the GPU. Copying the whole
# test set to the GPU and straight back would only hold device memory for
# tensors that are never used there.
INPUT_IDS = tokens['input_ids'].to(dtype=torch.int32)
ATTENTION_MASKS = tokens['attention_mask'].to(dtype=torch.int32)

# One list of token ids / mask values per row.
input_ids_cpu = INPUT_IDS.tolist()
attention_masks_cpu = ATTENTION_MASKS.tolist()

# Frame consumed by inference(): one row per test example.
data = pd.DataFrame({
    'INPUT_IDS': input_ids_cpu,
    'ATTENTION_MASKS': attention_masks_cpu,
})
data[:2]

In [None]:
# Load the base Llama checkpoint with a 3-way sequence-classification head
# (one output per submission target: model A wins, model B wins, tie).
base_model = LlamaForSequenceClassification.from_pretrained(
    '/kaggle/input/llama3.2-3b-instruct-hf/transformers/default/1',
    num_labels=3,
    # Disabled alternative: load the weights in full precision instead.
    #torch_dtype=torch.float32
    torch_dtype=torch.bfloat16,
    device_map='cuda:0'
)
# Tell the model which token id is padding (the tokenizer reuses EOS for it).
# NOTE(review): the classification head presumably uses this to locate the last
# non-padding token of each sequence — confirm against the transformers docs.
base_model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# LoRA adapter configuration.
# NOTE(review): these values presumably have to match the configuration used
# when the checkpoint loaded below was trained — confirm before changing any.
peft_config = LoraConfig(
    r=4,  # rank of the low-rank update matrices
    lora_alpha=16,  # LoRA scaling factor
    lora_dropout=0.05,  # dropout probability on the LoRA branch
    bias='none',  # no bias parameters are adapted
    inference_mode=True,  # build the adapters for inference rather than training
    task_type=TaskType.SEQ_CLS,  # sequence-classification task
    target_modules=['o_proj', 'v_proj'])  # module names that receive adapters

In [None]:
# Wrap the base classifier with freshly initialised LoRA adapters, then
# overwrite them with the fine-tuned checkpoint.
device = DEVICE  # reuse the notebook-wide device constant instead of a second literal
model_0 = get_peft_model(base_model, peft_config).to(device)

# map_location keeps the load independent of the device the checkpoint was saved from.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (or pass weights_only=True if the file holds tensors only).
state_dict = torch.load(
    '/kaggle/input/llama_3b__v2/transformers/default/1/model_llama_3_cp_1_v1.pth',
    map_location=device,
)

# strict=False tolerates a checkpoint that holds only part of the model, but it
# also hides key-name mismatches, so inspect the report instead of discarding it.
load_result = model_0.load_state_dict(state_dict, strict=False)
unexpected = list(load_result.unexpected_keys)
if state_dict and len(unexpected) == len(state_dict):
    # Nothing in the checkpoint matched: predictions would come from untrained weights.
    raise RuntimeError(
        "No checkpoint key matched the model; the fine-tuned weights were not loaded. "
        f"Example checkpoint keys: {unexpected[:5]}"
    )
if unexpected:
    print(f"WARNING: {len(unexpected)} checkpoint keys were ignored, e.g. {unexpected[:5]}")
print(
    f"Checkpoint loaded: {len(state_dict) - len(unexpected)} tensors applied, "
    f"{len(load_result.missing_keys)} model keys not present in the checkpoint"
)
model_0.eval()

In [None]:
# Print the PEFT model's trainable-parameter summary as a sanity check on the adapter setup.
model_0.print_trainable_parameters()

In [None]:
# Force a garbage-collection pass before inference to release unreachable
# Python objects; the returned count is shown as the cell output.
import gc
gc.collect()

In [None]:
def inference(df, model, device, batch_size=BATCH_SIZE):
    """Run the classifier over every row of ``df`` and attach class probabilities.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``INPUT_IDS`` and ``ATTENTION_MASKS`` columns whose cells are
        lists of ints, all of the same length.
    model : callable
        Called as ``model(input_ids=..., attention_mask=...)``; the result must
        expose ``.logits`` with at least three columns per row. Put into eval mode here.
    device : torch.device or str
        Device each batch is moved to before the forward pass.
    batch_size : int
        Number of rows per forward pass.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with three new float32
        columns: ``winner_model_a``, ``winner_model_b`` and ``winner_tie``
        (softmax over logit columns 0, 1 and 2).
    """
    input_ids = torch.tensor(df['INPUT_IDS'].values.tolist(), dtype=torch.long)
    attention_mask = torch.tensor(df['ATTENTION_MASKS'].values.tolist(), dtype=torch.long)

    probs_a = []
    probs_b = []
    probs_tie = []

    model.eval()

    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch_input_ids = input_ids[start_idx:end_idx].to(device)
        batch_attention_mask = attention_mask[start_idx:end_idx].to(device)

        # torch.autocast(device_type='cuda') is the supported spelling of the
        # deprecated torch.cuda.amp.autocast(); the default dtype is unchanged.
        # NOTE(review): CUDA autocast defaults to float16 while the base model
        # is loaded in bfloat16 elsewhere in this notebook — confirm this
        # matches the training setup.
        with torch.no_grad(), torch.autocast(device_type='cuda'):
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask
            )

        # Upcast before softmax: half-precision logits would give coarse
        # probabilities, and bfloat16 tensors cannot be converted to NumPy.
        probabilities = torch.softmax(outputs.logits.float(), dim=-1).cpu().numpy()

        probs_a.extend(probabilities[:, 0])
        probs_b.extend(probabilities[:, 1])
        probs_tie.extend(probabilities[:, 2])

    # Results are written onto the caller's frame, which is also returned.
    df['winner_model_a'] = probs_a
    df['winner_model_b'] = probs_b
    df['winner_tie'] = probs_tie

    # Hand cached GPU blocks back to the driver once all batches are done.
    torch.cuda.empty_cache()

    return df

In [None]:
# Run inference on the entire dataset with the single model and report the
# elapsed time. `time` is already imported in the notebook's import cell, and
# perf_counter() is a monotonic clock, so the measurement cannot be skewed by
# system clock adjustments.
st = time.perf_counter()

data = inference(data, model_0, device)

print(f"Processing complete. Total time: {time.perf_counter() - st}")

In [None]:
# Probability columns expected in the submission file.
TARGETS = ['winner_model_a', 'winner_model_b', 'winner_tie']

# The assignment below aligns rows by index, so a length mismatch would
# silently leave NaN in the submission; fail loudly instead.
assert len(sample_sub) == len(data), (
    f"sample_sub has {len(sample_sub)} rows but predictions have {len(data)}"
)

# Copy the predicted probabilities into the submission template.
sample_sub[TARGETS] = data[TARGETS]
display(sample_sub)

In [None]:
# Write the submission file to the working directory, omitting the DataFrame index.
sample_sub.to_csv('submission.csv', index=False)