In [None]:
!pip install --no-deps /kaggle/input/pip-inference/bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl
!pip install --no-deps /kaggle/input/pip-inference/transformers-4.45.1-py3-none-any.whl
!pip install --no-deps /kaggle/input/pip-inference/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -q -U peft --no-index --find-links ../input/llm-detect-pip/

In [None]:
!pip show bitsandbytes
!pip show transformers
!pip show tokenizers
!pip show peft

In [None]:
import torch
import sklearn
import numpy as np
import pandas as pd
import time
import psutil
import os

from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from torch.cuda.amp import autocast
from threading import Thread

# Switch off the flash and memory-efficient scaled-dot-product-attention backends.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

# Only a notice is printed when CUDA is missing; execution is not stopped here.
if not torch.cuda.is_available():
    print("Sorry - GPU required!")

In [None]:
def get_gpu_memory_usage(device_id=0, stage=""):
    """Print and return the CUDA memory counters of one device.

    Args:
        device_id: Index of the CUDA device to inspect.
        stage: Free-form label included in the printed line.

    Returns:
        ``(allocated_mb, reserved_mb)`` in units of 1024*1024 bytes, or
        ``(None, None)`` when CUDA is not available.
    """
    if not torch.cuda.is_available():
        print(f"CUDA not available ({stage}). Cannot print GPU memory usage.")
        return None, None

    bytes_per_mb = 1024 * 1024
    # Make sure all pending CUDA operations on this device have finished
    # before the counters are read.
    torch.cuda.synchronize(device=device_id)
    allocated_mb = torch.cuda.memory_allocated(device=device_id) / bytes_per_mb
    reserved_mb = torch.cuda.memory_reserved(device=device_id) / bytes_per_mb
    total_mb = torch.cuda.get_device_properties(device=device_id).total_memory / bytes_per_mb
    print(f"GPU {device_id} Memory ({stage}): Allocated={allocated_mb:.2f} MB, Reserved={reserved_mb:.2f} MB, Total={total_mb:.2f} MB")
    return allocated_mb, reserved_mb

def get_system_memory_usage(stage=""):
    """Print and return the RAM usage of the current process.

    Args:
        stage: Free-form label included in the printed line.

    Returns:
        ``(rss_mb, vms_mb)``: resident set size and virtual memory size,
        both in units of 1024*1024 bytes.
    """
    bytes_per_mb = 1024 * 1024
    mem_info = psutil.Process(os.getpid()).memory_info()
    rss_mb = mem_info.rss / bytes_per_mb  # resident set size
    vms_mb = mem_info.vms / bytes_per_mb  # virtual memory size
    print(f"System RAM Usage ({stage}): RSS={rss_mb:.2f} MB, VMS={vms_mb:.2f} MB")
    return rss_mb, vms_mb

In [None]:
# Baseline memory snapshot, taken before the data and models are loaded below.
get_system_memory_usage("Initial - Before everything")
get_gpu_memory_usage(device_id=0, stage="Initial - GPU 0")
# Only probe a second GPU when more than one CUDA device is visible.
if torch.cuda.device_count() > 1:
    get_gpu_memory_usage(device_id=1, stage="Initial - GPU 1")

In [None]:
# Base model directory passed to LlamaForSequenceClassification.from_pretrained.
MODEL_NAME = '/kaggle/input/llama-3.2/transformers/1b-instruct/1'
# Checkpoint file read with torch.load and applied to the PEFT-wrapped models.
WEIGHTS_PATH = '/kaggle/input/3-1-training-3-2-1b-instruct-lora-4/llama_3_finetuned_model.pth'
# Every tokenized example is padded/truncated to this many tokens.
MAX_LENGTH = 1024
# Default number of rows per forward pass in inference().
BATCH_SIZE = 8
# Generic CUDA device handle (no explicit device index).
DEVICE = torch.device("cuda")    

# Prepare Data 

In [None]:
# Competition inputs: the rows to score and the submission template that is
# filled in and written out at the end of the notebook.
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
sample_sub = pd.read_csv('/kaggle/input/llm-classification-finetuning/sample_submission.csv')

# concatenate strings in list
def process(input_str):
    """Flatten a string-encoded list of quoted items into one space-joined string.

    Leading/trailing ``[`` and ``]`` characters are stripped, the remainder is
    split on the literal ``","`` separator, surrounding double quotes are
    removed from each piece, and the pieces are joined with single spaces.
    This is plain string surgery, not JSON decoding: escape sequences inside
    the items are left untouched.
    NOTE(review): presumably this has to stay identical to the preprocessing
    used when the checkpoint was trained — confirm before changing it.
    """
    inner = input_str.strip('[]')
    pieces = inner.split('","')
    return ' '.join(piece.strip('"') for piece in pieces)

# Flatten each list-encoded text column in place with process().
for text_column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, text_column] = test[text_column].apply(process)

# Preview the submission template and the first processed rows.
display(sample_sub)
display(test.head(5))

In [None]:
# Assemble the single text field that is handed to the tokenizer.
# NOTE(review): the template (including the "Model A :" spacing) presumably
# mirrors the one used at fine-tuning time — confirm before changing it.
test['text'] = (
    'User prompt: ' + test['prompt']
    + '\n\nModel A :\n' + test['response_a']
    + '\n\n--------\n\nModel B:\n' + test['response_b']
)
print(test['text'].loc[0])

# Tokenize

In [None]:
%%time

# Load the tokenizer from its local dataset directory.
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/lmsys-model/tokenizer')

# Tokenize every row up front; each sequence is padded/truncated to MAX_LENGTH.
tokens = tokenizer(test['text'].tolist(), padding='max_length',
                   max_length=MAX_LENGTH, truncation=True, return_tensors='pt')

# Keep the token tensors on the CPU. They were previously copied to the GPU
# and straight back again just to build Python lists, which left two unused
# tensors occupying GPU memory; inference() moves each batch to its own
# device when it is needed.
INPUT_IDS = tokens['input_ids'].to(dtype=torch.int32)
ATTENTION_MASKS = tokens['attention_mask'].to(dtype=torch.int32)

# One Python list of ints per row.
input_ids_cpu = INPUT_IDS.tolist()
attention_masks_cpu = ATTENTION_MASKS.tolist()

data = pd.DataFrame({
    'INPUT_IDS': input_ids_cpu,
    'ATTENTION_MASKS': attention_masks_cpu,
})
# A bare `data[:2]` in the middle of a cell is never rendered, so show it explicitly.
display(data[:2])
get_system_memory_usage("After tokenizing data")

# Load model 

In [None]:
# BitsAndBytes configuration: load the base model weights in 8-bit.
# BitsAndBytesConfig has no `bnb_8bit_*` options (compute-dtype and
# double-quantization settings exist only for 4-bit, as `bnb_4bit_*`).
# Unknown keyword arguments are ignored, so the former
# `bnb_8bit_compute_dtype` / `bnb_8bit_use_double_quant` entries had no
# effect and are dropped to keep the configuration honest.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

def _load_base_model(gpu_index):
    """Load the quantized sequence-classification base model onto one GPU.

    System and GPU memory usage are printed before and after loading, and the
    model's pad token id is set from the tokenizer.

    Args:
        gpu_index: CUDA device index the whole model is placed on.

    Returns:
        The loaded ``LlamaForSequenceClassification`` instance.
    """
    tag = f"base_model_{gpu_index}"
    get_system_memory_usage(f"Before loading {tag}")
    get_gpu_memory_usage(device_id=gpu_index, stage=f"Before loading {tag}")

    model = LlamaForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map=f'cuda:{gpu_index}')
    model.config.pad_token_id = tokenizer.pad_token_id

    get_system_memory_usage(f"After loading {tag}")
    get_gpu_memory_usage(device_id=gpu_index, stage=f"After loading {tag}")
    return model


# One independent copy of the base model per GPU, so each inference thread
# gets its own device.
device0 = torch.device('cuda:0')
base_model_0 = _load_base_model(0)

device1 = torch.device('cuda:1')
base_model_1 = _load_base_model(1)

# Load weights 

In [None]:
# LoRA adapter settings used to rebuild the PEFT wrapper for inference.
# NOTE(review): r / lora_alpha / target_modules presumably have to match the
# values used when the checkpoint was trained — confirm before editing.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=True,
    target_modules=['o_proj', 'v_proj'],
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    bias='none',
)

In [None]:
def _build_finetuned_model(base_model, device, state_dict, gpu_index):
    """Wrap a base model with the LoRA config, load the checkpoint, set eval mode.

    Args:
        base_model: Model returned by ``from_pretrained`` for this GPU.
        device: Device the PEFT-wrapped model is moved to.
        state_dict: Checkpoint tensors to load (non-strict).
        gpu_index: CUDA index used for the memory report and messages.

    Returns:
        The PEFT-wrapped model in eval mode.
    """
    peft_model = get_peft_model(base_model, peft_config).to(device)
    # strict=False tolerates keys that are absent on either side. Checkpoint
    # entries that match nothing in the model usually mean a naming mismatch,
    # so report them instead of discarding the result silently.
    load_report = peft_model.load_state_dict(state_dict, strict=False)
    if load_report.unexpected_keys:
        print(f"Warning: {len(load_report.unexpected_keys)} checkpoint keys were not used by model_{gpu_index}, "
              f"e.g. {load_report.unexpected_keys[:3]}")
    peft_model.eval()
    get_gpu_memory_usage(device_id=gpu_index, stage=f"After PEFT and loading weights for model_{gpu_index}")
    return peft_model


# Read the checkpoint once, onto the CPU, and reuse it for both models.
# load_state_dict copies each tensor to the device of the parameter it fills,
# so no temporary copy of the checkpoint is materialized on a GPU.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
finetuned_state_dict = torch.load(WEIGHTS_PATH, map_location='cpu')

model_0 = _build_finetuned_model(base_model_0, device0, finetuned_state_dict, 0)
model_1 = _build_finetuned_model(base_model_1, device1, finetuned_state_dict, 1)

# The checkpoint tensors are no longer needed once both models are populated.
del finetuned_state_dict

In [None]:
# Trainable Parameters
# Called as separate statements: joining the two calls in a tuple made the
# cell echo a meaningless `(None, None)` after the printed summaries.
model_0.print_trainable_parameters()
model_1.print_trainable_parameters()

# Inference

In [None]:
import gc

# Drop unreachable Python objects and return cached CUDA blocks to the driver
# so the readings below reflect what the loaded models actually hold.
gc.collect()
torch.cuda.empty_cache()
get_system_memory_usage("Before inference loop")
get_gpu_memory_usage(device_id=0, stage="Before inference - GPU 0")
# A torch.device object is always truthy, so `if device1:` never guarded
# anything; check for a second visible GPU explicitly, as the initial
# snapshot cell does.
if torch.cuda.device_count() > 1:
    get_gpu_memory_usage(device_id=1, stage="Before inference - GPU 1")

In [None]:
def inference(df, model, device, batch_size=BATCH_SIZE):
    """Score every row of ``df`` with ``model`` and attach class probabilities.

    Args:
        df: DataFrame with ``INPUT_IDS`` and ``ATTENTION_MASKS`` columns, each
            cell holding one list of ints. It is modified in place.
        model: Classifier whose forward output exposes ``.logits`` with three
            classes on the last axis.
        device: Device each batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass.

    Returns:
        The same ``df`` with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` columns holding the softmax probabilities of classes
        0, 1 and 2 respectively.
    """
    input_ids = torch.tensor(df['INPUT_IDS'].values.tolist(), dtype=torch.long)
    attention_mask = torch.tensor(df['ATTENTION_MASKS'].values.tolist(), dtype=torch.long)

    model.eval()
    batch_probabilities = []

    for start_idx in range(0, len(df), batch_size):
        end_idx = start_idx + batch_size  # slicing clamps at the end of the tensor
        batch_input_ids = input_ids[start_idx:end_idx].to(device)
        batch_attention_mask = attention_mask[start_idx:end_idx].to(device)

        with torch.no_grad(), autocast():
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask
            )

        # Upcast before the softmax: it runs outside the autocast context, so
        # half-precision logits would otherwise yield float16 probabilities.
        # `.float()` is a no-op when the logits are already float32.
        probabilities = torch.softmax(outputs.logits.float(), dim=-1)
        batch_probabilities.append(probabilities.cpu().numpy())

    if batch_probabilities:
        all_probabilities = np.concatenate(batch_probabilities, axis=0)
    else:
        # Empty input: keep the three output columns, with zero rows.
        all_probabilities = np.empty((0, 3), dtype=np.float32)

    df['winner_model_a'] = all_probabilities[:, 0]
    df['winner_model_b'] = all_probabilities[:, 1]
    df['winner_tie'] = all_probabilities[:, 2]

    torch.cuda.empty_cache()

    return df

In [None]:
# Wall-clock start for the timing line printed once inference has finished.
st = time.time()

N_SAMPLES = len(data)

# Cut the rows into two contiguous shards, one per worker thread; each shard
# is copied so that adding result columns does not touch `data`.
half = round(N_SAMPLES / 2)
sub1 = data.iloc[:half].copy()
sub2 = data.iloc[half:].copy()

# Function to run inference in a thread
def run_inference(df, model, device, results, index):
    """Thread target: run inference() on one shard and store the result.

    Args:
        df: Shard of tokenized rows to score.
        model: Model used for this shard.
        device: Device the model lives on.
        results: Shared mapping that receives the scored DataFrame.
        index: Key under which the result is stored in ``results``.
    """
    scored_shard = inference(df, model, device)
    results[index] = scored_shard

# Dictionary to store results from threads
results = {}

# start threads: one worker per GPU, each scoring its own shard
t0 = Thread(target=run_inference, args=(sub1, model_0, device0, results, 0))
t1 = Thread(target=run_inference, args=(sub2, model_1, device1, results, 1))

t0.start()
t1.start()

# Wait for all threads to finish
t0.join()
t1.join()

# An exception raised inside a worker only prints a traceback from that
# thread and leaves its slot in `results` empty. Fail here with an explicit
# message instead of an opaque KeyError in the concat below.
failed_workers = [worker_idx for worker_idx in (0, 1) if worker_idx not in results]
if failed_workers:
    raise RuntimeError(
        f"Inference did not finish on worker(s) {failed_workers}; see the thread traceback above.")

# Combine results back into the original DataFrame (shard order is preserved)
data = pd.concat([results[0], results[1]], axis=0)

print(f"Processing complete. Total time: {time.time() - st}")
get_system_memory_usage("After inference loop")
get_gpu_memory_usage(device_id=0, stage="After inference - GPU 0")
# A torch.device object is always truthy, so `if device1:` never guarded
# anything; check for a second visible GPU explicitly.
if torch.cuda.device_count() > 1:
    get_gpu_memory_usage(device_id=1, stage="After inference - GPU 1")

In [None]:
# Probability columns copied from the inference output into the submission.
TARGETS = ['winner_model_a', 'winner_model_b', 'winner_tie']

# Assigning a DataFrame to columns matches rows by index label, so this
# relies on `data` and `sample_sub` sharing the same row index.
# NOTE(review): it also assumes sample_submission.csv lists ids in the same
# order as test.csv — confirm.
sample_sub[TARGETS] = data[TARGETS]
display(sample_sub)

In [None]:
# Write the filled-in template to submission.csv, omitting the DataFrame index.
sample_sub.to_csv('submission.csv', index=False)