In [None]:
# Install/upgrade the required libraries from the attached offline wheel directory:
# --no-index disables PyPI lookups and --find-links points pip at the local files.
!pip install -q -U bitsandbytes --no-index --find-links ../input/llama3-pip/
!pip install -q -U transformers --no-index --find-links ../input/llama3-pip/
!pip install -q -U tokenizers --no-index --find-links ../input/llama3-pip/
!pip install -q -U accelerate --no-index --find-links ../input/llama3-pip/
!pip install -q -U peft --no-index --find-links ../input/llama3-pip/

In [None]:
import torch
import sklearn
import numpy as np
import pandas as pd
import accelerate
import time
import emoji
import os
import torch.nn as nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoTokenizer, LlamaModel,LlamaPreTrainedModel, LlamaForSequenceClassification,AutoConfig, BitsAndBytesConfig
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from torch.cuda.amp import autocast
from threading import Thread

# Offline switch (currently disabled): would stop transformers from contacting the Hugging Face Hub
# os.environ['TRANSFORMERS_OFFLINE']="1"

# Turn off the memory-efficient and flash scaled-dot-product-attention backends
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Prints the device object only; this does not verify that a GPU is actually present
print(torch.device("cuda"))
# Warn when CUDA is unavailable (execution is not stopped)
if (not torch.cuda.is_available()): print("Sorry - GPU required!")

print("pip all imported")

In [None]:
# Base model directory; used for AutoConfig and for loading the model weights
MODEL_NAME = '/kaggle/input/llama3-2-1b-model/model/'
# NOTE(review): not referenced anywhere else in the visible notebook - possibly a leftover
WEIGHTS_PATH = '/kaggle/input/llama-3-finetuned-model/transformers/default/1/llama_3_finetuned_model.pth'
# Every example is padded/truncated to this many tokens by the tokenizer call
MAX_LENGTH = 2048
# Used as per_device_train_batch_size and as the default batch size of inference()
batch_size= 3
# Device for the tokenized tensors (CPU); the model cell defines its own `device` for CUDA
DEVICE = torch.device("cpu")    

# Prepare Data 

In [None]:
# Load the competition test set and the sample submission that serves as the output template
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
sample_sub = pd.read_csv('/kaggle/input/llm-classification-finetuning/sample_submission.csv')


# concatenate strings in list
def process(input_str):
    """Flatten a list-like string such as '["a","b"]' into 'a b'.

    Leading/trailing square brackets are stripped, the remainder is split on
    the literal separator '","', surrounding double quotes are removed from
    each piece, and the pieces are joined with single spaces.

    NOTE(review): this is plain string manipulation, not JSON decoding, so
    escape sequences inside the pieces are left untouched.
    """
    inner = input_str.strip('[]')
    pieces = []
    for chunk in inner.split('","'):
        pieces.append(chunk.strip('"'))
    return ' '.join(pieces)

# Flatten the list-like string columns of the test set into plain text
for text_column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, text_column] = test[text_column].apply(process)


# Training-time equivalent, kept for reference:
# train = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
# train.loc[:, 'prompt'] = train['prompt'].apply(process)
# train.loc[:, 'response_a'] = train['response_a'].apply(process)
# train.loc[:, 'response_b'] = train['response_b'].apply(process)

# Show the submission template and the first six processed test rows
display(sample_sub)
display(test.head(6))

In [None]:
# Prepare text for model: one string per row holding the user prompt followed by both responses
test['text'] = 'User prompt: ' + test['prompt'] +  '\n\nModel A :\n' + test['response_a'] +'\n\n--------\n\nModel B:\n'  + test['response_b']
#train
#train['text'] = 'User prompt: ' + train['prompt'] +  '\n\nModel A :\n' + train['response_a'] +'\n\n--------\n\nModel B:\n'  + train['response_b']
# Print the first assembled example as a sanity check
print(test['text'][0])

# Tokenize

In [None]:
%%time

# Load the tokenizer from local files only (no Hub download)
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/llama-fine-tune/tokenizer',local_files_only=True)
#train
# train_tokens = tokenizer(train['text'].tolist(), padding='max_length',
#                    max_length=MAX_LENGTH, truncation=True, return_tensors='pt')

# train_INPUT_IDS = train_tokens['input_ids'].to(DEVICE, dtype=torch.int32)
# train_ATTENTION_MASKS = train_tokens['attention_mask'].to(DEVICE, dtype=torch.int32)

#test
# Pad/truncate every test example to MAX_LENGTH tokens and return PyTorch tensors.
# NOTE(review): the padding side comes from the saved tokenizer config (not visible here); the model
# classifies from the last sequence position, so confirm it matches what was used for training.
test_tokens = tokenizer(test['text'].tolist(), padding='max_length',
                   max_length=MAX_LENGTH, truncation=True, return_tensors='pt')
# Keep ids and masks on DEVICE as int32; inference() rebuilds them as int64 tensors
test_INPUT_IDS = test_tokens['input_ids'].to(DEVICE, dtype=torch.int32)
test_ATTENTION_MASKS = test_tokens['attention_mask'].to(DEVICE, dtype=torch.int32)

In [None]:
# Convert each row of the token tensors to a plain Python list so it can be stored in a DataFrame cell
input_ids_test = [tensor.tolist() for tensor in test_INPUT_IDS]
attention_masks_test = [tensor.tolist() for tensor in test_ATTENTION_MASKS]
# One row per test example; this frame is what inference() consumes
data = pd.DataFrame()
data['INPUT_IDS'] = input_ids_test
data['ATTENTION_MASKS'] = attention_masks_test
# Preview the first two rows
data[:2]

In [None]:

#train_datasets

from torch.utils.data import Dataset, DataLoader,random_split
class LlamaDataset(Dataset):
    """Map-style dataset yielding token ids, attention mask and label per example.

    ``labels`` is indexed as ``labels[0][idx]``, i.e. the label sequence is
    expected to be wrapped in an outer one-element container.
    """

    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids, self.attention_mask, self.labels = input_ids, attention_mask, labels

    def __len__(self):
        # The number of examples is defined by the token-id container
        return len(self.input_ids)

    def __getitem__(self, idx):
        ids = self.input_ids[idx]
        mask = self.attention_mask[idx]
        # Labels live one level down: labels[0] is the per-example label sequence
        label = torch.tensor(self.labels[0][idx], dtype=torch.long)
        return {"input_ids": ids, "attention_mask": mask, "labels": label}

# train_lables = [train['winner_model_a'].values.astype(int)*0 + train['winner_model_b'].values.astype(int)*1 + train['winner_tie'].values.astype(int)*2]
# dataset = LlamaDataset(train_INPUT_IDS, train_ATTENTION_MASKS, train_lables)

# l = len(dataset)
# train_size = int(0.9 * l)  # used for training
# eval_size = l - train_size  # the remainder is used for evaluation

# print(len(dataset) == train_size+eval_size)
# train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])
# print(eval_dataset[0])

# Load model 
We load a single model; `device_map="auto"` lets the loader place it on the available device(s).

In [None]:
from peft import IA3Config, TaskType, get_peft_model,PeftModel
# 8-bit loading settings.
# NOTE(review): bnb_config is not passed to any from_pretrained call in the visible notebook.
bnb_config =  BitsAndBytesConfig(
    load_in_8bit=True
)


#inference use for eval 
# LoRA configuration; in the visible notebook it is only referenced by a commented-out get_peft_model call
peft_config = LoraConfig(
    # r=4,
    # lora_alpha=8,
    # lora_dropout=0.05,
    # bias='none',
    inference_mode=True,  # author's note: "do not turn on eval mode" (presumably: set to False when training - confirm)
    task_type=TaskType.SEQ_CLS,
    # target_modules=['q_proj', 'v_proj'],  # modify here
)

In [None]:
from transformers import TrainingArguments , Trainer

# Trainer settings; in the visible notebook these are only consumed by the commented-out Trainer below

training_args = TrainingArguments(
    output_dir="/kaggle/working",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=2e-5,              # learning rate for the PEFT parameters
    weight_decay=0.01,    # L2 regularization coefficient
    warmup_ratio=0.03,
    fp16=False,                     
    bf16=torch.cuda.is_bf16_supported(), 
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_total_limit=2,  # older checkpoints are deleted to save storage space
    report_to="tensorboard",
    dataloader_pin_memory=True
)

In [None]:

class LlamaForClassification(LlamaPreTrainedModel):
    """Llama backbone with a two-layer MLP classification head.

    The head maps the hidden state at the last sequence position to
    ``config.num_labels`` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        # Head: hidden_size -> hidden_size // 2 -> num_labels. The output width follows
        # config.num_labels so it always agrees with the reshape used for the loss in forward().
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size // 2),
            nn.GELU(),
            nn.Linear(config.hidden_size // 2, self.num_labels)
        )
        self.post_init()

    def forward(self, input_ids, attention_mask, labels=None,inputs_embeds=None, **kwargs):
        """Classify each sequence and optionally compute the cross-entropy loss.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone.
            labels: Optional class indices; when given, a cross-entropy loss is returned.
            inputs_embeds: Optional embeddings forwarded to the backbone.
            **kwargs: Any further keyword arguments, passed through to the backbone.

        Returns:
            SequenceClassifierOutput with ``loss`` (None without labels), ``logits``
            and the backbone's ``hidden_states`` / ``attentions``.
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs  # Pass any other keyword arguments
        )
        hidden_states = outputs.last_hidden_state
        # NOTE(review): pools the LAST sequence position; with padded batches this is only the
        # final real token if padding is on the left - confirm against the tokenizer settings.
        logits = self.classifier(hidden_states[:, -1, :])

        # Standard cross-entropy over the class logits, computed only when labels are supplied
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions
        )
        
# Target device that inference batches are moved to (passed to run_inference / inference)
device = torch.device("cuda")

#train from zero use model_name
#continue train or eval use checkpoints
            
# Directory of the PEFT adapter checkpoint that is loaded on top of the base model
checkpoints = "/kaggle/input/1epoch/transformers/default/2/checkpoint-1500"
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=3)
model =  LlamaForClassification.from_pretrained(
    MODEL_NAME,
    config=config,
    device_map="auto",
)

# Wrap the base model with the adapter stored in the checkpoint directory
model = PeftModel.from_pretrained(model, checkpoints)
# When training from scratch, the PEFT setup has to be applied again instead:
#model = get_peft_model(model, peft_config)
model.eval()
print("model load success")

Now we have successfully loaded the model together with its fine-tuned adapter!

# model weight

In [None]:
# Report how many parameters of the PEFT-wrapped model are trainable

#model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.print_trainable_parameters()

# Training entry point (disabled in this inference notebook; kept for reference)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
   
# )
# trainer.train(resume_from_checkpoint = "/kaggle/input/check-points/transformers/default/1/checkpoint-1000")
# trainer.save_model("/kaggle/working/llama3-model")

# Inference


In [None]:
# Trigger a garbage-collection pass before starting inference
import gc
gc.collect()

In [None]:
def inference(df, model, device, batch_size=batch_size):
    """Run batched classification and write the class probabilities into ``df``.

    Args:
        df: DataFrame with 'INPUT_IDS' and 'ATTENTION_MASKS' columns; every cell
            holds one list of ints and all lists must have the same length.
        model: Callable accepting ``input_ids`` / ``attention_mask`` and returning
            an object whose ``logits`` has at least three class columns.
        device: Device the batches are moved to before the forward pass.
        batch_size: Number of rows per forward pass.

    Returns:
        The same ``df`` object, modified in place with the columns
        'winner_model_a', 'winner_model_b' and 'winner_tie' (callers rely on
        the in-place update).
    """
    input_ids = torch.tensor(df['INPUT_IDS'].values.tolist(), dtype=torch.long)
    attention_mask = torch.tensor(df['ATTENTION_MASKS'].values.tolist(), dtype=torch.long)

    # Mixed precision is only enabled when the batches actually live on a CUDA device;
    # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
    use_amp = torch.device(device).type == "cuda"

    batch_probabilities = []

    model.eval()

    with torch.no_grad():
        for start_idx in range(0, len(df), batch_size):
            end_idx = min(start_idx + batch_size, len(df))
            batch_input_ids = input_ids[start_idx:end_idx].to(device)
            batch_attention_mask = attention_mask[start_idx:end_idx].to(device)

            with torch.autocast(device_type="cuda", enabled=use_amp):
                outputs = model(
                    input_ids=batch_input_ids,
                    attention_mask=batch_attention_mask
                )

            # Upcast before softmax: logits produced under autocast may be half precision,
            # which would otherwise round the submitted probabilities.
            batch_probabilities.append(torch.softmax(outputs.logits.float(), dim=-1).cpu())

    if batch_probabilities:
        probabilities = torch.cat(batch_probabilities, dim=0).numpy()
    else:
        # Empty input: still create the three (empty) output columns
        probabilities = torch.empty((0, 3)).numpy()

    df['winner_model_a'] = probabilities[:, 0]
    df['winner_model_b'] = probabilities[:, 1]
    df['winner_tie'] = probabilities[:, 2]

    torch.cuda.empty_cache()

    return df

In [None]:

# Helper that runs inference and records the outcome under a caller-chosen key
def run_inference(df, model, device, results, index):
    """Run ``inference`` on ``df`` and store what it returns in ``results[index]``."""
    predictions = inference(df, model, device)
    results[index] = predictions

# Dictionary that run_inference() fills with its output, keyed by the index passed to it
results = {}

print("ready for start")

In [None]:
st = time.time()
# Run inference directly in the main thread (no worker threads are started); result is stored under key 0
run_inference(data, model, device, results, 0)
print(f"model eval spend 推理总花费时常: {time.time() - st}")

In [None]:
TARGETS = ['winner_model_a', 'winner_model_b', 'winner_tie']
# Copy the predicted probabilities from `data` (filled in place by inference()) into the submission frame.
# NOTE(review): relies on sample_sub and data having matching row order/index - confirm.
sample_sub[TARGETS] = data[TARGETS]


#llama3  swap data 0,1
# sample_sub.iloc[:,[1, 2]] = sample_sub.iloc[:,[2, 1]].values

display(sample_sub)

In [None]:
# Write the submission file without the DataFrame index column
sample_sub.to_csv('/kaggle/working/submission.csv', index=False)
print("answer get")

Inference completes in ~4.5 hrs; there is still plenty to improve on. I would encourage you to try out different post-processing and share your results. Kaggle way :)