<a href="https://colab.research.google.com/github/RobinfRoth/ip5/blob/main/llm_model3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [14]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

ajshaseewer_raw_dataset_path = kagglehub.dataset_download('ajshaseewer/raw-dataset')
ajshaseewer_challenge_description_path = kagglehub.dataset_download('ajshaseewer/challenge-description')
robinroth_finetuned_llm_one_word_prediction_path = kagglehub.dataset_download('robinroth/finetuned-llm-one-word-prediction')
metaresearch_llama_3_2_transformers_3b_instruct_1_path = kagglehub.model_download('metaresearch/llama-3.2/Transformers/3b-instruct/1')

print('Data source import complete.')


Data source import complete.


# Model 4 - Finetune LLAMA using Lora and a Classification Head

## 1. Setup

In [39]:
%%capture
%pip install -U --upgrade transformers==4.45.2
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb
%pip install -U scikit-learn
%pip install -U evaluate

In [40]:
import os
import evaluate
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
from google.colab import userdata

TRAIN_MODEL = True
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [41]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb, json
from datasets import load_dataset
from datasets import Dataset
from trl import SFTTrainer, setup_chat_format, SFTConfig

In [42]:
from huggingface_hub import login
from google.colab import userdata
hf_token = userdata.get("HUGGINFACE_TOKEN")
login(token = hf_token)

In [43]:
wb_token = userdata.get("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3.2 on Student Programming Projects',
    job_type="training",
    anonymous="allow"
)



In [44]:
base_model = metaresearch_llama_3_2_transformers_3b_instruct_1_path
results_path = "results"
new_model = "llama-3.2-3b-it-programming-projects-V7"

challenge_description = pd.read_csv(ajshaseewer_challenge_description_path + "/Frogger.csv")
challenge_dict = { k:v for k, v in zip(challenge_description.to_dict()["Challenge Name"].values(),
                                       challenge_description.to_dict()["Challenge description"].values()) }
label_names = list(challenge_dict.keys())
classes = [ c for c in label_names ]
class2id = { c:id for id, c in enumerate(classes) }
id2class = { id:c for c, id in class2id.items() }

## 2. Loading the model, tokenizer and define metrics

In [18]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    return_dict=True,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Prepare data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /root/.cache/kagglehub/models/metaresearch/llama-3.2/Transformers/3b-instruct/1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from sklearn.metrics import f1_score
import torch

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids

    sigmoid = torch.nn.Sigmoid()

    predictions = sigmoid(torch.Tensor(logits))
    y_pred = np.zeros(predictions.shape)
    y_pred[np.where(predictions >= 0.5)] = 1

    y_true = labels.astype(np.int32)

    f1_micro = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average='macro')

    return {'f1_micro': f1_micro, 'f1_macro': f1_macro}

## 3. Loading and processing of the dataset

In [21]:
def preprocess_labels(row):
    row['labels'] =  row[label_names].astype("float").to_list()
    return row

In [22]:
dataset_name = ajshaseewer_raw_dataset_path + "/raw_dataset.csv"

# Importing the dataset
df = pd.read_csv(dataset_name)

challenge_names = df.columns[7:].to_list()

# remove floats from xml
df["xml_code"] = df["xml_code"].str.replace(r"[\dD]+(\.[\dD]+)+", "", regex=True)
# replace file separator by a newline
df["xml_code"] = df["xml_code"].str.replace("%", "\n")

df = df.apply(preprocess_labels, axis=1)

### Use a MultiLabel Classification Head

In [29]:
def tokenize_data(example):
    result = tokenizer(example["xml_code"], truncation=True, max_length=4096)
    result["labels"] = example["labels"]
    return result

In [30]:
df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)
df_test, df_validation = train_test_split(df_test, train_size=0.4, random_state=42)

ds_train = Dataset.from_pandas(df_train[["xml_code", "labels"]])
ds_test = Dataset.from_pandas(df_test[["xml_code", "labels"]])
ds_validation = Dataset.from_pandas(df_validation[["xml_code", "labels"]])

ds_train = ds_train.map(tokenize_data, batched=True, remove_columns=["xml_code"])
ds_test = ds_test.map(tokenize_data, batched=True, remove_columns=["xml_code"])
ds_validation = ds_validation.map(tokenize_data, batched=True, remove_columns=["xml_code"])

Map:   0%|          | 0/302 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

In [31]:
# Sample of the set
ds_train

Dataset({
    features: ['labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 302
})

## 4. Setting up The Model and Finetune it

In [32]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [33]:
# LoRA config
peft_config = LoraConfig(
    r=8, # rank = how much of the models weights are adjusted
    lora_alpha=32,
    lora_dropout=0.2,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=["lm_head", "q_proj", "v_proj"],  #modules,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# set pad_token_id
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

trainable params: 2,336,768 || all params: 3,215,129,600 || trainable%: 0.0727


In [35]:
# Hyperparamters
training_arguments = TrainingArguments(
    output_dir=results_path,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,
    optim="adamw_torch",
    lr_scheduler_type="linear",
    num_train_epochs=10,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=5,
    eval_accumulation_steps=1,
    logging_steps=5,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    group_by_length=True,
    report_to="wandb",
    metric_for_best_model="f1_micro",
    load_best_model_at_end=True,
)

In [36]:
# Setting parameters
trainer = Trainer(
    model=model,
    train_dataset=ds_train,
    eval_dataset=ds_validation,
    processing_class=tokenizer,
    args=training_arguments,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [37]:
print(ds_validation["labels"])

[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.

In [45]:
if TRAIN_MODEL:
    trainer.train()

TypeError: LlamaForSequenceClassification.forward() got an unexpected keyword argument 'num_items_in_batch'

In [None]:
def plot_loss(training_loss: list[float], validation_loss: list[float]) -> None:
    # Create subplots
    fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

    # training loss
    axes[0].plot(range(0, len(training_loss)), training_loss, label='Training Loss', marker='o', color="blue")
    axes[0].set_xlabel('Iterations', fontsize=14)
    axes[0].set_ylabel('Loss', fontsize=14)
    axes[0].set_title('Loss Function During Training', fontsize=16)
    axes[0].grid(True, linestyle='--', alpha=0.6)
    axes[0].legend(fontsize=12)

    # validation loss
    axes[1].plot(range(0, len(validation_loss)), validation_loss, label='Validation Loss', marker='s', color="orange")
    axes[1].set_xlabel('Iterations', fontsize=14)
    axes[1].set_title('Loss Function During Validation', fontsize=16)
    axes[1].grid(True, linestyle='--', alpha=0.6)
    axes[1].legend(fontsize=12)

    # Show the plot
    plt.tight_layout()
    plt.show()

In [None]:
if TRAIN_MODEL:
    wandb.finish()

    train_loss = []
    val_loss = []
    for elem in trainer.state.log_history:
        if "loss" in elem.keys():
            train_loss.append(elem["loss"])
        if "eval_loss" in elem.keys():
            val_loss.append(elem["eval_loss"])

    plot_loss(train_loss, val_loss)

In [None]:
if TRAIN_MODEL:
    best_checkpoint_dir = trainer.state.best_model_checkpoint
    print(f"Best model checkpoint saved at: {best_checkpoint_dir}")
    best_model = trainer.model
    best_model.save_pretrained(best_checkpoint_dir)
    trainer.tokenizer.save_pretrained(best_checkpoint_dir)

## 5. Loading Finetuned Model

In [None]:
base_mode_url = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
finetuned_model_path = "/kaggle/input/finetuned-llm-one-word-prediction/llama-3.2-3b-it-programming-projects-V5"

tokenizer = AutoTokenizer.from_pretrained(base_mode_url)

base_model_reload = AutoModelForSequenceClassification.from_pretrained(
    base_mode_url,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

In [None]:
# Merge adapter with base model

lora_model = PeftModel.from_pretrained(base_model_reload, finetuned_model_path)
model = lora_model.merge_and_unload()

In [None]:
# Load the pipeline
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None) # top_k=None shows all predictions

idx = 2
row = df_test.iloc[idx]
xml_code = row["xml_code"]
labels = row.iloc[7:-1]

# Get predictions
predictions = pipe(xml_code)
print(predictions)
print(f"Ground truth:\n {labels}")

###

In [None]:
complete_model = "complete_model_5v"
model.save_pretrained(complete_model)
tokenizer.save_pretrained(complete_model)

model.push_to_hub(complete_model, use_temp_dir=False)