In [1]:
pip install torch transformers datasets unsloth trl tqdm pillow numpy huggingface_hub scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os  
import torch  
import pandas as pd  
from datasets import load_dataset  
from transformers import TextStreamer  
from unsloth import FastVisionModel, is_bf16_supported  
from unsloth.trainer import UnslothVisionDataCollator  
from trl import SFTTrainer, SFTConfig  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score  
from huggingface_hub import login  

# Login for Hugging Face model push/pull  
login()


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastVisionModel, is_bf16_supported


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
dataset = load_dataset("ravisri/bird-presence-classification")  
train_samples = dataset["train"]  
test_samples  = dataset["test"]

README.md:   0%|          | 0.00/563 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/48.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2299 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/575 [00:00<?, ? examples/s]

In [27]:
instruction = "Does this image contains a bird?, Answer 0 if you can see any bird answer 1 if you can't see any bird, no other answer is valid even if the image is pixalated or blurry"  

def label_to_int(label):  # Map text '0'/0 to int 0 (bird), else 1 (no_bird)  
    return 1 if str(label) == "1" else 0

def convert_to_conversation(sample):  
    conversation = [  
        {"role": "user", "content": [  
            {"type": "text", "text": instruction},  
            {"type": "image", "image": sample["image"]}  
        ]},  
        {"role": "assistant", "content": [  
            {"type": "text", "text": sample["caption"]}  
        ]},  
    ]  
    return {  
        "messages": conversation,  
        "label": label_to_int(sample["label"]),  
        "image-id": sample["image-id"],  
        "image": sample["image"],  
        "caption": sample["caption"]  
    }  

train_data = [convert_to_conversation(s) for s in train_samples]  
test_data  = [convert_to_conversation(s) for s in test_samples]

In [28]:
import collections  
print(collections.Counter([s["label"] for s in test_data]))  
# Or, if you want to see the original label strings:  
print(collections.Counter([s["label"] for s in test_samples]))

Counter({0: 288, 1: 287})
Counter({0: 288, 1: 287})


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"  

model, tokenizer = FastVisionModel.from_pretrained(  
    "unsloth/Llama-3.2-11B-Vision-Instruct",  
    load_in_4bit=True,  
    use_gradient_checkpointing="unsloth",  
)  
model = FastVisionModel.get_peft_model(  
    model,  
    finetune_vision_layers=True,  
    finetune_language_layers=True,  
    finetune_attention_modules=True,  
    finetune_mlp_modules=True,  
    r=16, lora_alpha=16, lora_dropout=0,  
    bias="none", random_state=3407,  
    use_rslora=False, loftq_config=None,  
)

==((====))==  Unsloth 2025.4.7: Fast Mllama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients


In [31]:
import re  

def parse_prediction(output_text):
    s = output_text.lower().strip()  
    if "no bird" in s:  
        pred_label = 1  # no_bird  
    elif "bird" in s:  
        pred_label = 0  # bird  
    else:  
        pred_label = None

In [53]:
import pandas as pd  
import torch  
import re  
from tqdm import tqdm  

def calculate_per_class_metrics(y_true, y_pred):  
    # For "bird" class (label 0)  
    tp_bird = ((y_true == 0) & (y_pred == 0)).sum()  
    fp_bird = ((y_true != 0) & (y_pred == 0)).sum()  
    fn_bird = ((y_true == 0) & (y_pred != 0)).sum()  

    precision_bird = tp_bird / (tp_bird + fp_bird) if (tp_bird + fp_bird) > 0 else 0  
    recall_bird = tp_bird / (tp_bird + fn_bird) if (tp_bird + fn_bird) > 0 else 0  
    f1_bird = 2 * precision_bird * recall_bird / (precision_bird + recall_bird) if (precision_bird + recall_bird) > 0 else 0  

    # For "no_bird" class (label 1)  
    tp_no_bird = ((y_true == 1) & (y_pred == 1)).sum()  
    fp_no_bird = ((y_true != 1) & (y_pred == 1)).sum()  
    fn_no_bird = ((y_true == 1) & (y_pred != 1)).sum()  

    precision_no_bird = tp_no_bird / (tp_no_bird + fp_no_bird) if (tp_no_bird + fp_no_bird) > 0 else 0  
    recall_no_bird = tp_no_bird / (tp_no_bird + fn_no_bird) if (tp_no_bird + fn_no_bird) > 0 else 0  
    f1_no_bird = 2 * precision_no_bird * recall_no_bird / (precision_no_bird + recall_no_bird) if (precision_no_bird + recall_no_bird) > 0 else 0  

    # Overall accuracy  
    accuracy = (y_true == y_pred).mean()  

    print(f"\nOverall Accuracy: {accuracy:.4f}")  

    print("\nClass: Bird (label 0)")  
    print(f"  Precision: {precision_bird:.4f}")  
    print(f"  Recall:    {recall_bird:.4f}")  
    print(f"  F1-score:  {f1_bird:.4f}")  
    print(f"  TP: {tp_bird}, FP: {fp_bird}, FN: {fn_bird}")  

    print("\nClass: No bird (label 1)")  
    print(f"  Precision: {precision_no_bird:.4f}")  
    print(f"  Recall:    {recall_no_bird:.4f}")  
    print(f"  F1-score:  {f1_no_bird:.4f}")  
    print(f"  TP: {tp_no_bird}, FP: {fp_no_bird}, FN: {fn_no_bird}")  

def get_predictions_fast(test_data, model, tokenizer, instruction, batch_size=8, results_file="test_predictions.csv"):  
    results = []  
    device = "cuda" if torch.cuda.is_available() else "cpu"  
    model.eval()    
    n = len(test_data)  

    for i in tqdm(range(0, n, batch_size)):  
        batch = test_data[i:i+batch_size]  
        messages_batch = [  
            [  
                {"role": "user", "content": [  
                    {"type": "text", "text": instruction},  
                    {"type": "image", "image": sample["image"]}  
                ]}  
            ] for sample in batch  
        ]  
        input_texts = [  
            tokenizer.apply_chat_template(messages, add_generation_prompt=True)  
            for messages in messages_batch  
        ]  
        images = [[sample["image"]] for sample in batch]  
        try:  
            inputs = tokenizer(  
                images, input_texts,  
                add_special_tokens=False,  
                return_tensors="pt",  
                padding=True  
            )  
        except Exception as e:  
            print("Error in batch at index", i, ":", e)  
            continue  

        inputs = {k: v.to(device) for k, v in inputs.items()}  

        with torch.no_grad():  
            output_ids = model.generate(**inputs, max_new_tokens=8)  

        for j, output in enumerate(output_ids):  
            output_text = tokenizer.decode(output, skip_special_tokens=True).strip()  
            s = output_text.lower()  
            if "no bird" in s:  
                pred_label = 1  
            elif "bird" in s:  
                pred_label = 0  
            elif re.match(r"\D*1\b", s):  
                pred_label = 1  
            elif re.match(r"\D*0\b", s):  
                pred_label = 0  
            else:  
                pred_label = None  
            idx = i + j  
            results.append({  
                "index": idx,  
                "true_label": batch[j]["label"],  
                "prediction": pred_label,  
                "output_text": output_text,  
            })  

    df = pd.DataFrame(results)  
    df.to_csv(results_file, index=False)  
    print(f"Results saved to {results_file}.")  
    # Filter invalid predictions  
    valid = df['prediction'].notna() & df['true_label'].notna()  
    y_true = df.loc[valid, "true_label"].astype(int)  
    y_pred = df.loc[valid, "prediction"].astype(int)  
    calculate_per_class_metrics(y_true, y_pred) 

In [8]:
from unsloth import FastVisionModel, UnslothTrainer, UnslothTrainingArguments, UnslothVisionDataCollator  

FastVisionModel.for_training(model)  

training_args = UnslothTrainingArguments(  
    per_device_train_batch_size=2,  
    gradient_accumulation_steps=4,  
    warmup_steps=5,  
    max_steps=30,  
    learning_rate=2e-4,  
    fp16=not is_bf16_supported(),  
    bf16=is_bf16_supported(),  
    logging_steps=1,  
    optim="adamw_8bit",  
    weight_decay=0.01,  
    lr_scheduler_type="linear",  
    seed=3407,  
    output_dir="outputs",  
    report_to="none",  
    remove_unused_columns=False,  
    dataset_text_field="",  
    dataset_kwargs={"skip_prepare_dataset": True},  
    dataset_num_proc=4,  
    max_seq_length=2048,  
)  

trainer = UnslothTrainer(  
    model=model,  
    tokenizer=tokenizer,  
    data_collator=UnslothVisionDataCollator(model, tokenizer),  
    train_dataset=train_data,  
    args=training_args,  
)  

trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,299 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 67,174,400/11,000,000,000 (0.61% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.2389
2,3.1671
3,3.1955
4,2.9549
5,2.4594
6,2.0304
7,1.5619
8,1.0748
9,0.6112
10,0.379


In [9]:
from huggingface_hub import HfApi  

hub_model_id = "ravisri/finetuned-llama-model"  # Change as you like!  
api = HfApi()  

# Option 1: Create a repo (does nothing if it exists)  
api.create_repo(repo_id=hub_model_id, exist_ok=True)  

# Option 2: Upload with `push_to_hub`  
model.save_pretrained(hub_model_id, push_to_hub=True)  
tokenizer.save_pretrained(hub_model_id, push_to_hub=True)

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[]

In [12]:
model.save_pretrained("finetuned-llama-model")  
tokenizer.save_pretrained("finetuned-llama-model")

[]

In [13]:
model.push_to_hub("ravisri/finetuned-llama-model")  
tokenizer.push_to_hub("ravisri/finetuned-llama-model")

adapter_model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

Saved model to https://huggingface.co/ravisri/finetuned-llama-model


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
from unsloth import FastVisionModel  

BASE_MODEL_PATH = "unsloth/Llama-3.2-11B-Vision-Instruct"  

model, tokenizer = FastVisionModel.from_pretrained(  
    BASE_MODEL_PATH,  
    load_in_4bit=True,  
    use_gradient_checkpointing="unsloth",  
)  
model = FastVisionModel.get_peft_model(  
    model,  
    finetune_vision_layers=True,  
    finetune_language_layers=True,  
    finetune_attention_modules=True,  
    finetune_mlp_modules=True,  
    r=16, lora_alpha=16, lora_dropout=0,  
    bias="none", random_state=3407,  
    use_rslora=False, loftq_config=None,  
)  

# Evaluate BEFORE finetuning  
print("\n==== BEFORE finetuning ====\n")  
# For base model:  
get_predictions_fast(test_data, model, tokenizer, instruction, batch_size=8, results_file="test_predictions_base.csv")  

==((====))==  Unsloth 2025.4.7: Fast Mllama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients

==== BEFORE finetuning ====



100%|██████████| 72/72 [03:03<00:00,  2.54s/it]

Results saved to test_predictions_base.csv.

Overall Accuracy: 0.5009

Class: Bird (label 0)
  Precision: 0.5009
  Recall:    1.0000
  F1-score:  0.6674
  TP: 288, FP: 287, FN: 0

Class: No bird (label 1)
  Precision: 0.0000
  Recall:    0.0000
  F1-score:  0.0000
  TP: 0, FP: 0, FN: 287





In [56]:
from unsloth import FastVisionModel  

BASE_MODEL_PATH = "unsloth/Llama-3.2-11B-Vision-Instruct"  
ADAPTER_PATH = "ravisri/finetuned-llama-model" # or "./lora_model" if local  

model, tokenizer = FastVisionModel.from_pretrained(  
    BASE_MODEL_PATH,  
    load_in_4bit=True,  
    use_gradient_checkpointing="unsloth",  
)  
model.load_adapter(ADAPTER_PATH)  

# Evaluate AFTER finetuning  
print("\n==== AFTER finetuning ====\n")  

# For finetuned model:  
get_predictions_fast(test_data, model, tokenizer, instruction, batch_size=8, results_file="test_predictions_finetuned.csv")  

==((====))==  Unsloth 2025.4.7: Fast Mllama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


==== AFTER finetuning ====



100%|██████████| 72/72 [03:45<00:00,  3.13s/it]

Results saved to test_predictions_finetuned.csv.

Overall Accuracy: 0.8574

Class: Bird (label 0)
  Precision: 0.8787
  Recall:    0.8299
  F1-score:  0.8536
  TP: 239, FP: 33, FN: 49

Class: No bird (label 1)
  Precision: 0.8383
  Recall:    0.8850
  F1-score:  0.8610
  TP: 254, FP: 49, FN: 33



