In [None]:
import json
from datasets import Dataset

# Read the balanced training split from disk. Later cells read the
# 'text', 'output' and 'label' keys of every record.
with open("train_balanced.json", mode="r", encoding="utf-8") as train_file:
    train_data = json.load(train_file)

# Show one raw record so the structure can be checked by eye.
print(train_data[1])

# Wrap the raw records in a Hugging Face Dataset.
train_dataset = Dataset.from_list(train_data)

# Show a sample row followed by the inferred schema.
for preview in (train_dataset[0], train_dataset.features):
    print(preview)

{'text': 'Age:40.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation sensory, motor weakness, optic neuritis. Number of attacks until the first visit:2. OCBs in CSF:negative. Visit diagnosis:rrms. DMT taken. Current DMT:tecfidera-dimethylfumarate on June 21, 2018. Vit D at visit: 61.47. MRI of brain or spine done. Received Gadolinium for Brain MRI. No new lesions on Brain MRI. Total T1 Gad-enhancing lesions in Brain: 0. Cervical spine MRI date: May 02, 2019. Received Gadolinium for Cervical spine MRI. Cervical spine MRI: Not compared to previous spine MRI. Total T1 Gad-enhancing lesions in Cervical spine: 0. The 25-foot walk done. 25-foot walk test time was 4.25 sec. SDMT done. SDMT numerator: 71. SDMT denominator: 71. 9-HPT done. 9-HPT ability: yes, able with right and left hands. Dominant hand:Right.  9-hole peg dominant:18 sec. 9-hole peg non-dominant:15.74 sec. EDSS:1- no disability, minimal signs in one FS (one FS grade 1). The patient is in NEDA. Wil

In [None]:
!pip install unsloth trl peft accelerate bitsandbytes



In [None]:
# Report whether a CUDA device is visible and, if so, the name of device 0.
import torch

has_cuda = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_cuda else 'None'

print(f"CUDA available: {has_cuda}")
print(f"GPU: {gpu_name}")

CUDA available: True
GPU: Tesla T4


In [None]:
from unsloth import FastLanguageModel
import torch

# Base checkpoint: the 4-bit (bitsandbytes) build of Phi-3 Mini with a 4k context.
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"

# Token budget per sequence; the trainer configuration reuses this value.
# Larger values admit longer examples at the cost of more memory.
max_seq_length = 2048

# None leaves the precision choice (BF16/FP16) to the loader.
dtype = None

# Collect the loader options in one place, then load model and tokenizer together.
load_options = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": True,  # keep the weights 4-bit quantized to reduce VRAM use
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)


In [None]:
from datasets import Dataset
import json  # Required for safe serialization of output fields

# End-of-sequence marker used when the caller does not supply one.
DEFAULT_EOS_TOKEN = "<|endoftext|>"


def format_prompt(example, eos_token=DEFAULT_EOS_TOKEN):
    """Render one record as a single supervised fine-tuning string.

    The layout is three sections, each on its own line: the input text, the
    JSON-serialized target output, and the class label, followed by the
    end-of-sequence marker so the model learns where to stop.

    Args:
        example: Mapping with 'text', 'output' and 'label' keys.
        eos_token: Marker appended after the label. Defaults to
            "<|endoftext|>" so existing one-argument callers are unaffected.

    Returns:
        The formatted training string.

    Raises:
        KeyError: If a required key is missing from `example`.
    """
    return (
        f"### Text: {example['text']}\n"
        f"### Output: {json.dumps(example['output'])}\n"
        f"### Label: {example['label']}{eos_token}"
    )


# Take the EOS marker from the loaded tokenizer so the training text always ends
# with the token this model actually treats as end-of-sequence; fall back to the
# previous literal if the tokenizer does not define one.
EOS_TOKEN = tokenizer.eos_token or DEFAULT_EOS_TOKEN

# Convert raw JSON records into instruction-formatted samples for supervised fine-tuning (SFT)
formatted_data = [format_prompt(item, eos_token=EOS_TOKEN) for item in train_data]

# Wrap the formatted strings in a Hugging Face Dataset with a single "text" column
dataset = Dataset.from_dict({"text": formatted_data})


In [None]:
# Attach LoRA adapters for parameter-efficient fine-tuning.

# Submodules that receive adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
    "gate_proj", "up_proj", "down_proj",      # feed-forward projections
]

lora_options = dict(
    r=64,                # adapter rank (higher = more trainable capacity, more VRAM)
    lora_alpha=128,      # LoRA scaling factor
    # Non-zero dropout regularizes the adapters, but this cell's log reports that
    # Unsloth's fast patching only supports dropout = 0, so 0.1 costs speed.
    lora_dropout=0.1,
    target_modules=lora_target_modules,
    bias="none",                            # bias terms are not trained
    use_gradient_checkpointing="unsloth",   # recompute activations to save memory
    random_state=3407,                      # fixed seed for reproducibility
    use_rslora=False,                       # rank-stabilized LoRA disabled
    loftq_config=None,                      # LoftQ not applied
)

# NOTE: rebinding `model` makes this cell non-idempotent; re-run the model
# loading cell before executing this one a second time.
model = FastLanguageModel.get_peft_model(model, **lora_options)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.3 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Query bf16 support once; exactly one of fp16/bf16 is enabled below.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation, logging and checkpointing settings for the run.
training_args = TrainingArguments(
    # Batch geometry: 4 per device x 4 accumulation steps = effective batch of 16.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,

    # Schedule
    num_train_epochs=2,          # full passes over the dataset
    warmup_steps=25,             # learning-rate warmup
    learning_rate=1e-4,
    lr_scheduler_type="linear",

    # Optimizer and regularization
    optim="adamw_8bit",          # 8-bit AdamW to reduce optimizer memory
    weight_decay=0.1,

    # Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
    fp16=not use_bf16,
    bf16=use_bf16,

    # Reproducibility
    seed=3407,

    # Logging and checkpoints: log every 25 steps, checkpoint each epoch,
    # keep only the two most recent checkpoints.
    logging_steps=25,
    report_to="none",            # no external experiment tracker
    output_dir="outputs",
    save_strategy="epoch",
    save_total_limit=2,

    # Pinned host memory disabled. NOTE(review): the original rationale was
    # "avoid CUDA memory fragmentation" — confirm this is still needed.
    dataloader_pin_memory=False,
)

# Supervised fine-tuning over the single pre-formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,          # worker processes for dataset preparation
    args=training_args,
)


In [None]:
# Run the fine-tuning loop on the trainer configured in the previous cell.
# The value returned by train() is kept in `trainer_stats` for later inspection.
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,372 | Num Epochs = 2 | Total steps = 298
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 119,537,664 of 3,940,617,216 (3.03% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,1.6643
50,0.3366
75,0.2361
100,0.2262
125,0.2153
150,0.2087
175,0.1974
200,0.1968
225,0.1974
250,0.1885


In [None]:
import re

import pandas as pd
import torch
from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model)  # ensure model is in inference mode

test_df = pd.read_json("test_balanced.json")

# --- Evaluation settings ------------------------------------------------------
POSITIVE_LABEL = "EDA"
NEGATIVE_LABEL = "NEDA"
UNPARSED_LABEL = "UNPARSED"   # recorded when no label can be extracted
LABEL_MARKER = "### Label:"   # section header used in the training format
# Must cover the generated Output section plus the Label line; raise it if many
# generations come back unparsed because they were cut off before the label.
MAX_NEW_TOKENS = 256
# Whole-word match so "EDA" is never picked out of the middle of "NEDA".
LABEL_PATTERN = re.compile(r"\b(NEDA|EDA)\b", re.IGNORECASE)


def build_inference_prompt(text):
    """Return the prompt prefix for one test record.

    The prefix reproduces the start of the training layout produced by
    `format_prompt` ("### Text: ...", then "### Output:"), so the model is
    queried in the same format it was fine-tuned on and continues with the
    output followed by the "### Label:" line.
    """
    return f"### Text: {text}\n### Output:"


def parse_predicted_label(generated_text):
    """Extract the predicted class from the generated continuation.

    Only the text after the "### Label:" marker is inspected, so label words
    appearing inside the generated Output section cannot be mistaken for the
    prediction.

    Returns:
        "EDA" or "NEDA", or "UNPARSED" when the marker or a label is missing.
    """
    _, marker, tail = generated_text.partition(LABEL_MARKER)
    if not marker:
        return UNPARSED_LABEL
    found = LABEL_PATTERN.search(tail)
    return found.group(1).upper() if found else UNPARSED_LABEL


def confusion_bucket(true_label, pred_label):
    """Map a (truth, prediction) pair to "TP", "FN", "FP" or "TN".

    EDA is the positive class. An unparsed prediction is always counted as an
    error (FN for an EDA row, FP for a NEDA row) so that malformed generations
    can never inflate the score.
    """
    if true_label == POSITIVE_LABEL:
        return "TP" if pred_label == POSITIVE_LABEL else "FN"
    return "TN" if pred_label == NEGATIVE_LABEL else "FP"


# Fail fast on unexpected ground-truth labels instead of silently counting
# them in one of the confusion-matrix cells.
unknown_labels = set(test_df["label"].unique()) - {POSITIVE_LABEL, NEGATIVE_LABEL}
if unknown_labels:
    raise ValueError(
        f"Unexpected ground-truth labels in test set: {sorted(map(str, unknown_labels))}"
    )

TP, TN, FP, FN = [], [], [], []
confusion_lists = {"TP": TP, "TN": TN, "FP": FP, "FN": FN}
all_results = []
num_unparsed = 0

for _, row in test_df.iterrows():
    text_input = row["text"]
    true_output = row["output"]
    true_label = row["label"]

    # Tokenize the training-style prompt. Calling the tokenizer directly
    # returns a mapping with both input_ids and attention_mask.
    inputs = tokenizer(build_inference_prompt(text_input), return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    # Greedy decoding: deterministic, no sampling parameters needed.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
    )

    # Keep only the newly generated tokens and drop special tokens (e.g. EOS).
    generated_tokens = outputs[:, input_ids.shape[1]:]
    pred_output = tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True
    )[0].strip()

    pred_label = parse_predicted_label(pred_output)
    if pred_label == UNPARSED_LABEL:
        num_unparsed += 1
        print(f"⚠️ Unexpected output: {pred_output}")

    match = confusion_bucket(true_label, pred_label)
    confusion_lists[match].append(pred_label)

    all_results.append([
        text_input, true_output, true_label, pred_output, pred_label, match,
    ])

# Metrics
num_TP, num_TN, num_FP, num_FN = map(len, (TP, TN, FP, FN))
total = num_TP + num_TN + num_FP + num_FN

accuracy = (num_TP + num_TN) / total if total else 0
precision = num_TP / (num_TP + num_FP) if (num_TP + num_FP) else 0
recall = num_TP / (num_TP + num_FN) if (num_TP + num_FN) else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

print("\n✔ Final Metrics")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"\nTP: {num_TP}, TN: {num_TN}, FP: {num_FP}, FN: {num_FN}")
print(f"Unparsed generations (counted as errors): {num_unparsed}")


Metrics:
Accuracy: 0.567
Precision: 0.569
Recall: 0.552
F1 Score: 0.561

Counts:
TP: 164, TN: 173, FP: 124, FN: 133


In [None]:
# Column names for the per-example evaluation rows, in the order each row was
# appended to `all_results`.
result_columns = [
    "text",          # input text of the test record
    "true_output",   # ground-truth output field
    "true_label",    # ground-truth class label
    "pred_output",   # raw text generated by the model
    "pred_label",    # class label parsed from the generated text
    "result_type",   # confusion-matrix category: TP, TN, FP or FN
]

# Build a table from the collected rows for saving and analysis.
results_df = pd.DataFrame(all_results, columns=result_columns)

# Preview the first rows to verify the layout before exporting.
results_df.head()


In [None]:
# Write the per-example evaluation table to an Excel workbook; index=False
# leaves the DataFrame index out of the sheet.
# NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) installed for .xlsx output.
results_df.to_excel("model_predictions_comparison.xlsx", index=False)

In [None]:
# Export the fine-tuned model and tokenizer in GGUF format to the "gguf_model"
# directory, using the q4_k_m quantization method.
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")

In [None]:
from google.colab import files
import os

# Locate the exported GGUF file and download it to the local machine.
gguf_dir = "gguf_model"
preferred_tag = "q4_k_m"  # quantization method used when the model was exported

# Sort the listing so the choice is deterministic (os.listdir order is
# arbitrary), and tolerate a missing export directory.
gguf_files = sorted(
    name
    for name in (os.listdir(gguf_dir) if os.path.isdir(gguf_dir) else [])
    if name.endswith(".gguf")
)

if gguf_files:
    # The directory can hold more than one .gguf file (for example an
    # unquantized intermediate — NOTE(review): confirm for the installed Unsloth
    # version), so prefer the file whose name carries the quantization tag.
    quantized = [name for name in gguf_files if preferred_tag in name.lower()]
    gguf_file = os.path.join(gguf_dir, (quantized or gguf_files)[0])
    print(f"Downloading: {gguf_file}")
    files.download(gguf_file)
else:
    # Report the problem instead of silently doing nothing.
    print(f"No .gguf file found in '{gguf_dir}'; nothing to download.")
