In [None]:
import json
from datasets import Dataset

# Read the balanced training split from disk. Later cells read the 'text',
# 'output' and 'label' keys of every record.
with open("train_balanced.json", mode="r", encoding="utf-8") as train_file:
    train_data = json.loads(train_file.read())

# Eyeball one raw record to check the structure of the file.
print(train_data[1])

# Wrap the raw records in a Hugging Face Dataset.
train_dataset = Dataset.from_list(train_data)

# Show the first row followed by the inferred schema.
print(train_dataset[0], train_dataset.features, sep="\n")


{'text': 'Age:40.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation sensory, motor weakness, optic neuritis. Number of attacks until the first visit:2. OCBs in CSF:negative. Visit diagnosis:rrms. DMT taken. Current DMT:tecfidera-dimethylfumarate on June 21, 2018. Vit D at visit: 61.47. MRI of brain or spine done. Received Gadolinium for Brain MRI. No new lesions on Brain MRI. Total T1 Gad-enhancing lesions in Brain: 0. Cervical spine MRI date: May 02, 2019. Received Gadolinium for Cervical spine MRI. Cervical spine MRI: Not compared to previous spine MRI. Total T1 Gad-enhancing lesions in Cervical spine: 0. The 25-foot walk done. 25-foot walk test time was 4.25 sec. SDMT done. SDMT numerator: 71. SDMT denominator: 71. 9-HPT done. 9-HPT ability: yes, able with right and left hands. Dominant hand:Right.  9-hole peg dominant:18 sec. 9-hole peg non-dominant:15.74 sec. EDSS:1- no disability, minimal signs in one FS (one FS grade 1). The patient is in NEDA. Wil

In [None]:
!pip install unsloth trl peft accelerate bitsandbytes



In [None]:
# For GPU check
import torch
# Query CUDA once and reuse the answer for both report lines.
cuda_ready = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ready else 'None'
print(f"CUDA available: {cuda_ready}")
print(f"GPU: {gpu_name}")

CUDA available: True
GPU: Tesla T4


In [None]:
from unsloth import FastLanguageModel
import torch

# --- Base model settings -----------------------------------------------------
model_name = "unsloth/Llama-3.2-3B-Instruct"  # 3B instruction-tuned Llama checkpoint
max_seq_length = 2048  # Sequence-length limit; reused when the trainer is built
dtype = None  # None leaves the precision choice to Unsloth

# Collect the loader arguments in one place, then load the model/tokenizer pair.
loader_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,  # 4-bit quantized weights to keep memory usage low
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

# Confirm which checkpoint was loaded and with what length limit.
print(f"Loaded model: {model_name} with max sequence length {max_seq_length}")


In [None]:
from datasets import Dataset
import json

def format_prompt(example, eos_token="<|endoftext|>"):
    """Render one record as a single supervised fine-tuning string.

    The template is ``### Text`` / ``### Output`` / ``### Label`` followed by an
    end-of-sequence marker. Prompts built at inference time must use the same
    template for the fine-tuned model to see familiar input.

    Args:
        example: Mapping with the keys 'text', 'output' and 'label'. 'output' is
            serialized with ``json.dumps``.
        eos_token: Marker appended after the label. The default keeps the
            historical literal for backward compatibility; pass the tokenizer's
            own EOS token so the model learns a real stop token.

    Returns:
        The formatted training string.
    """
    return (
        f"### Text: {example['text']}\n"
        f"### Output: {json.dumps(example['output'])}\n"
        f"### Label: {example['label']}{eos_token}"
    )


# Use the EOS token of the loaded tokenizer. The literal "<|endoftext|>" is not
# the tokenizer's EOS for this checkpoint, so it would be trained as plain text
# and the model would never learn where an answer ends.
EOS_TOKEN = tokenizer.eos_token
if not EOS_TOKEN:
    raise ValueError("tokenizer.eos_token is not set; cannot terminate training samples.")

# Apply the formatting to every sample in the loaded training data.
# 'train_data' must already be defined and loaded as a Python list of dicts.
formatted_data = [format_prompt(item, eos_token=EOS_TOKEN) for item in train_data]

# Wrap the formatted strings in a Hugging Face Dataset with a single 'text' column,
# which is the field the trainer is configured to read.
dataset = Dataset.from_dict({"text": formatted_data})

# Show confirmation and sample preview for validation
print(f"Dataset formatted successfully. Total samples: {len(dataset)}")
print(dataset[0])


In [None]:
# Attach LoRA adapters to the loaded model so that only the low-rank adapter
# weights are trained (parameter-efficient fine-tuning).

# Projection layers that receive adapters, grouped by role.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_settings = {
    "r": 64,                                  # LoRA rank (size of the low-rank update)
    "target_modules": attention_projections + mlp_projections,
    "lora_alpha": 128,                        # LoRA scaling factor (2x the rank here)
    "lora_dropout": 0,                        # No dropout on the adapter layers
    "bias": "none",                           # Bias terms are not trained
    "use_gradient_checkpointing": "unsloth",  # Unsloth's gradient-checkpointing mode
    "random_state": 3407,                     # Seed for adapter initialization
    "use_rslora": False,                      # Rank-stabilized LoRA is off
    "loftq_config": None,                     # No LoftQ configuration
}

model = FastLanguageModel.get_peft_model(model, **lora_settings)



Unsloth 2025.11.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Build the supervised fine-tuning trainer in two steps: the optimization
# settings first, then the SFTTrainer that ties them to the model and dataset.

# Decide the mixed-precision mode once: BF16 when the GPU supports it, else FP16.
bf16_available = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,   # Samples per device per step
    gradient_accumulation_steps=8,   # 2 x 8 = 16 samples per optimizer update
    warmup_steps=25,                 # Learning-rate warmup length, in steps
    num_train_epochs=1,              # Single pass over the training data
    learning_rate=1e-4,              # Peak learning rate
    fp16=not bf16_available,
    bf16=bf16_available,
    logging_steps=25,                # Report the training loss every 25 steps
    optim="adamw_8bit",              # 8-bit AdamW optimizer
    weight_decay=0.1,                # Weight-decay coefficient
    lr_scheduler_type="linear",      # Linear learning-rate schedule
    seed=3407,                       # Seed for reproducibility
    output_dir="outputs",            # Checkpoint directory
    save_strategy="epoch",           # Checkpoint at the end of each epoch
    save_total_limit=2,              # Keep at most two checkpoints
    dataloader_pin_memory=False,     # Pinned-memory DataLoader transfers are turned OFF
    report_to="none",                # No external experiment tracking
)

trainer = SFTTrainer(
    model=model,                     # LoRA-wrapped model
    tokenizer=tokenizer,             # Tokenizer loaded alongside the model
    train_dataset=dataset,           # Formatted training strings
    dataset_text_field="text",       # Column of `dataset` holding the training text
    max_seq_length=max_seq_length,   # Same length limit used when loading the model
    dataset_num_proc=2,              # Worker processes for dataset preprocessing
    args=training_args,
)


In [None]:
# Run the fine-tuning loop configured on `trainer`. The value returned by
# `train()` is kept in `trainer_stats` for later inspection.
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,372 | Num Epochs = 1 | Total steps = 149
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 97,255,424 of 3,310,005,248 (2.94% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,1.8385
50,0.3455
75,0.2653
100,0.2571
125,0.248


Step,Training Loss
25,1.8385
50,0.3455
75,0.2653
100,0.2571
125,0.248


In [None]:
import pandas as pd
import torch

import re

# Load the test dataset for evaluation.
# 'test_balanced.json' should contain the evaluation set with the same structure
# as the training set (columns 'text', 'output' and 'label' are read below).
test_df = pd.read_json("test_balanced.json")

MAX_NEW_TOKENS = 256                 # Generation budget per sample
LABEL_MARKER = "### Label:"          # Field marker used by the training template
# Whole-word match so that "EDA" is never picked out of the inside of "NEDA".
LABEL_PATTERN = re.compile(r"\b(NEDA|EDA)\b", re.IGNORECASE)


def build_inference_prompt(text):
    """Return the prompt for one sample in the training template.

    Training strings are produced by ``format_prompt`` as
    ``### Text: ...\\n### Output: ...\\n### Label: ...``. The prompt stops right
    after ``### Output:`` so the model continues with the structured output and
    then the label, exactly as it saw during fine-tuning.
    """
    return f"### Text: {text}\n### Output:"


def parse_label(generated_text):
    """Extract the predicted class from generated text.

    The label is read from the text following the first ``### Label:`` marker.
    If the marker is missing, the whole generation is searched instead.

    Returns:
        "EDA" or "NEDA", or None when no label can be found.
    """
    _, marker, after_marker = generated_text.partition(LABEL_MARKER)
    search_region = after_marker if marker else generated_text
    found = LABEL_PATTERN.search(search_region)
    return found.group(1).upper() if found else None


# Switch the Unsloth model into inference mode before generating.
FastLanguageModel.for_inference(model)

# Containers for the classification results
TP = []  # True Positives: correctly predicted EDA
TN = []  # True Negatives: correctly predicted NEDA
FP = []  # False Positives: predicted EDA but true label NEDA
FN = []  # False Negatives: predicted NEDA but true label EDA
all_results = []  # Full details of each prediction
buckets = {"TP": TP, "TN": TN, "FP": FP, "FN": FN}
unparsed_count = 0  # Generations from which no label could be extracted

for _, row in test_df.iterrows():
    text_input = row['text']           # Raw text input
    true_output = row['output']        # Ground-truth extracted output
    true_label = row['label']          # Ground-truth classification label

    # Tokenize the prompt; this yields both input_ids and attention_mask.
    inputs = tokenizer(build_inference_prompt(text_input), return_tensors="pt").to("cuda")
    prompt_length = inputs["input_ids"].shape[1]

    # Greedy decoding: an evaluation should give the same answer on every run.
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        use_cache=True,
        do_sample=False,
    )

    # Keep only the newly generated tokens and drop special tokens from the text.
    generated_tokens = outputs[0, prompt_length:]
    pred_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    pred_label = parse_label(pred_output)
    if pred_label is None:
        # Keep the historical NEDA fallback so every row is classified, but
        # count these cases so their influence on the metrics is visible.
        unparsed_count += 1
        pred_label = "NEDA"
        print(f"⚠️ WARNING: Unexpected model output: '{pred_output}'")

    # Confusion-matrix cell for this sample (EDA is the positive class).
    if true_label == "EDA":
        match = "TP" if pred_label == "EDA" else "FN"
    else:
        match = "FP" if pred_label == "EDA" else "TN"

    record = [text_input, true_output, true_label, pred_output, pred_label, match]
    buckets[match].append(record)
    all_results.append(list(record))  # Independent copy for the full log

print(f"Unparseable predictions (counted as NEDA): {unparsed_count} of {len(all_results)}")

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    if denominator > 0:
        return numerator / denominator
    return 0


# Size of each confusion-matrix cell
num_TP, num_TN, num_FP, num_FN = (len(bucket) for bucket in (TP, TN, FP, FN))

# Denominators of the individual metrics
total_samples = num_TP + num_TN + num_FP + num_FN
precision_denominator = num_TP + num_FP
recall_denominator = num_TP + num_FN

# Standard binary-classification metrics, each guarded against division by zero
accuracy = _safe_ratio(num_TP + num_TN, total_samples)
precision = _safe_ratio(num_TP, precision_denominator)
recall = _safe_ratio(num_TP, recall_denominator)
f1_denominator = precision + recall
f1 = _safe_ratio(2 * (precision * recall), f1_denominator)

# Report the metrics, three decimals each
print("Metrics:")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.3f}")

# Report the raw counts behind the metrics
print("\nCounts:")
print(f"TP: {num_TP}, TN: {num_TN}, FP: {num_FP}, FN: {num_FN}")


Metrics:
Accuracy: 0.616
Precision: 0.707
Recall: 0.397
F1 Score: 0.509

Counts:
TP: 118, TN: 248, FP: 49, FN: 179


In [None]:
# Column names, in the same order as the six-element rows stored in `all_results`.
result_columns = [
    "text",          # Original input text
    "true_output",   # Ground-truth structured output
    "true_label",    # Ground-truth class label
    "pred_output",   # Raw text generated by the model
    "pred_label",    # Class label parsed from the generated text
    "result_type",   # Outcome category: TP, TN, FP or FN
]

# One row per evaluated sample, ready for inspection and export.
results_df = pd.DataFrame(all_results, columns=result_columns)

# Preview the first rows before exporting.
results_df.head()


In [None]:
# Write the per-sample predictions to a spreadsheet, leaving out the index column.
predictions_path = "model_predictions_comparison.xlsx"
results_df.to_excel(predictions_path, index=False)

In [None]:
# Export the model together with its tokenizer through `save_pretrained_gguf`,
# targeting the "gguf_model" directory and requesting the "q4_k_m"
# quantization method.
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")

In [None]:
from google.colab import files
import os

# Locate the exported GGUF model file and download it to the local machine.
GGUF_DIR = "gguf_model"

# Sort the matches so the chosen file does not depend on directory listing order,
# and tolerate a missing export directory instead of raising.
if os.path.isdir(GGUF_DIR):
    gguf_files = sorted(name for name in os.listdir(GGUF_DIR) if name.endswith(".gguf"))
else:
    gguf_files = []

if gguf_files:
    gguf_file = os.path.join(GGUF_DIR, gguf_files[0])
    print(f"Downloading: {gguf_file}")
    files.download(gguf_file)
else:
    # Say so explicitly rather than finishing silently with nothing downloaded.
    print(f"No .gguf file found in '{GGUF_DIR}'; nothing to download.")
