In [None]:
import json
from datasets import Dataset

# Load the training dataset (balanced version) from a JSON file.
# Each record is expected to contain the training text and classification label.
with open("train_balanced.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Output a single record for structural verification of the dataset.
print(train_data[1])

# Convert loaded records into a Hugging Face Dataset object
# This enables efficient preprocessing and model training compatibility.
train_dataset = Dataset.from_list(train_data)

# Display sample content and dataset schema to confirm correct field formatting.
print(train_dataset[0])
print(train_dataset.features)


{'text': 'Age:40.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation sensory, motor weakness, optic neuritis. Number of attacks until the first visit:2. OCBs in CSF:negative. Visit diagnosis:rrms. DMT taken. Current DMT:tecfidera-dimethylfumarate on June 21, 2018. Vit D at visit: 61.47. MRI of brain or spine done. Received Gadolinium for Brain MRI. No new lesions on Brain MRI. Total T1 Gad-enhancing lesions in Brain: 0. Cervical spine MRI date: May 02, 2019. Received Gadolinium for Cervical spine MRI. Cervical spine MRI: Not compared to previous spine MRI. Total T1 Gad-enhancing lesions in Cervical spine: 0. The 25-foot walk done. 25-foot walk test time was 4.25 sec. SDMT done. SDMT numerator: 71. SDMT denominator: 71. 9-HPT done. 9-HPT ability: yes, able with right and left hands. Dominant hand:Right.  9-hole peg dominant:18 sec. 9-hole peg non-dominant:15.74 sec. EDSS:1- no disability, minimal signs in one FS (one FS grade 1). The patient is in NEDA. Wil

In [None]:
!pip install unsloth trl peft accelerate bitsandbytes



In [None]:
# For GPU check
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

CUDA available: True
GPU: Tesla T4


In [None]:
from unsloth import FastLanguageModel
import torch

model_name = "unsloth/Llama-3.2-3B-Instruct"

max_seq_length = 2048  # Maximum number of tokens for input sequences
dtype = None  # Automatically determine the appropriate data type for model loading

# Load the LLaMA 3B model and corresponding tokenizer in 4-bit precision to optimize GPU memory usage
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)


In [None]:
from datasets import Dataset

# Convert each training example into a structured prompt for fine-tuning
def format_prompt(example):
    return f"### Text: {example['text']}\n### Output: {json.dumps(example['output'])}\n### Label: {example['label']}<|endoftext|>"

# Apply the formatting function to all training examples
formatted_data = [format_prompt(item) for item in train_data]

# Create a Hugging Face Dataset from the formatted data to use with the trainer
dataset = Dataset.from_dict({"text": formatted_data})


In [None]:
# Integrate LoRA adapters into the pre-trained model for parameter-efficient fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=64,  # LoRA rank determines adapter capacity; higher = more expressive but uses more memory
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # Core attention projections
        "gate_proj", "up_proj", "down_proj",    # Feed-forward network projections
    ],
    lora_alpha=128,          # Scaling factor to balance adapter updates
    lora_dropout=0,          # No dropout for stability during training
    bias="none",             # Disable additional bias parameters
    use_gradient_checkpointing="unsloth",  # Optimized gradient checkpointing for memory efficiency
    random_state=3407,       # Ensure reproducibility
    use_rslora=False,        # Disable rank-stabilized LoRA for this run
    loftq_config=None,       # No custom quantization configuration
)


Unsloth 2025.11.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# Initialize SFTTrainer for fine-tuning the model using parameter-efficient LoRA adapters
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,           # Training dataset containing formatted text prompts
    dataset_text_field="text",       # Field in dataset to use for training
    max_seq_length=max_seq_length,   # Maximum sequence length for tokenization
    dataset_num_proc=2,              # Number of processes for dataset preprocessing
    args=TrainingArguments(
        per_device_train_batch_size=2,       # Batch size per GPU
        gradient_accumulation_steps=8,       # Accumulate gradients to simulate larger batch size
        warmup_steps=50,                     # Number of warmup steps for learning rate scheduling
        num_train_epochs=3,                  # Total epochs for fine-tuning
        learning_rate=2e-4,                  # Initial learning rate
        fp16=not torch.cuda.is_bf16_supported(), # Use FP16 if BF16 not supported
        bf16=torch.cuda.is_bf16_supported(),     # Use BF16 if supported by GPU
        logging_steps=10,                     # Frequency of logging training info
        optim="adamw_8bit",                   # Optimizer using 8-bit AdamW for memory efficiency
        weight_decay=0.01,                    # Weight decay for regularization
        lr_scheduler_type="linear",           # Linear learning rate decay
        seed=3407,                             # Seed for reproducibility
        output_dir="outputs",                  # Directory to save checkpoints and outputs
        report_to="none",                      # Disable external logging (e.g., WandB)
    ),
)


In [None]:
# Train the model
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,372 | Num Epochs = 3 | Total steps = 447
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 97,255,424 of 3,310,005,248 (2.94% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.8662
20,1.4187
30,0.5274
40,0.3443
50,0.296
60,0.2702
70,0.2619
80,0.253
90,0.2595
100,0.2458


In [None]:
import pandas as pd
import torch

# Enable optimized inference mode for the fine-tuned model using Unsloth
FastLanguageModel.for_inference(model)

# Load the test dataset containing text samples and labels
test_df = pd.read_json("test_balanced.json")

# Initialize lists to store classification outcomes
TP, TN, FP, FN = [], [], [], []
all_results = []

# Iterate over each test example for evaluation
for idx, row in test_df.iterrows():
    text_input = row['text']         # Raw input text for classification
    true_output = row['output']      # Ground-truth structured output
    true_label = row['label']        # Ground-truth class label (EDA/NEDA)

    messages = [{"role": "user", "content": text_input}]

    # Tokenize and prepare inputs for model generation
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",  # Returns dictionary with input_ids and attention_mask
    ).to("cuda")

    # Generate model output sequence (includes input + generated tokens)
    outputs = model.generate(
        input_ids=inputs,
        attention_mask=torch.ones_like(inputs),  # Ensures proper attention handling
        max_new_tokens=256,
        use_cache=True,
        temperature=0.1,
        do_sample=True,
        top_p=0.9,
    )

    # Decode the full output sequence including the prompt
    full_output_sequence = tokenizer.batch_decode(outputs)[0].strip()

    # Extract only the generated portion beyond the input prompt
    generated_tokens = outputs[:, inputs.shape[1]:]
    pred_output = tokenizer.batch_decode(generated_tokens)[0].strip()

    # Convert model output to uppercase for robust label parsing
    pred_upper = pred_output.upper()
    if "NEDA" in pred_upper:
        pred_label = "NEDA"
    elif "EDA" in pred_upper:
        pred_label = "EDA"
    else:
        pred_label = "NEDA"  # Default fallback
        print(f"⚠️ WARNING: Unexpected model output: '{pred_output}'")

    # Determine classification outcome category (TP, TN, FP, FN)
    match = ""
    if true_label == "EDA":
        if pred_label == "EDA":
            TP.append([text_input, true_output, true_label, pred_output, pred_label, "TP"])
            match = "TP"
        else:
            FN.append([text_input, true_output, true_label, pred_output, pred_label, "FN"])
            match = "FN"
    else:
        if pred_label == "EDA":
            FP.append([text_input, true_output, true_label, pred_output, pred_label, "FP"])
            match = "FP"
        else:
            TN.append([text_input, true_output, true_label, pred_output, pred_label, "TN"])
            match = "TN"

    # Append all relevant info for later analysis
    all_results.append([
        text_input, true_output, true_label, pred_output, pred_label, match
    ])

# Calculate evaluation metrics with safety for division by zero
num_TP, num_TN, num_FP, num_FN = len(TP), len(TN), len(FP), len(FN)
total_samples = num_TP + num_TN + num_FP + num_FN
accuracy = (num_TP + num_TN) / total_samples if total_samples > 0 else 0
precision = num_TP / (num_TP + num_FP) if (num_TP + num_FP) > 0 else 0
recall = num_TP / (num_TP + num_FN) if (num_TP + num_FN) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Print metrics summary
print("Metrics:")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

# Print counts of each classification category
print("\nCounts:")
print(f"TP: {num_TP}, TN: {num_TN}, FP: {num_FP}, FN: {num_FN}")


Metrics:
Accuracy: 0.611
Precision: 0.706
Recall: 0.380
F1 Score: 0.495

Counts:
TP: 113, TN: 250, FP: 47, FN: 184


In [None]:
# Convert all evaluation results into a structured DataFrame for saving and analysis
results_df = pd.DataFrame(
    all_results,
    columns=[
        "text",          # Original input text
        "true_output",   # Ground-truth structured output
        "true_label",    # Ground-truth class label (EDA / NEDA)
        "pred_output",   # Raw text generated by the model
        "pred_label",    # Parsed predicted class label (EDA / NEDA)
        "result_type"    # Classification outcome category: TP, TN, FP, FN
    ],
)

# Inspect the first few rows to verify format before exporting
results_df.head()


In [None]:
results_df.to_excel("model_predictions_comparison.xlsx", index=False)

In [None]:
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")

In [None]:
from google.colab import files
import os

# Locate the fine-tuned GGUF model file and download it to the local machine for use or sharing
gguf_files = [f for f in os.listdir("gguf_model") if f.endswith(".gguf")]
if gguf_files:
    gguf_file = os.path.join("gguf_model", gguf_files[0])
    print(f"Downloading: {gguf_file}")
    files.download(gguf_file)
