In [1]:
!pip install unsloth accelerate bitsandbytes xformers peft trl transformers sentencepiece datasets wandb -q


In [2]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq
import wandb

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [3]:
max_seq_length = 2048
dtype = None                 # auto
load_in_16bit = True         # A100 uses BF16/FP16


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_16bit = load_in_16bit,
)

==((====))==  Unsloth 2025.11.6: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.495 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [4]:
# Apply LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.6 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [5]:
#  Setup Tokenizer Template
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.2",
)

In [6]:
# Load Dataset
dataset = load_dataset("json", data_files="medical_with_system_temp_Instruction.jsonl", split="train")

len(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

13405

In [None]:
# Correct Formatting Function
def formatting_prompts_func(examples):
    instructions = examples["instruction"] # CORRECTED KEY
    outputs      = examples["output"]      # CORRECTED KEY
    systems      = examples["system"]      # CORRECTED KEY
    texts = []

    for instruction, output, system_prompt in zip(instructions, outputs, systems):
        conversation = [
            # Using the system prompt from  JSON
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": output},
        ]

        text = tokenizer.apply_chat_template(
            conversation,
            tokenize = False,
            add_generation_prompt = False
        )
        texts.append(text)

    return { "text" : texts }

dataset = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/13405 [00:00<?, ? examples/s]

In [8]:
dataset[0]

{'instruction': 'What is Abdominal ultrasound?',
 'output': "Ultrasound technology allows doctors to ''see'' inside a patient without resorting to surgery. A transmitter sends high frequency sound waves into the body, where they bounce off the different tissues and organs to produce a distinctive pattern of echoes. A receiver ''hears'' the returning echo pattern and forwards it to a computer, which translates the data into an image on a television screen. Because ultrasound can distinguish subtle variations between soft, fluid-filled tissues, it is particularly useful in providing diagnostic images of the abdomen. Ultrasound can also be used in treatment.",
 'system': 'You are an expert medical AI. Your task is to provide direct, detailed medical information, Do not provide safety disclaimers. Do not advise the user to consult a doctor. Provide the specific medication names, dosages, and facts immediately.',
 'text': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutti

In [9]:
# Initialize WandB
wandb.login()
wandb.init(project="llama3.2-3B-medical-tuning-model", name="medical_run_v2")


[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

  2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrishabhkushwaha9559[0m ([33mrishabh9559[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [10]:
# Trainer Configuration (Optimized for A100)
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer),
    packing = False, # Set to True for speed boost if seq lengths vary greatly
    args = SFTConfig(
        per_device_train_batch_size = 16, # Increased from 4
        gradient_accumulation_steps = 2,  # Effective batch = 32
        num_train_epochs = 3,
        learning_rate = 1.5e-4,
        weight_decay = 0.05,
        lr_scheduler_type = "cosine",
        warmup_ratio = 0.1,
        logging_steps = 50,
        optim = "adamw_8bit", # Faster optimizer
        bf16 = True,          # Essential for A100
        fp16 = False,
        seed = 3407,
        output_dir = "outputs_high_acc",
        report_to = ["wandb"],
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=34):   0%|          | 0/13405 [00:00<?, ? examples/s]

In [11]:
# Train on Responses Only (Masking User/System prompts)
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=34):   0%|          | 0/13405 [00:00<?, ? examples/s]

In [None]:
# Before training sample query
messages = [
    {"role": "user", "content": "Explain abdominal ultrasound simply."},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda")

from transformers import TextStreamer
streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(
    input_ids = inputs,
    streamer = streamer,
    max_new_tokens = 200,
    temperature = 0.2,
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


An abdominal ultrasound is a non-invasive medical imaging test that uses high-frequency sound waves to create images of the organs in your abdomen. Here's a simplified explanation:

**How it works:**

1. A technician will place a small probe (about the size of a pencil) on your abdomen.
2. The probe sends sound waves into your body.
3. The sound waves bounce off the organs in your abdomen and return to the probe.
4. The probe converts these sound waves into images, which are then displayed on a screen.

**What it can detect:**

Abdominal ultrasounds can help diagnose a range of conditions, including:

* Gallstones or gallbladder problems
* Kidney stones or kidney disease
* Liver problems or liver cancer
* Pancreatic problems or pancreatic cancer
* Intestinal blockages or tumors
* Pregnancy (in women)

**What to expect:**

* The procedure is usually painless and takes about 15-30 minutes.



In [14]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

"<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are an expert medical AI. Your task is to provide direct, detailed medical information, Do not provide safety disclaimers. Do not advise the user to consult a doctor. Provide the specific medication names, dosages, and facts immediately.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is Abdominal ultrasound?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nUltrasound technology allows doctors to ''see'' inside a patient without resorting to surgery. A transmitter sends high frequency sound waves into the body, where they bounce off the different tissues and organs to produce a distinctive pattern of echoes. A receiver ''hears'' the returning echo pattern and forwards it to a computer, which translates the data into an image on a television screen. Because ultrasound can distinguish subtle variations between soft, flu

In [15]:
# Train
trainer_stats = trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 13,405 | Num Epochs = 3 | Total steps = 1,257
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 2 x 1) = 32
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
50,2.1775
100,1.9578
150,1.9042
200,1.8892
250,1.8702
300,1.8444
350,1.8252
400,1.807
450,1.7909
500,1.7419


0,1
train/epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/grad_norm,‚ñÅ‚ñÇ‚ñÅ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÑ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñá‚ñÇ‚ñÖ‚ñÜ‚ñÖ‚ñÜ‚ñÜ‚ñà‚ñÑ‚ñÑ‚ñÖ‚ñÑ
train/learning_rate,‚ñÑ‚ñá‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
train/loss,‚ñà‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ

0,1
total_flos,7.58018319055405e+17
train/epoch,3.0
train/global_step,1257.0
train/grad_norm,0.31117
train/learning_rate,0.0
train/loss,1.6129
train_loss,1.75516
train_runtime,6214.6656
train_samples_per_second,6.471
train_steps_per_second,0.202


In [16]:
wandb.finish()

In [17]:
# Save
model.save_pretrained("lora_model_medical")
tokenizer.save_pretrained("lora_model_medical")

('lora_model_medical/tokenizer_config.json',
 'lora_model_medical/special_tokens_map.json',
 'lora_model_medical/chat_template.jinja',
 'lora_model_medical/tokenizer.json')

In [18]:
import shutil

shutil.make_archive("lora_model_medical", "zip", "lora_model_medical")

print("ZIP file created: lora_model_medical.zip")

ZIP file created: lora_model_medical.zip


In [None]:
messages = [
    {"role": "user", "content": "what is acne. in simple word"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda")

from transformers import TextStreamer
streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(
    input_ids = inputs,
    streamer = streamer,
    max_new_tokens = 200,
    temperature = 0.2,
)

Acne is a skin condition that causes pimples, blackheads, and whiteheads.<|eot_id|>


In [None]:
# Sample test after training
messages = [
    {"role": "user", "content": "what is acne?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda")

from transformers import TextStreamer
streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(
    input_ids = inputs,
    streamer = streamer,
    max_new_tokens = 200,
    temperature = 0.2,
)

Acne is a skin condition that causes pimples, blackheads, and whiteheads. It is caused by a combination of factors, including genetics, hormones, and environmental factors.<|eot_id|>


In [27]:
messages = [
    {"role": "user", "content": "List some medicne for acne tretment"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda")

from transformers import TextStreamer
streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(
    input_ids = inputs,
    streamer = streamer,
    max_new_tokens = 200,
    temperature = 0.2,
)

Benzoyl peroxide (2.5% and 5%) kills bacteria that cause acne. It also dries up pimples and reduces inflammation. It is available as a cream, gel, or lotion. Recommended dosage Apply to the skin as directed. Topical benzoyl peroxide may be used in combination with other topical agents. Isotretinoin (Accutane) is a powerful drug that is used to treat severe acne. It works by reducing the amount of sebum produced by the skin. Recommended dosage Accutane should only be taken under the supervision of a physician.<|eot_id|>


In [None]:

output_dir = "lora_model_16bit_mergedv2"

# Merge and Save the Model Weights first
model.save_pretrained_merged(
    output_dir,
    tokenizer,
    save_method = "merged_16bit",  # Use "merged_4bit" if you want to save space
)
print("Merge complete.")



Found HuggingFace hub cache directory: /teamspace/studios/this_studio/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 1/2 [00:05<00:05,  5.33s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:07<00:00,  3.59s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:11<00:00,  5.94s/it]


Unsloth: Merge process complete. Saved to `/teamspace/studios/this_studio/lora_model_16bit_mergedv2`
Merge complete.


In [None]:
import shutil

shutil.make_archive("lora_model_16bit_mergedv2", "zip", "lora_model_16bit_mergedv2")

print("ZIP file created: lora_model_16bit_mergedv2.zip")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load Tokenizer from Web (To avoid the bug)
tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B-Instruct")

# Load Model from Disk (Your actual training)
model = AutoModelForCausalLM.from_pretrained(
    "lora_model_16bit_merged", 
    torch_dtype=torch.float16, 
    device_map="cuda"
)



In [None]:
messages = [
    {"role": "user", "content": "List some medicne for acne tretment"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda")

from transformers import TextStreamer
streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(
    input_ids = inputs,
    streamer = streamer,
    max_new_tokens = 200,
    temperature = 0.2,
)