In [1]:
#  Check GPU availability
# Test if CUDA is available
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

CUDA available: True
GPU device: Tesla T4


In [2]:
# Create and setup hugging face API key in secrets (HF_TOKEN)

In [3]:
# Install required dependencies
!pip install weave
!pip install unsloth # install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git # Also get the latest version Unsloth!


Collecting unsloth
  Downloading unsloth-2025.12.9-py3-none-any.whl.metadata (65 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.9/65.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.12.7 (from unsloth)
  Downloading unsloth_zoo-2025.12.7-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from u

Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-arro7qna
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-arro7qna
  Resolved https://github.com/unslothai/unsloth.git to commit cf4342bf41e4a93573d08392b11f8093b30ddb8f
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.12.9-py3-none-any.whl size=383294 sha256=4b0eb91e4fcc0cd4b9a07d787dcf51dfed13b0718f5ad668ad15a540eefb2e12
  Stored in directory: /tmp/pip-ephem-wheel-cache-niuacllu/wheels/60/3e/1f/e576c07051d90cf64b6a41434d87ccf4db33fafd5343bf5de0
Successfully built unsloth
Installing collected packages: unsloth


In [4]:
#  Import necessary libraries
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer # TRL supports the Supervised Fine-Tuning (SFT) Trainer for training language models.
from unsloth import is_bfloat16_supported
from huggingface_hub import login
from transformers import TrainingArguments
from datasets import load_dataset
import wandb

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
# Check Hugging face token

# Check HF token
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')
login(hf_token)


In [6]:
# Check WANDB_API_TOKEN token
from google.colab import userdata

wandb_token = userdata.get('WANDB_API_TOKEN')

if wandb_token:
    print("WANDB_API_TOKEN successfully retrieved!")
    # For security reasons, I won't print the full token here,
    # but you can inspect the 'wandb_token' variable if needed.
    print(f"First 5 characters of WANDB_API_TOKEN: {wandb_token[:5]}...")
else:
    print("WANDB_API_TOKEN not found in Colab secrets. Please add it.")

WANDB_API_TOKEN successfully retrieved!
First 5 characters of WANDB_API_TOKEN: 
# S...


In [7]:
# Setup pretrained deep seek r1
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
MAX_SEQUENCE_LENGTH = 2048
DTYPE = None # We keep it None , when we give data to our model it can check in which formatt he data it is getting reset
LOAD_IN_4BIT = True # Quantized model finetune


# Use model from  unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
    token =  hf_token
)

==((====))==  Unsloth 2025.12.9: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [8]:
# To Test The Model (System Prompt)
prompt_style = """
Below is a task description along with additional context provided in the input section. Your goal is to provide an accurate, clinically sound response that directly addresses the request.

Use your medical expertise to ensure the answer is evidence-based, clearly structured, and easy to understand. When appropriate, briefly explain the rationale behind key conclusions without revealing internal reasoning processes.

### Task:
You are a medical expert specializing in clinical reasoning, diagnostics, and treatment planning. Answer the medical question below using your advanced knowledge.
Provide a clear, concise, and clinically appropriate response. Include relevant explanations, differential considerations, and recommendations as needed.

### Query:
{question}

### Answer:

"""


In [9]:
# Ask a complicated question medical reasoning question from the model before finetuning

# Inference on the model
# question asked
question = """
A 61-year-old woman with a long history of involuntary urine leakage during activities such as coughing or sneezing,
but no nocturnal leakage, undergoes a gynecologic examination and a Q-tip test.
Based on these findings, what would cystometry most likely show regarding her postvoid residual volume and detrusor muscle contractions?

"""
# inference - Provide a clear, concise, and clinically appropriate response. Include relevant explanations, differential considerations, and recommendations as needed.
# Take inference with the help of unsloth
FastLanguageModel.for_inference(model)

# Tokenize the input
inputs = tokenizer([prompt_style.format(question=question)], return_tensors="pt").to("cuda")  # tokenize the input

# Generate a response
outputs = model.generate (
    input_ids = inputs.input_ids, # Think of this as the question you're giving to the AI model, but translated into numbers the computer understands.
    attention_mask = inputs.attention_mask, # Imagine you're highlighting the important parts of your question and crossing out any extra spaces or empty bits.(attention mask is used for padding)
    max_new_tokens = 1200, # Response Limit
    use_cache = True # if similar question is asked instead of running a whole , store it in cache fro there it will generate the output for fast response
)

# Decode the response tokens back to text
# response = tokenizer.batch_decode(outputs)

# Decode the response tokens back to text
# better response
# response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [10]:
# Decode the response tokens back to text
response = tokenizer.batch_decode(outputs)
# Print full output
print(response)

["<｜begin▁of▁sentence｜>\nBelow is a task description along with additional context provided in the input section. Your goal is to provide an accurate, clinically sound response that directly addresses the request.\n\nUse your medical expertise to ensure the answer is evidence-based, clearly structured, and easy to understand. When appropriate, briefly explain the rationale behind key conclusions without revealing internal reasoning processes.\n\n### Task:\nYou are a medical expert specializing in clinical reasoning, diagnostics, and treatment planning. Answer the medical question below using your advanced knowledge.\nProvide a clear, concise, and clinically appropriate response. Include relevant explanations, differential considerations, and recommendations as needed.\n\n### Query:\n\nA 61-year-old woman with a long history of involuntary urine leakage during activities such as coughing or sneezing,\nbut no nocturnal leakage, undergoes a gynecologic examination and a Q-tip test.\nBased

In [11]:
# Decode the response tokens back to text
# better response
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# Print full output
print(response)




Below is a task description along with additional context provided in the input section. Your goal is to provide an accurate, clinically sound response that directly addresses the request.

Use your medical expertise to ensure the answer is evidence-based, clearly structured, and easy to understand. When appropriate, briefly explain the rationale behind key conclusions without revealing internal reasoning processes.

### Task:
You are a medical expert specializing in clinical reasoning, diagnostics, and treatment planning. Answer the medical question below using your advanced knowledge.
Provide a clear, concise, and clinically appropriate response. Include relevant explanations, differential considerations, and recommendations as needed.

### Query:

A 61-year-old woman with a long history of involuntary urine leakage during activities such as coughing or sneezing,
but no nocturnal leakage, undergoes a gynecologic examination and a Q-tip test.
Based on these findings, what would cysto

In [12]:
# Extract the answer part along with the think
answer = response.split("### Answer:")[-1].strip()
print(answer)

Cystometry, specifically a filling cytometry test, is the most appropriate next step. This test evaluates the capacity of the bladder and the contractile function of the detrusor muscle. The findings from this test can provide direct information about the post-void residual volume and the presence of detrusor muscle contractions, which are key components in the diagnosis and management of conditions like urinary incontinence.

Additionally, if the Q-tip test was performed and showed abnormalities, it would be crucial to correlate these findings with the results of the cystometry to ensure a comprehensive assessment of the patient's bladder function. The Q-tip test primarily assesses the closure mechanism of the urethra and is often used in conjunction with other diagnostic tools like cystometry to achieve accurate diagnostic conclusions.

In summary, performing a filling cytometry test would provide the necessary clinical information to better understand the patient's urinary leakage m

In [13]:
#### Setup Fine tuning

In [14]:
# Load the Data set
# medical_dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split = "train[:500]", trust_remote_code = True)
medical_dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split = "train[:500]")
# dataset - verified data used to train openai model (https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT)
# What the data set has (Question , chain of thought, response)
# "en" - English Language
# "train[:500]" :- It is a big data set so we will use the first 500 samples



README.md: 0.00B [00:00, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19704 [00:00<?, ? examples/s]

In [15]:
# First Object
medical_dataset[0]


{'Question': 'Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?',
 'Complex_CoT': "Okay, let's see what's going on here. We've got sudden weakness in the person's left arm and leg - and that screams something neuro-related, maybe a stroke?\n\nBut wait, there's more. The right lower leg is swollen and tender, which is like waving a big flag for deep vein thrombosis, especially after a long flight or sitting around a lot.\n\nSo, now I'm thinking, how could a clot in the leg end up causing issues like weakness or stroke symptoms?\n\nOh, right! There's this thing called a paradoxical embolism. It can happen if there's some kind of short circuit in the heart - like a hole that shouldn't be there.\n\nLet's put this together: if a blood clot from the leg somehow travels to the l

In [16]:
# when we are getting the output
# (response = tokenizer.batch_decode(outputs))
# so when we print the above it has <｜begin▁of▁sentence｜> () and <｜end▁of▁sentence｜> (It is used both for training and generating output)
# The tokens like <｜begin▁of▁sentence｜> and <｜end▁of▁sentence｜> are special tokens used by the model or tokenizer to mark the beginning and end of a generated sequence
# For finetuning, it's crucial to tell the model where the response ends with the help of this <｜end▁of▁sentence｜>

EOS_TOKEN = tokenizer.eos_token  # Define EOS_TOKEN which tells the model when to stop generating text during training
EOS_TOKEN


'<｜end▁of▁sentence｜>'

In [17]:
# Updated training prompt style to add </think> tag
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""

In [18]:
# We have to format the data  in a specific format , so that we can give it to the model for training (Labelling)

# Prepare the data for finetuning
def preprocess_input_data(examples): # examples is typically a dictionary containing lists of values for each column
  inputs = examples["Question"] # This line extracts the list of questions from the examples dictionary and assigns it to the variable inputs
  cots= examples["Complex_CoT"] # similarly, this extracts the list of 'Complex_CoT' (Chain of Thought) from examples and assigns it to cots
  outputs = examples["Response"] # This extracts the list of desired responses and assigns it to responses

  texts = [] # An empty list called texts is initialized. This list will store the newly formatted strings that will be used for training.


  for input, cot, output in zip(inputs, cots, outputs): # The zip() function is like a zipper on a jacket. It takes these three lists and combines them item by item
    text = train_prompt_style.format(input, cot, output) + EOS_TOKEN # the input will automaticlly go in (### Questions {input}) in prompt
    texts.append(text)

  return {
      "texts" : texts,
  }



In [19]:
# Convert the dataset to finetuning dataset
finetune_dataset = medical_dataset.map(preprocess_input_data , batched = True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [20]:
# Calling the first object of the text
finetune_dataset["texts"][0]
# the prompt has
# the question after the ### Question:{} ,
# Complex_CoT between <think>
# and Response after <think>


"Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.\nPlease answer the following medical question.\n\n### Question:\nGiven the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?\n\n### Response:\n<think>\nOkay, let's see what's going on here. We've got sudden weakness in the person's left arm and leg - and that screams something neuro-related, maybe a stroke?\n\nBut wait, there's more. The right lower leg is sw

In [21]:
# Setup or apply Lora Fine tuning to the model
# made the model with  trainable adpters
# peft :- full form in copy

# FastLanguageModel.get_peft_model function to apply LoRA (Low-Rank Adaptation) for efficient fine-tuning of the pre-trained language model.
model_lora = FastLanguageModel.get_peft_model(

    # model=model: This is your main, pre-trained language model – the powerful base car we talked about.
    model=model,

    # r=16: Think of 'r' as the size of the upgrade kit. A bigger number means more detailed (and potentially more effective) changes can be made, but it also takes more effort (memory/time). 16 is a good, balanced size for our upgrade.
    # Size of the trainable adapters (This is the rank of the update matrices. A higher r means more parameters are added and potentially more expressiveness, but also increases memory usage and training time)
    # Preferred based on the usecase
    r=16,

    #  This list defines which specific layers or modules within the base model will have LoRA adapters applied to them. In this case, it targets key attention and feed-forward layers
    # target_modules=[...]: This is like picking which parts of the car get the upgrade. Instead of upgrading every single component, we're focusing on the crucial parts that affect how the car 'thinks' and 'learns' – like the steering, suspension, and engine management system (q_proj, k_proj, etc. are technical terms for these important 'thinking' parts of the model).
    # add adapters to this layers
    target_modules = [
        #  These three work together to figure out how much each word in a sentence should "pay attention" to every other word.
        "q_proj", # Query (q_proj): This layer transforms each word into a 'query' vector, asking, "What information do I need from other words?
        "k_proj", # Key (k_proj): This transforms each word into a 'key' vector, essentially saying, "Here's the information I have."
        "v_proj", # Value (v_proj): This transforms each word into a 'value' vector, which holds the actual content to be passed on.

        "o_proj", # o_proj (Output Projection): After the q, k, and v projections have done their work and the attention scores are calculated, the o_proj layer combines all the attended information and projects it back into the model's main representation space. It's the final step in an attention block.

        # gate_proj, up_proj, down_proj: These are typically found within the Feed-Forward Network (FFN) or Multi-Layer Perceptron (MLP) block of a Transformer layer, especially in models like Llama.
        "gate_proj", # gate_proj acts like a gate, deciding which parts of this expanded information are important to let through. This adds non-linearity and allows the model to learn complex patterns.
        "up_proj", # The up_proj expands the dimensionality of the data,
        "down_proj"  # down_proj: This layer then projects the processed information back down to the original dimensionality, making it ready for the next layer of the Transformer or for the final output.
    ],

    # This is a scaling factor for the LoRA layers. (This is like the strength of the upgrade. It tells the model how much impact these small changes should have.)
    # lora_alpha=16: This is like the strength of the upgrade. It tells the model how much impact these small changes should have. Often, it's set to the same value as 'r'.
    # the more the move value will be given to the adapters
    lora_alpha = 16,

    # This sets the dropout probability for the LoRA layers. A value of 0 means no dropout is applied, which is typical during fine-tuning for smaller datasets or when aiming for maximum performance.
    # lora_dropout=0: This means we're not adding any 'randomness' to the upgrade process. In some training, you might add dropout to prevent over-optimization, but here, 0 means we want all the upgrades to stick.
    # full retaintion of information
    lora_dropout = 0,

    # This specifies how bias terms are handled in the LoRA adapters. "none" means no bias is added to the LoRA adapters, which is a common practice.
    # bias="none": This relates to how we fine-tune the 'basic settings' of the model. "none" means we're not making specific adjustments to these basic settings with our small upgrades.
    bias = "none",

    # This enables gradient checkpointing, a memory optimization technique that reduces GPU memory usage during training by recomputing some activations instead of storing them.
    # use_gradient_checkpointing="unsloth": This is a clever trick to save memory while upgrading. Imagine you're building a complex model kit: instead of laying out all the pieces at once (which takes up a lot of space), you only lay out the pieces you need for the current step, then put them away and bring out new ones for the next step. Unsloth has a super-efficient version of this trick.
    use_gradient_checkpointing = "unsloth",

    # This sets the random seed for reproducibility of the LoRA adapter initialization.
    # random_state=3047: This is like having a specific blueprint for your upgrades. If you use the same random_state, anyone else building the same upgrade will get identical results.
    random_state = 3047,

    #  This indicates that "Rank-Stabilized LoRA" is not being used. RSLora is a variant that aims to stabilize training with very low ranks.
    # use_rslora=False: This is about choosing a specific type of upgrade kit. We're using the standard kit, not a special 'rank-stabilized' version.
    # RSLora('rank-stabilized') is a modification of LoRA designed to improve stability and performance especially when using very low ranks. It achieves this by applying a scaling factor to the LoRA updates, effectively stabilizing the training process even with minimal added parameters.
    use_rslora = False,

    # This parameter is for LoFTQ (LoRA-Fine-Tuning Quantization) configuration. Setting it to None means LoFTQ is not being applied.
    # loftq_config=None: This refers to another advanced type of upgrade that combines LoRA with something called 'quantization'. We're not using that here, so it's set to None
    # Quantization, in this context, is a technique used to reduce the memory footprint and speed up computation of large language models.
    loftq_config = None


)

Unsloth 2025.12.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [22]:
# Add this before creating the trainer (Dependies resolver)
if hasattr(model, '_unwrapped_old_generate'):
    del model._unwrapped_old_generate

In [23]:
# train the lora model with the training adapter that we have defined

# Purpose: This function acts as an adapter specifically for the SFTTrainer. The SFTTrainer expects its formatting_func to return a list of strings that it can then tokenize.
#dynamic_formatting_func ensures that this content is always presented to the SFTTrainer in the exact List[str] format it expects, regardless of whether it's processing one example or many at a time. It's a bridge to handle the trainer's input requirements flexibly.
def dynamic_formatting_func(example):
    text_content = example["texts"]
    if isinstance(text_content, list):
        return text_content
    else:
        return [text_content]

trainer = SFTTrainer(
    model = model_lora,
    tokenizer = tokenizer, # MAin tokenize while downloading
    train_dataset = finetune_dataset, # first 500 samples of the whole data set
    dataset_text_field = "texts", # where our text exists
    max_seq_length = MAX_SEQUENCE_LENGTH, # MAX_SEQUENCE_LENGTH while downloading the data defined
    dataset_num_proc = 1,  # Number of GPU
    # The error indicates that formatting_func is explicitly required by Unsloth's SFTTrainer.
    # It expects a function that takes an example (dictionary) from the dataset and returns the formatted text string.
    # Since finetune_dataset already has a 'texts' column with formatted strings, we just need a function to extract it.
    formatting_func = dynamic_formatting_func,


    # Define training args (Configured based on the free colab capabilities)
    args = TrainingArguments(
        per_device_train_batch_size = 2, # How many samples are processed at once on each GPU.
        gradient_accumulation_steps = 4, # Accumulates gradients over multiple batches before performing an optimization step. This effectively increases the batch size without requiring more GPU memory.
        num_train_epochs = 1, # The total number of times the model will iterate over the entire training dataset.
        warmup_steps = 5, # The number of steps over which the learning rate linearly increases from 0 to its initial value.
        max_steps = 60, # The total number of optimization steps to perform. If set, it overrides `num_train_epochs`.
        learning_rate = 2e-4, # The initial learning rate for the optimizer. A smaller learning rate means smaller updates to model weights.
        fp16 = not is_bfloat16_supported(), # Whether to use 16-bit floating-point precision (FP16). Here, it's enabled if bfloat16 is not supported.
        bf16=is_bfloat16_supported(), # Whether to use bfloat16 precision. Enabled if bfloat16 is supported.
        logging_steps = 10, # How often (in steps) to log training metrics (e.g., loss, learning rate).
        optim = "adamw_8bit", # The optimizer to use. "adamw_8bit" is a memory-efficient variant of AdamW.
        weight_decay = 0.01, # The strength of L2 regularization, which helps prevent overfitting by penalizing large weights.
        lr_scheduler_type="linear", # The type of learning rate scheduler to use. "linear" means the learning rate decreases linearly after the warmup phase.
        seed=3407, # Random seed for reproducibility.
        output_dir = "outputs", # The directory where model checkpoints and logs will be saved.
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/500 [00:00<?, ? examples/s]

In [29]:
# Setup WANDB
from google.colab import userdata
wnb_token = userdata.get("WANDB_API_TOKEN")
# Login to WnB
wandb.login(key=wnb_token) # import wandb
run = wandb.init(
    project='Fine-tune-DeepSeek-R1-on-Medical-CoT-Dataset',
    job_type="training",
    anonymous="allow"
)



wandb: Use W&B Weave for improved LLM call tracing. Weave is installed but not imported. Add `import weave` to the top of your script.


In [30]:
# Start the fine-tuning process
# Training loss should decrease for better model performance
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.9322
20,1.4248
30,1.4068
40,1.3648
50,1.3912
60,1.3705


0,1
train/epoch,▁▂▄▅▇██
train/global_step,▁▂▄▅▇██
train/grad_norm,█▂▁▁▂▁
train/learning_rate,█▇▅▄▂▁
train/loss,█▂▂▁▁▁

0,1
total_flos,1.6686758448119808e+16
train/epoch,0.96
train/global_step,60.0
train/grad_norm,0.25627
train/learning_rate,0.0
train/loss,1.3705
train_loss,1.48173
train_runtime,1048.754
train_samples_per_second,0.458
train_steps_per_second,0.057


In [32]:
wandb.finish()

In [36]:
# Testing after fine-tuning
question = """
A 61-year-old woman with a long history of involuntary urine leakage during activities such as coughing or sneezing,
but no nocturnal leakage, undergoes a gynecologic examination and a Q-tip test.
Based on these findings, what would cystometry most likely show regarding her postvoid residual volume and detrusor muscle contractions?

"""

# new model
# Take inference with the help of unsloth
FastLanguageModel.for_inference(model_lora)

# Tokenize the input
inputs = tokenizer([prompt_style.format(question=question)], return_tensors="pt").to("cuda")  # tokenize the input

# Generate a response
outputs = model_lora.generate (
    input_ids = inputs.input_ids, # Think of this as the question you're giving to the AI model, but translated into numbers the computer understands.
    attention_mask = inputs.attention_mask, # Imagine you're highlighting the important parts of your question and crossing out any extra spaces or empty bits.(attention mask is used for padding)
    max_new_tokens = 1200, # Response Limit
    use_cache = True # if similar question is asked instead of running a whole , store it in cache fro there it will generate the output for fast response
)

# Decode the response tokens back to text
response = tokenizer.batch_decode(outputs)


In [37]:
print(response)

["<｜begin▁of▁sentence｜>\nBelow is a task description along with additional context provided in the input section. Your goal is to provide an accurate, clinically sound response that directly addresses the request.\n\nUse your medical expertise to ensure the answer is evidence-based, clearly structured, and easy to understand. When appropriate, briefly explain the rationale behind key conclusions without revealing internal reasoning processes.\n\n### Task:\nYou are a medical expert specializing in clinical reasoning, diagnostics, and treatment planning. Answer the medical question below using your advanced knowledge.\nProvide a clear, concise, and clinically appropriate response. Include relevant explanations, differential considerations, and recommendations as needed.\n\n### Query:\n\nA 61-year-old woman with a long history of involuntary urine leakage during activities such as coughing or sneezing,\nbut no nocturnal leakage, undergoes a gynecologic examination and a Q-tip test.\nBased

In [38]:
print(response[0].split("### Answer:")[1])



<think>
Okay, let's think about this. So, we have a 61-year-old woman with a history of involuntary urine leakage, especially when she coughs or sneezes. But, interestingly, she doesn't leak at night. Hmm, that's a bit unusual because I usually think of nighttime leakage as a common issue with conditions like overactive bladder syndrome.

Alright, she undergoes a gynecological exam and a Q-tip test. The Q-tip test is something I remember is used to assess urethral patency. It's typically done to check if there's any obstruction or dysfunction in the urethra that could cause urinary symptoms like the ones she's experiencing.

Now, let's think about the findings from the Q-tip test. It measures how well the urethral passage opens when you pull on the Q-tip. If it's open, that suggests there's no obstruction. If it's not opening well, that could indicate some kind of urethral dysfunction.

Given her symptoms, her Q-tip test results are normal. So, the urethra is functioning well. That m

In [39]:
question = """A 59-year-old man presents with a fever, chills, night sweats, and generalized fatigue,
              and is found to have a 12 mm vegetation on the aortic valve. Blood cultures indicate gram-positive, catalase-negative,
              gamma-hemolytic cocci in chains that do not grow in a 6.5% NaCl medium.
              What is the most likely predisposing factor for this patient's condition?"""

FastLanguageModel.for_inference(model_lora)

# Tokenize the input
inputs = tokenizer([prompt_style.format(question=question)], return_tensors="pt").to("cuda")  # tokenize the input

# Generate a response
outputs = model_lora.generate (
    input_ids = inputs.input_ids,
    attention_mask = inputs.attention_mask,
    max_new_tokens = 1200,
    use_cache = True
)

# Decode the response tokens back to text
response = tokenizer.batch_decode(outputs)

print(response[0].split("### Answer:")[1])



<think>
Alright, let's think about this. We've got a 59-year-old man who's got a fever, chills, and looks like he's dragging with night sweats. That's definitely a red flag for something systemic. His blood culture shows gram-positive, catalase-negative, gamma-hemolytic cocci that are in chains and won't grow in a 6.5% NaCl medium. Hmm, that's a pretty specific clue.

Okay, so gram-positive, catalase-negative, and chains of cocci—what's that about? It's definitely Streptococcus. Oh, but wait, the fact that it doesn't grow in a 6.5% NaCl medium is a big deal. That suggests it's not the typical Streptococcus pneumoniae, which usually doesn't grow in that concentration. So, what about Streptococcus viridans? Yeah, that fits because it's gram-positive, catalase-negative, and it doesn't grow in 6.5% NaCl. Also, it's known for causing endocarditis, especially in people who have some kind of heart issues.

And now, what's the link between his condition and heart problems? We're seeing a veg

In [None]:
# Save the fine-tuned model and tokenizer locally
# model.save_pretrained("final_deepseek_model")
# tokenizer.save_pretrained("final_deepseek_model")

In [None]:
# Push the model and tokenizer to Hugging Face Hub
# Replace 'your-username/your-model-name' with your desired repository name
# model.push_to_hub("your-username/deepseek-r1-distill-llama-8b-finetuned", token=hf_token)
# tokenizer.push_to_hub("your-username/deepseek-r1-distill-llama-8b-finetuned", token=hf_token)