In [None]:
%%capture
# Install the unsloth package from the package index.
!pip install unsloth #install unsloth
# Then force-reinstall unsloth itself from its GitHub repository; --no-deps keeps
# the dependencies installed by the previous command untouched, and --no-cache-dir
# avoids reusing a cached build.
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Import the relevant packages

In [None]:
#module for fine-tuning
from unsloth import FastLanguageModel 
#check if hardware supports bfloat16 precision
from unsloth import is_bfloat16_supported
import torch #pytorch
from trl import SFTTrainer #trainer for supervised finetuning
import wandb #weights and biases
#training hyperparameters
from transformers import TrainingArguments
#loading finetuning dataset
from datasets import load_dataset 



Retrieve the Hugging Face token from Kaggle secrets and log in

In [None]:
# Tokens are read from Kaggle's secret store instead of being written in the notebook.
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

user_secrets = UserSecretsClient()

# Look up the secret stored under the label "HF_TOKEN" and authenticate with it.
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token)

In [None]:
# Fetch the Weights & Biases API key stored under the Kaggle secret label "wnb".
wb_token = user_secrets.get_secret("wnb")

# Authenticate, then start a run for this training session.
wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset',
    job_type="training",  # the comma was missing here, which made the cell a SyntaxError
    anonymous="allow",
)

Loading the model and tokenizer: the Unsloth version of DeepSeek-R1-Distill-Llama-8B.

In [None]:
# Loader settings, kept as notebook-level names because the trainer cell reuses
# max_seq_length.
max_seq_length = 2048
dtype = None  # no explicit dtype is requested (presumably Unsloth then picks one - confirm)
load_in_4bit = True  # 4-bit quantization to optimise memory usage and performance

# Gather the loader options once, then unpack them into from_pretrained.
load_options = dict(
    model_name="unsloth/Deepseek-R1-Distill-Llama-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

Model inference before finetuning


In [None]:
#Prompt template used for inference.
#It holds the system-style instruction plus two "{}" placeholders: the first is
#filled with the question, the second sits right after the opening <think> tag
#(the inference cells pass an empty string there so the model writes its own
#reasoning and answer).

prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.


### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>{}
"""

#Test medical question for inference.
#The original literal was cut off ("...detrusor contraction"); it now matches the
#question used after fine-tuning, so the before/after outputs are comparable.
question = "A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"

#Enable optimized inference mode for Unsloth models,
#which improves speed & efficiency
FastLanguageModel.for_inference(model)

#Format the question with the structured prompt ('prompt_style'), leaving the
#<think> slot empty, then tokenize it and move the tensors to the GPU
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

#Generate a response of at most 1200 new tokens
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

#Decode the generated output tokens into human-readable text
response = tokenizer.batch_decode(outputs)

#Print only the part after "### Response:" (the prompt itself is dropped)
print(response[0].split("### Response:")[1])


The model returns an answer, but to get concise outputs in the desired format we need to fine-tune it on the dataset.

Loading and Processing the dataset FreedomIntelligence/medical-o1-reasoning-SFT from https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT?row=46

In [None]:
#Load the first 500 training samples of the "en" configuration of the
#FreedomIntelligence/medical-o1-reasoning-SFT dataset from the Hugging Face Hub.
#load_dataset is already imported in the imports cell at the top of the notebook.
#trust_remote_code is intentionally not passed: it permits running code shipped
#with the dataset repository, and it should not be needed here (assumption: the
#dataset is published as plain data files - confirm on the dataset page).
dataset = load_dataset(
    "FreedomIntelligence/medical-o1-reasoning-SFT",
    "en",
    split="train[0:500]",
)
dataset

In [None]:
#Prompt template used to build the training text.
#It has three "{}" placeholders: the question, the chain of thought (placed
#between <think> and </think>), and the final answer after </think>.

train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>
{}
</think>
{}
"""



In [None]:
#End-of-sequence token string taken from the tokenizer; formatting_prompts_func
#appends it to every training example.
EOS_TOKEN = tokenizer.eos_token  #must be appended to each training text
#Display the token so its value is visible in the notebook output
EOS_TOKEN

In [None]:
#Builds the "text" column for training: each example becomes the train prompt
#filled with its question, chain of thought and answer, followed by the EOS token.

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into training prompts.

    `examples` is indexed by the column names "Question", "Complex_CoT" and
    "Response"; the three columns are walked in parallel.

    Returns a dict with a single "text" key that holds one formatted string per
    row: train_prompt_style filled in, with EOS_TOKEN appended.
    """
    rows = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    return {
        "text": [
            train_prompt_style.format(question_text, reasoning, answer) + EOS_TOKEN
            for question_text, reasoning, answer in rows
        ],
    }

In [None]:
#Add the "text" column by applying formatting_prompts_func to the dataset in batches.
#(The original call misspelled the function name as formatting_prompts_fuc,
#which raised a NameError.)

dataset = dataset.map(formatting_prompts_func, batched = True,)
#Show the first formatted training example as a sanity check
dataset["text"][0]

Setting up the model using LoRA

In [None]:
#Set up the model by adding Low-Rank Adapters (LoRA) to it
#using PEFT - Parameter-Efficient Fine-Tuning, which wraps the base model
#with LoRA modifications so that only the adapter parameters are trained.
#Fixes: the class name was misspelled (FastLangugaeModel -> NameError), and a
#missing comma after "up_proj" silently merged two entries into the single
#string "up_projdown_proj", so neither MLP projection was targeted.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  #LoRA rank (size of the low-rank update matrices)
    target_modules=[
        #attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        #feed-forward (MLP) projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,  #LoRA scaling factor
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
#target_modules lists the modules inside the model that receive adapters: here the
#attention projections and the feed-forward projections of the transformer
#use_gradient_checkpointing="unsloth" is a technique to save memory
#during training
#loftq - LoRA fine-tuning-aware quantization (disabled here with None)


In [None]:
#Set up the supervised fine-tuning trainer with the model, tokenizer, dataset
#and the main training hyperparameters.
#Fix: a comma was missing after logging_steps=10, which made the cell a SyntaxError.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  #column created by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,  #number of processes used for dataset preprocessing
    args=TrainingArguments(
        per_device_train_batch_size=2,  #samples per device in each forward/backward pass
        gradient_accumulation_steps=4,  #accumulate gradients over 4 batches before each weight update
        num_train_epochs=1,  #has no effect while max_steps is positive (see max_steps)
        warmup_steps=5,  #learning-rate warmup steps
        max_steps=60,  #cap on the number of training steps; takes precedence over num_train_epochs
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),  #fall back to fp16 when bf16 is unavailable
        bf16=is_bfloat16_supported(),
        logging_steps=10,  #log details every 10 steps
        optim="adamw_8bit",
        weight_decay=0.01,  #regularisation to limit overfitting
        lr_scheduler_type="linear",  #learning-rate schedule
        seed=3407,
        output_dir="outputs",
    ),
)


In [None]:
#Train the model; whatever trainer.train() returns is kept in trainer_stats
trainer_stats = trainer.train()

In [None]:
#Model inference after fine-tuning, using the inference prompt (prompt_style)
question = "A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"

#Switch the model to Unsloth's inference mode (Unsloth has 2x faster inference!)
FastLanguageModel.for_inference(model)

#Fill in the prompt with an empty <think> slot, tokenize it and move it to the GPU
prompt_batch = [prompt_style.format(question, "")]
inputs = tokenizer(prompt_batch, return_tensors="pt").to("cuda")

#Generate at most 1200 new tokens
generation_options = dict(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
outputs = model.generate(**generation_options)

#Decode the tokens and print only what follows "### Response:"
response = tokenizer.batch_decode(outputs)
generated_text = response[0]
print(generated_text.split("### Response:")[1])


In [None]:
#Save the model locally under the directory name below
new_model_local = "DeepSeek-R1-Medical-COT"
#Save the model and the tokenizer into that directory
model.save_pretrained(new_model_local) 
tokenizer.save_pretrained(new_model_local)

#Additionally save a merged copy with save_method "merged_16bit".
#NOTE(review): this writes into the same directory as the two calls above;
#presumably the first save holds only the LoRA adapter files while this one holds
#full merged weights - confirm that mixing both in one folder is intended.
model.save_pretrained_merged(new_model_local, tokenizer, save_method = "merged_16bit",)