In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install unsloth # install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git # Also get the latest version Unsloth

In [None]:
!pip install unsloth_zoo

In [None]:
# Step3: Import necessary libraries
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
from huggingface_hub import login
from transformers import TrainingArguments
from datasets import load_dataset
import wandb

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never keep the access token in the notebook source: read it from the HF_TOKEN
# environment variable and fall back to a non-echoing interactive prompt.
# `hf_token` is reused later when the base model is downloaded.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_token)


In [None]:

# Optional sanity check: report whether PyTorch can see a CUDA device,
# and the name of device 0 when it can.
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = "No GPU"
print("GPU device:", gpu_label)

In [None]:
# Step5: Load the pretrained DeepSeek-R1 distilled model and its tokenizer

model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Hugging Face repo id of the base checkpoint
max_sequence_length = 2048  # reused later as the trainer's max_seq_length
dtype = None  # forwarded unchanged as the loader's `dtype` argument
load_in_4bit = True  # request 4-bit loading of the base weights

# Gather the loader options in one mapping, then unpack them into the call.
pretrained_kwargs = dict(
    model_name = model_name,
    max_seq_length = max_sequence_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

In [None]:
# Step6: Define the prompt template used for inference.
# The template has two positional `{}` slots: the first receives the medical question and
# the second the answer text. The inference cells pass an empty string for the second slot,
# so the formatted prompt ends right after "### Answer:".
# Note that the literal begins with a newline, which is therefore part of every prompt.
prompt_style = """
Below is a task description along with additional context provided in the input section. Your goal is to provide a well-reasoned response that effectively addresses the request.

Before crafting your answer, take a moment to carefully analyze the question. Develop a clear, step-by-step thought process to ensure your response is both logical and accurate.

### Task:
You are a medical expert specializing in clinical reasoning, diagnostics, and treatment planning. Answer the medical question below using your advanced knowledge.

### Query:
{}

### Answer:
{}
"""

In [None]:

# Step7: Run inference with the base model (before any fine-tuning)

# Test question; the line breaks and indentation inside the literal are part of the prompt.
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or
              sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings,
              what would cystometry most likely reveal about her residual volume and detrusor contractions?"""

# Switch the model to inference mode via FastLanguageModel.for_inference before generating.
FastLanguageModel.for_inference(model)

# Fill the template (question + empty answer slot) and tokenize it as a batch of one on the GPU.
filled_prompt = prompt_style.format(question, "")
inputs = tokenizer([filled_prompt], return_tensors="pt").to("cuda")

# Generate up to 1200 new tokens from the tokenized prompt.
generation_kwargs = dict(
    input_ids = inputs.input_ids,
    attention_mask = inputs.attention_mask,
    max_new_tokens = 1200,
    use_cache = True,
)
outputs = model.generate(**generation_kwargs)

# Decode every generated sequence back to text; `response` is a list of strings.
response = tokenizer.batch_decode(outputs)

print(response)

In [None]:

# Show only the text that follows the first "### Answer:" marker in the first decoded sequence
# (the part before the marker is the prompt itself).
print(response[0].split("### Answer:")[1])

In [None]:
# Step8: Setup fine-tuning

# Load the first 5,000 rows of the "en" configuration's train split.
# `trust_remote_code` is deliberately not passed: it opts into executing code shipped in the
# dataset repository, which loading this dataset does not require, and newer `datasets`
# releases no longer support the flag.
medical_dataset = load_dataset(
    "FreedomIntelligence/medical-o1-reasoning-SFT",
    "en",
    split = "train[:5000]",
)
     

In [None]:
# Display one raw record (index 1); the preprocessing step below reads its
# "Question", "Complex_CoT" and "Response" fields.
medical_dataset[1]

In [None]:

# End-of-sequence token string taken from the tokenizer. preprocess_input_data appends it to
# every training text so that each example ends with an explicit end marker.
EOS_TOKEN = tokenizer.eos_token
EOS_TOKEN

In [None]:

### Finetuning
# Training prompt template with three positional `{}` slots, in order:
#   1. the question, 2. the chain of thought, 3. the final response.
# The chain of thought is wrapped in <think> ... </think> tags; the tags had been lost from the
# template (only blank lines remained), so the reasoning and the final answer were no longer
# delimited in the training text.
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""

In [None]:
def preprocess_input_data(examples):
    """Build the training text for every example in a batch.

    Args:
        examples: Mapping with the parallel sequences "Question", "Complex_CoT"
            and "Response" (one entry per example).

    Returns:
        A dict with a single key "texts": one string per example, produced by
        filling train_prompt_style with (question, chain of thought, response)
        and appending EOS_TOKEN.
    """
    rows = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    return {
        "texts": [
            train_prompt_style.format(question_text, reasoning, answer) + EOS_TOKEN
            for question_text, reasoning, answer in rows
        ],
    }

In [None]:
# Run preprocess_input_data over the dataset in batches; the formatted prompts are then
# available under the "texts" column of the result.
finetune_dataset = medical_dataset.map(preprocess_input_data, batched = True,)

In [None]:
# Show the first formatted training text. Index the row first and the column second:
# this reads a single row instead of materialising the whole "texts" column.
finetune_dataset[0]["texts"]

In [None]:
# Step9: Wrap the base model with LoRA adapters for fine-tuning

# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# NOTE(review): random_state is 3047 here while the trainer is configured with seed 3407 —
# possibly transposed digits; confirm which value is intended.
model_lora = FastLanguageModel.get_peft_model(
    model = model,
    r = 16,                                   # LoRA rank
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3047,
    use_rslora = False,
    loftq_config = None,
)

In [None]:

# Add this before creating the trainer.
# Removes the `_unwrapped_old_generate` attribute from the base model object when it is present.
# NOTE(review): presumably this attribute is left behind by the earlier
# FastLanguageModel.for_inference(model) call and interferes with training — confirm against
# the installed Unsloth version.
if hasattr(model, '_unwrapped_old_generate'):
    del model._unwrapped_old_generate

In [None]:
def dummy_formatting_func(example):
    """Return the pre-formatted training text(s) of `example` as a list.

    Args:
        example: Mapping holding the already formatted prompt(s) under "texts".

    Returns:
        A one-element list when "texts" is a single string; otherwise the
        stored value itself, unchanged.
    """
    formatted = example["texts"]
    if isinstance(formatted, str):
        return [formatted]
    return formatted


# Supervised fine-tuning trainer for the LoRA-wrapped model on the formatted "texts" column.
trainer = SFTTrainer(
    model = model_lora,
    tokenizer = tokenizer,
    train_dataset = finetune_dataset,
    dataset_text_field = "texts",
    formatting_func = dummy_formatting_func,  # Required, must return List[str]
    max_seq_length = max_sequence_length,
    dataset_num_proc = 1,

    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,   # 2 * 4 = 8 sequences per optimizer step per device
        warmup_steps = 5,
        # Training length is controlled by max_steps alone. The former
        # `num_train_epochs = 1` was removed because a positive max_steps overrides it,
        # so it had no effect and only suggested a full epoch would be run.
        max_steps = 60,
        learning_rate = 2e-4,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)


In [None]:
import os

# Authenticate with Weights & Biases without embedding the API key in the notebook source.
# The key is taken from the WANDB_API_KEY environment variable; when it is unset, `key` is
# None and wandb is left to use its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY") or None)

# Start the run that will receive the training logs.
run = wandb.init(
    project='Fine-tune-DeepSeek-R1-on-Medical-CoT-Dataset',
    job_type="training",
    anonymous="allow"
)

In [None]:
# Run the fine-tuning loop and keep the object returned by train() for later inspection.
trainer_stats = trainer.train()

In [None]:

# Close the Weights & Biases run that was opened with wandb.init before training.
wandb.finish()

In [None]:
# Step10: Test the model after fine-tuning
# NOTE(review): training used train_prompt_style ("### Question:" / "### Response:") while this
# cell prompts with prompt_style ("### Query:" / "### Answer:") — confirm the mismatch is intended.

# Test question; the line breaks and indentation inside the literal are part of the prompt.
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing
              but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings,
              what would cystometry most likely reveal about her residual volume and detrusor contractions?"""

# Switch the LoRA model to inference mode via FastLanguageModel.for_inference.
FastLanguageModel.for_inference(model_lora)

# Fill the template (question + empty answer slot) and tokenize it as a batch of one on the GPU.
filled_prompt = prompt_style.format(question, "")
inputs = tokenizer([filled_prompt], return_tensors="pt").to("cuda")

# Generate up to 1200 new tokens from the tokenized prompt.
generation_kwargs = dict(
    input_ids = inputs.input_ids,
    attention_mask = inputs.attention_mask,
    max_new_tokens = 1200,
    use_cache = True,
)
outputs = model_lora.generate(**generation_kwargs)

# Decode every generated sequence back to text; `response` is a list of strings.
response = tokenizer.batch_decode(outputs)

print(response)

In [None]:

# Show only the text that follows the first "### Answer:" marker in the first decoded sequence.
print(response[0].split("### Answer:")[1])

In [None]:
# Second test question for the fine-tuned model; the line breaks and indentation inside the
# literal are part of the prompt.
question = """A 59-year-old man presents with a fever, chills, night sweats, and generalized fatigue,
              and is found to have a 12 mm vegetation on the aortic valve. Blood cultures indicate gram-positive, catalase-negative,
              gamma-hemolytic cocci in chains that do not grow in a 6.5% NaCl medium.
              What is the most likely predisposing factor for this patient's condition?"""

# Switch the LoRA model to inference mode via FastLanguageModel.for_inference.
FastLanguageModel.for_inference(model_lora)

# Fill the template (question + empty answer slot) and tokenize it as a batch of one on the GPU.
filled_prompt = prompt_style.format(question, "")
inputs = tokenizer([filled_prompt], return_tensors="pt").to("cuda")

# Generate up to 1200 new tokens from the tokenized prompt.
generation_kwargs = dict(
    input_ids = inputs.input_ids,
    attention_mask = inputs.attention_mask,
    max_new_tokens = 1200,
    use_cache = True,
)
outputs = model_lora.generate(**generation_kwargs)

# Decode the generated sequences, then show only what follows the "### Answer:" marker.
response = tokenizer.batch_decode(outputs)
answer_text = response[0].split("### Answer:")[1]
print(answer_text)

In [None]:
# Fold the LoRA adapters into the underlying model and drop the PEFT wrapper.
# NOTE(review): the base model was loaded with load_in_4bit = True, so confirm the merged
# weights really are float16 before relying on the "FP16" repository name.
model_fp16 = model_lora.merge_and_unload()

# Upload the merged model and its tokenizer to the same Hugging Face repository.
merged_repo_id = "ShubhamZoro/DeepSeek-R1-Medical-COT-FP16"
model_fp16.push_to_hub(merged_repo_id)
tokenizer.push_to_hub(merged_repo_id)


In [None]:
# `config` was never defined anywhere in the notebook, so this cell raised NameError on a
# fresh kernel. Bind it to the merged model's configuration before inspecting it.
# NOTE(review): assumes the merged model's config is the one meant here — confirm. The change
# only affects the in-memory object; push again if the Hub copy must reflect it.
config = model_fp16.config
if hasattr(config, "quantization_config"):
    del config.quantization_config  # Fully remove it


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never keep the access token in the notebook source: read it from the HF_TOKEN
# environment variable and fall back to a non-echoing interactive prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_token)


from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Step 1: Load the base model in FP16 (not quantized!)
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Step 2: Attach the fine-tuned LoRA adapter to the FP16 base model
# NOTE(review): no cell in this notebook pushes to this repository — confirm it exists and
# holds the LoRA weights only.
adapter_repo_id = "ShubhamZoro/DeepSeek-R1-Medical-COT"
model = PeftModel.from_pretrained(base_model, adapter_repo_id)

# Step 3: Fold the adapter into the base weights and drop the PEFT wrapper
model = model.merge_and_unload()

# Step 4: Push the merged model (safetensors format) and the tokenizer to the same HF repo
clean_repo_id = "ShubhamZoro/DeepSeek-R1-Medical-COT-FP16-CLEAN"
model.push_to_hub(clean_repo_id, safe_serialization=True)
tokenizer.push_to_hub(clean_repo_id)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the merged checkpoint and its tokenizer back from the Hub for a quick smoke test.
clean_repo_id = "ShubhamZoro/DeepSeek-R1-Medical-COT-FP16-CLEAN"
model = AutoModelForCausalLM.from_pretrained(
    clean_repo_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(clean_repo_id)

# Plain prompt (no template); tensors are moved to whichever device the model reports.
prompt = "A 60-year-old male with night sweats and aortic murmur. What is the most likely diagnosis?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=200)

# Decode the first generated sequence without special tokens and print it.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

import torch

model_id = "ShubhamZoro/DeepSeek-R1-Medical-COT-FP16-CLEAN"
local_dir = "./deepseek_medical_local"

# Download the checkpoint and load it in float16, the dtype it was merged and pushed in.
# Without an explicit torch_dtype the weights may be loaded as float32, which doubles both
# the memory needed here and the size of the copy written to disk below.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Write the model and the tokenizer files into the local folder.
model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)


In [1]:
!pip install bitsandbytes accelerate


Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

# bitsandbytes settings: 4-bit "nf4" quantization with double quantization enabled and
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the merged checkpoint with the quantization settings above, plus its tokenizer.
clean_repo_id = "ShubhamZoro/DeepSeek-R1-Medical-COT-FP16-CLEAN"
model = AutoModelForCausalLM.from_pretrained(
    clean_repo_id,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(clean_repo_id)

# Wrap both in a text-generation pipeline and run a short sample prompt.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

response = pipe("How to treat cancer?", max_new_tokens=200)
generated_text = response[0]["generated_text"]
print(generated_text)

2025-08-12 16:50:28.391847: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755017428.558386      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755017428.612662      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/839 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Device set to use cuda:0


How to treat cancer? This question is asked by so many people, and it's quite a complex one. There are various ways to tackle cancer, depending on the type and stage of the cancer. Let's explore some of the common approaches.

First, surgery. In many cases, surgery is a primary treatment option. It's often used when the cancer is localized and can be removed completely. Surgeons can remove tumors and surrounding healthy tissue in the hope of eliminating the cancer. It's important to note that surgery isn't always an option, especially for cancers that have spread widely or if the tumor can't be accessed easily.

Next, radiation therapy. This approach is often used after surgery or alongside chemotherapy. It's effective for cancers that can't be fully removed by surgery. Radiation can target specific areas to destroy cancer cells. It's used both externally and internally (brachytherapy). While radiation can be effective, it can also cause side effects like fatigue, skin irritation, and 

In [3]:
# Persist the currently loaded model and tokenizer side by side in one local directory.
save_dir = "saved_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('saved_model/tokenizer_config.json',
 'saved_model/special_tokens_map.json',
 'saved_model/chat_template.jinja',
 'saved_model/tokenizer.json')

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Restore the model and tokenizer from the directory written by the save step.
saved_dir = "saved_model"
model = AutoModelForCausalLM.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)

# Rebuild the text-generation pipeline around the reloaded objects.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [5]:
# Generate up to 200 new tokens for the sample prompt with the reloaded pipeline and print the
# "generated_text" field of the first result.
response = pipe("How to treat cancer?", max_new_tokens=200)
print(response[0]["generated_text"])

How to treat cancer? Maybe with a drug that delivers a precise amount of radiation to a specific tumor, not the whole body. That’s what I’ve read in some cancer research articles. It sounds promising because it could be more effective and reduce the side effects compared to traditional treatments like chemotherapy. But I’m not sure how it actually works, so I want to dive deeper into this concept.

Let me start by understanding what it means to have a drug deliver radiation specifically to a tumor. It’s like having a delivery system that can pinpoint where the radiation is needed. This could be useful because traditional radiation therapy affects the whole body, which can be problematic for patients, especially children. It's a big risk with significant side effects.

So, how does this delivery system work? I know about some medical imaging techniques, like PET scans or MRI, that help pinpoint where tumors are located. Maybe the drug uses these imaging tools to find the right spot. But