In [None]:
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.2
    Uninstalling transformers-4.46.2:
      Successfully uninstalled transformers-4.46.2
Successfully installed transformers-4.46.3
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting mu

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [None]:
from google.colab import userdata

# Fetch the Hugging Face access token stored under the key "HF_TOKEN" in the
# Colab `userdata` store, so it never has to be written into the notebook.
HF_TOKEN = userdata.get("HF_TOKEN")


In [None]:
# Hub identifiers used throughout the notebook.
model_name: str = "Qwen/Qwen2.5-3B-Instruct"        # base checkpoint to fine-tune
dataset_name: str = "ruslanmv/ai-medical-chatbot"   # patient/doctor dialogue dataset


In [None]:
# QLoRA quantization settings: load weights in 4-bit NF4, with bfloat16 used
# both as the compute dtype and as the storage dtype of the packed weights.
quantization_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_quant_storage": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_kwargs)

# Load the base causal LM with the quantization config applied at load time.
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer published with the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
# Report the shape of the model's input embedding matrix.
input_embeddings = model.get_input_embeddings()
embedding_size = input_embeddings.weight.shape
print(f"Embedding size: {embedding_size}")

Embedding size: torch.Size([151936, 2048])


In [None]:
# Prepare the 4-bit base model for training before attaching adapters.
# Wrapping the model with get_peft_model here means it reaches the trainer
# already as a PeftModel, so this preparation has to happen explicitly.
# Gradient checkpointing is left off to keep the compute profile implied by the
# TrainingArguments defaults; set it to True to trade speed for memory.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# LoRA adapter configuration: rank-16 adapters (alpha 32, 5% dropout, no bias
# terms) on the attention projections and the MLP projections.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
)

# Wrap the prepared base model so that only the adapter weights are trained.
model = get_peft_model(model, peft_config)

In [None]:
# Load the train split, shuffle it with a fixed seed and keep the first 2000
# rows so the demo runs quickly.
dataset = (
    load_dataset(dataset_name, split="train")
    .shuffle(seed=65)
    .select(range(2000))
)




README.md:   0%|          | 0.00/863 [00:00<?, ?B/s]

dialogues.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/256916 [00:00<?, ? examples/s]

In [None]:
# Check the vocab size
# NOTE(review): the value printed here (151643) is smaller than the number of
# embedding rows reported for the model (151936); presumably `vocab_size` does
# not count added special tokens and the embedding matrix is padded - confirm.
vocab_size = tokenizer.vocab_size
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 151643


In [None]:
# Inspect one raw record to see its fields (Description / Patient / Doctor)
# before the data is reshaped into chat format.
dataset[1]

{'Description': 'What causes blood in urine?',
 'Patient': "Dr, My daughter is 5yrs old.i saw stains in her trousers a few days ago.the stains were light red colour.later i found some pus like liquid near her urinary tract.yesterday i saw light brick coloured liquid along her urine.feeling panic,gave  urine for culter and routine test,culter result not yet recd.her routine test say,pus cells:4-8 and epithiall cells :2-4.What's wrong with my daughter? (she goes to urine only 4 to 5 times a day,drinking water too not sufficient)",
 'Doctor': 'Thanks for contacting HCMYou are concerned that your daughter may have a urinary tract infection. Your description of her urine and findings in her panties does suggest urinary tract infection. The urine analysis though is not very convincing for a urinary tract infection. The sample shows 2-4 epithelial cells and only 4-8 puss cells. The counts are normal and do not indicate infection. I recommend you wait for the culture results. I would recommend

In [None]:
def formatting_prompts_func(example):
    """Render a batch of records as "### Question / ### Answer" strings.

    Args:
        example: Mapping with parallel sequences under ``'Patient'`` and
            ``'Doctor'``; entry ``i`` of each belongs to the same dialogue.

    Returns:
        A list with one formatted string per entry in ``example['Patient']``.
    """
    answers = example['Doctor']
    return [
        f"### Question: {question}\n ### Answer: {answers[idx]}"
        for idx, question in enumerate(example['Patient'])
    ]

In [None]:
# Convert the sampled dataset to a pandas DataFrame; the next cell builds the
# chat-formatted records from its columns.
df = dataset.to_pandas()
# Display the frame (the notebook shows a truncated preview).
df

Unnamed: 0,Description,Patient,Doctor
0,Can blood pressure medication be stopped to ch...,"I'm 35, BP 150/100 without medicine, never smo...","Hello,Thanks for writing to Health Care Magic,..."
1,What causes blood in urine?,"Dr, My daughter is 5yrs old.i saw stains in he...",Thanks for contacting HCMYou are concerned tha...
2,What causes swelling and blisters around lips ...,my daughter is experiencing sever swelling of ...,"hello, thanks for your query, it's unlikely al..."
3,What causes shoulder pain with stiffness in jaw?,Fell on sidewalk face first about 8 hrs ago. S...,"Hello and welcome to HCM,The injuries caused o..."
4,Q. My partner's nipples feel different. Is she...,"Hello doctor,I had sex without protection but,...",Hello For more information consult an obstetri...
...,...,...,...
1995,What causes difficulty to breath after during ...,i have been treated with bronchitis before .. ...,Thanks for your question on Healthcare Magic.I...
1996,Suggest an effective alternative for Amitripty...,I am researching prescription medication optio...,Hello and Welcome to ‘Ask A Doctor’ service. I...
1997,Suggest treatment for an eye infection,Hi i have had an eye infection affecting both ...,Hi thanks for asking question in HCM.Here swab...
1998,Is it embarrassing to get your hair down there...,is it embarrassing to get your hair down there...,dear its not embarrassing at all to get rid o...


In [None]:
import pandas as pd
from datasets import Dataset
from trl import apply_chat_template

# Build one single-turn conversation per row: the user turn carries the case
# description plus the patient's message, the assistant turn is the doctor's reply.
user_turns = [
    [{"role": "user", "content": f"Description: {description}\nPatient Message: {patient}"}]
    for description, patient in zip(df["Description"], df["Patient"])
]
assistant_turns = [
    [{"role": "assistant", "content": response}]
    for response in df["Doctor"]
]
dataset_dict = {"prompt": user_turns, "completion": assistant_turns}

# Wrap the columns in a Dataset and render both of them through the
# tokenizer's chat template.
dataset = Dataset.from_dict(dataset_dict).map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Name shared by the training output directory and the Hub repository.
new_model: str = "finetuned__qwen"

In [None]:
# Hold out 10% of the examples for evaluation. The seed makes the split
# reproducible across kernel restarts (same seed as the subsampling shuffle).
dataset = dataset.train_test_split(test_size=0.1, seed=65)


In [None]:
# Spot-check one training example after chat templating: `prompt` ends with
# the opening of the assistant turn and `completion` holds the reply.
dataset['train'][1]

{'prompt': '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescription: What causes pain near the penis?\nPatient Message: I am experiencing minor pain in the left area of where my Penis is situated from yesterday. The pain is not in the penis. But on the left side of the area where usually males are having their pelvic hair. As I am touching there, I am feeling that there could be probably a swelling kind of thing. Please guide me.<|im_end|>\n<|im_start|>assistant\n',
 'completion': 'Hi,It seems that you might be having enlarged tender inguinal lymph node producing pain and swelling over that region.There might be having some local skin infection giving rise this problem.you might require one course of antibiotic medicine for 3-5 days.Ok and take care.<|im_end|>\n'}

In [None]:
# Trainer hyper-parameters for a single-epoch QLoRA run.
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,      # effective batch size of 2 per device
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument.
    eval_strategy="steps",
    eval_steps=0.2,                     # float < 1: evaluate every 20% of the training steps
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="none"  # Disable WandB logging
)



In [None]:
def _join_prompt_and_completion(batch):
    """Concatenate the chat-templated ``prompt`` and ``completion`` columns.

    Args:
        batch: Either a batch (column name -> list of strings) or a single
            example (column name -> string).

    Returns:
        A list of full training texts for a batch, or one string for a single
        example.
    """
    prompts, completions = batch["prompt"], batch["completion"]
    if isinstance(prompts, str):
        return prompts + completions
    return [prompt + completion for prompt, completion in zip(prompts, completions)]


# The dataset exposes `prompt` / `completion` columns that were already
# rendered through the chat template and has no `text` column, so the training
# text is built explicitly instead of pointing `dataset_text_field` at a
# missing column.
# NOTE(review): this also avoids relying on TRL's dataset-format
# auto-detection, which appears to re-apply the chat template - confirm
# against the installed TRL version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    formatting_func=_join_prompt_and_completion,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Run training. The result is a TrainOutput summary (global step, loss,
# metrics), so it gets its own name and `model` keeps referring to the network
# instead of being overwritten by that summary.
train_output = trainer.train()
train_output

Step,Training Loss,Validation Loss
180,1.599,2.010266
360,1.9224,1.990724
540,2.1022,1.967277
720,2.0904,1.950833
900,1.8191,1.940507


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=900, training_loss=1.952250833246443, metrics={'train_runtime': 4722.7004, 'train_samples_per_second': 0.381, 'train_steps_per_second': 0.191, 'total_flos': 8873028197007360.0, 'train_loss': 1.952250833246443, 'epoch': 1.0})

In [None]:
# Keep a handle on the model object held by the trainer after training.
fine_tuned_model = trainer.model


In [None]:
# Save the fine-tuned weights and the tokenizer side by side. The directory
# comes from `new_model` so it stays in sync with the training output
# directory and the Hub repository name.
fine_tuned_model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('finetuned__qwen/tokenizer_config.json',
 'finetuned__qwen/special_tokens_map.json',
 'finetuned__qwen/vocab.json',
 'finetuned__qwen/merges.txt',
 'finetuned__qwen/added_tokens.json',
 'finetuned__qwen/tokenizer.json')

In [None]:
# Push the trained adapter and the tokenizer to the Hub repository named by
# `new_model`. `fine_tuned_model` is used because it always refers to the
# trained network, whatever `model` is bound to at this point.
fine_tuned_model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)

adapter_model.safetensors:   0%|          | 0.00/120M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Nishanth1904/finetuned_qwen/commit/c7a7940dee204d943e57fcc22b99976aae92fe98', commit_message='Upload tokenizer', commit_description='', oid='c7a7940dee204d943e57fcc22b99976aae92fe98', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nishanth1904/finetuned_qwen', endpoint='https://huggingface.co', repo_type='model', repo_id='Nishanth1904/finetuned_qwen'), pr_revision=None, pr_num=None)

In [None]:
# Check the embedding size of the trained network. It is read from
# `fine_tuned_model`, which always refers to the trained network, so the check
# does not depend on what `model` is bound to at this point.
embedding_size = fine_tuned_model.get_input_embeddings().weight.size()
print(f"Embedding size: {embedding_size}")

Embedding size: torch.Size([151936, 2048])


In [None]:
# Interactive chat with the fine-tuned model; type "quit" to stop.
while True:
    user_input = input("\nYou: ")
    if user_input.strip().lower() == 'quit':
        break

    # Build the prompt with the tokenizer's own chat template so that the role
    # names and special tokens match what the model saw during fine-tuning
    # (a hand-written "<|im_start|>human" turn does not).
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": user_input}],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize and place the tensors on the same device as the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(fine_tuned_model.device)

    # Generate response. `max_new_tokens` bounds only the reply, so a long
    # prompt cannot use up the whole generation budget.
    outputs = fine_tuned_model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.encode("<|im_end|>", add_special_tokens=False)[0]
    )

    # Decode only the newly generated tokens. Slicing by prompt length is
    # robust even when the user's text itself contains template markers.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    print("\nAssistant:", response)


You: traetment for hairloss





Assistant: Hello, You are suffering from alopecia, which is a condition of hair loss. For this you can use the following medicines:- 1. Minoxidil: This is a topical medicine which will stimulate the hair growth. Apply on the bald areas twice daily. 2. Finasteride: This is a oral medication. It has to be taken once in a day. If you have more information about your problem please share it with me and I'll be happy to help you. Take care Regards. Dr. Shinas Hussain

You: Quit


In [None]:
from huggingface_hub import HfApi, HfFolder, Repository, create_repo

# Login to Hugging Face Hub (only required once)
# !huggingface-cli login

from huggingface_hub import upload_folder

# Define the repository name
repo_name = "finetuned__qwen"  # Change this to your desired repository name
model_save_path = "/content/finetuned__qwen"  # Path to the folder where the model is saved locally

# Create the repository on the Hub, or reuse it if it already exists, so that
# re-running this cell does not fail.
# Set private=True if you want a private repo.
repo_url = create_repo(repo_name, private=False, exist_ok=True)

# Use the fully qualified "<namespace>/<name>" id reported by the Hub rather
# than a hard-coded username, so the upload target and the printed link always
# refer to the repository that was just created.
repo_id = repo_url.repo_id

# Upload the whole local folder to the Hub.
# NOTE(review): the recorded output shows optimizer/scheduler state files being
# uploaded as well; consider `ignore_patterns` if only the model is wanted.
upload_folder(
    folder_path=model_save_path,
    repo_id=repo_id,
    commit_message="Initial commit of the trained model"
)

print(f"Model uploaded to https://huggingface.co/{repo_id}")


adapter_model.safetensors:   0%|          | 0.00/120M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Upload 14 LFS files:   0%|          | 0/14 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/120M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/240M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/120M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/240M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Model uploaded to https://huggingface.co/your-username/finetuned__qwen
