In [1]:
!pip install huggingface_hub==0.25.0
!pip install -U transformers
!pip install -U datasets
!pip install -U accelerate
!pip install -U bitsandbytes
!pip install -U peft
!pip install trl==0.9.6

Collecting huggingface_hub==0.25.0
  Downloading huggingface_hub-0.25.0-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.25.0-py3-none-any.whl (436 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/436.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m436.4/436.4 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.36.0
    Uninstalling huggingface-hub-0.36.0:
      Successfully uninstalled huggingface-hub-0.36.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.50.0 requires huggingface-hub<2.0,>=0.33.5, but you have huggingface-hub 0.25.0 which is incompatible.
diffusers 0.36.0 requires huggingface-hub<2.0,>=0.34.0, but you

In [5]:
# Import necessary libraries for the fine-tuning pipeline
import os
import torch
from datasets import load_dataset  # For loading the medical dataset
from transformers import (AutoModelForCausalLM,  # For loading pre-trained language models
                         AutoTokenizer,  # For tokenizing text data
                         BitsAndBytesConfig,  # For model quantization settings
                         TrainingArguments,  # For configuring training hyperparameters
                         logging)
from peft import LoraConfig, get_peft_model  # Parameter-Efficient Fine-Tuning with LoRA
from huggingface_hub import login  # For authenticating with HuggingFace
from trl import SFTTrainer, setup_chat_format  # For supervised fine-tuning and chat formatting
import bitsandbytes as bnb  # For 4-bit quantization to reduce memory usage

# Set up authentication with HuggingFace using a secure token from Kaggle secrets
hf_token = os.environ.get("hugging_face")
login(token=hf_token)

# Define the model and dataset to use
base_model = "google/gemma-2-2b-it"  # Starting with Google's Gemma 2 2B instruction-tuned model
new_model = "Gemma-2-2b-it-ChatDoctor"  # Name for our fine-tuned medical assistant model
dataset_name = "lavita/ChatDoctor-HealthCareMagic-100k"  # Medical Q&A dataset with 100k examples

# Configure hardware acceleration based on the GPU capabilities
if torch.cuda.get_device_capability()[0] >= 8:
   torch_dtype = torch.bfloat16  # Use bfloat16 precision on newer GPUs (Ampere architecture or newer)
   attn_implementation = "flash_attention_2"  # Use Flash Attention 2 for faster training on modern GPUs
else:
   torch_dtype = torch.float16  # Fall back to float16 on older GPUs
   attn_implementation = "eager"  # Use standard attention implementation on older hardware

# Configure model quantization settings to reduce memory requirements
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,  # Load model in 4-bit precision instead of 16-bit to save memory
   bnb_4bit_quant_type="nf4",  # Use normalized float 4 quantization for better quality
   bnb_4bit_compute_dtype=torch_dtype,  # Use the precision determined by GPU capabilities
   bnb_4bit_use_double_quant=True,  # Apply double quantization for additional memory savings
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Load the pre-trained model with quantization and hardware optimization settings
model = AutoModelForCausalLM.from_pretrained(
  base_model,                           # Use the Gemma 2 2B model we specified earlier
  quantization_config=bnb_config,       # Apply the 4-bit quantization configuration to reduce memory usage
  device_map="auto",                    # Automatically manage model placement across available GPUs/CPU
  attn_implementation=attn_implementation  # Use the attention implementation we selected based on GPU capabilities
)

# Load the tokenizer for the same model
tokenizer = AutoTokenizer.from_pretrained(
  base_model,                           # Use the tokenizer that matches our base model
  trust_remote_code=True                # Allow running remote code in the tokenizer implementation for full functionality
)

# This function identifies all linear layers in the model that should be modified with LoRA
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit                # Look specifically for 4-bit quantized linear layers
  lora_module_names = set()              # Create an empty set to store eligible layer names

  # Iterate through all named modules in the model
  for name, module in model.named_modules():
      if isinstance(module, cls):        # Check if the module is a 4-bit linear layer
          names = name.split('.')        # Split the full path name into components
          if len(names) == 1:            # If it's a top-level module
              lora_module_names.add(names[0])
          else:                          # If it's a nested module
              lora_module_names.add(names[-1])  # Add only the base name of the module

  lora_module_names.discard('lm_head')   # Remove the language model head, which shouldn't be modified with LoRA
  return list(lora_module_names)         # Return the list of eligible layer names

# Get the list of modules that will be fine-tuned using LoRA
modules = find_all_linear_names(model)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [7]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
   r=16,
   lora_alpha=32,
   lora_dropout=0.05,
   bias="none",
   task_type="CAUSAL_LM",
   target_modules=modules
)

tokenizer.chat_template = None
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [8]:
import re
from datasets import load_dataset

dataset = load_dataset(
   dataset_name,
   split="all",
   cache_dir="./cache"
)

dataset = dataset.shuffle(seed=42).select(range(2000))

def clean_text(text):
   text = re.sub(r'\b(?:www\.[^\s]+|http\S+)', '', text)
   text = re.sub(r'\b(?:aCht Doctor(?:.com)?(?:.in)?|www\.(?:google|yahoo)\S*)', '', text)
   text = re.sub(r'\s+', ' ', text)
   return text.strip()

def format_chat_template(row):
   cleaned_instruction = clean_text(row["instruction"])  # Очистка инструкции
   cleaned_input = clean_text(row["input"])             # Очистка входных данных
   cleaned_output = clean_text(row["output"])           # Очистка выходных данных

   row_json = [
       {"role": "system", "content": cleaned_instruction},
       {"role": "user", "content": cleaned_input},
       {"role": "assistant", "content": cleaned_output}
   ]
   row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
   return row

dataset = dataset.map(format_chat_template, num_proc=4)
dataset = dataset.train_test_split(test_size=0.1)
data_collator = lambda batch: tokenizer(
   batch["text"],
   return_tensors="pt",
   padding=True,
   truncation=True
)

README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

data/train-00000-of-00001-5e7cb295b9cff0(…):   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=10,
    save_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    fp16=True,
    optim="paged_adamw_8bit",
    max_grad_norm=0.3,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=512,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
)

model.config.use_cache = False


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  super().__init__(


In [18]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 


[34m[1mwandb[0m: Enter your choice:

 1


[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohiraxonsobirjonova[0m ([33mmohiraxonsobirjonova-ustudy[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss




TrainOutput(global_step=339, training_loss=2.3547334699152493, metrics={'train_runtime': 3978.2946, 'train_samples_per_second': 1.357, 'train_steps_per_second': 0.085, 'total_flos': 1.6901381162115072e+16, 'train_loss': 2.3547334699152493, 'epoch': 3.0})

In [19]:
merged_model = model.merge_and_unload()
merged_model.save_pretrained(new_model)
merged_model.push_to_hub(new_model, use_temp_dir=False)



README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...tDoctor/model.safetensors:   1%|1         | 28.7MB / 2.22GB            

CommitInfo(commit_url='https://huggingface.co/Mohira/Gemma-2-2b-it-ChatDoctor/commit/d35f3493b95bf64cb1918909faac747733ea7767', commit_message='Upload Gemma2ForCausalLM', commit_description='', oid='d35f3493b95bf64cb1918909faac747733ea7767', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Mohira/Gemma-2-2b-it-ChatDoctor', endpoint='https://huggingface.co', repo_type='model', repo_id='Mohira/Gemma-2-2b-it-ChatDoctor'), pr_revision=None, pr_num=None)

In [24]:
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")



('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/chat_template.jinja',
 './final_model/tokenizer.model',
 './final_model/added_tokens.json',
 './final_model/tokenizer.json')

In [1]:
from peft import PeftModel

base_model_name = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained("./final_model")

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(model, "./final_model")

model = model.merge_and_unload()

test_prompt = """<|im_start|>system
If you are a doctor, please answer the medical questions based on the patient's description.<|im_end|>
<|im_start|>user
I have a headache and fever for 2 days. What should I do?<|im_end|>
<|im_start|>assistant
"""

inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

response = tokenizer.decode(outputs[0], skip_special_tokens=False)
print(response)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


<|im_start|><|im_start|>system
If you are a doctor, please answer the medical questions based on the patient's description.<|im_end|>
<|im_start|>user
I have a headache and fever for 2 days. What should I do?<|im_end|>
<|im_start|>assistant
It sounds like you're not feeling well! Two days of headaches and fevers could be related to something more serious. It's important to get checked out by a professional. Here are some things you can do:

1. **Call your doctor or healthcare provider.** Let them know about your symptoms and they will be able to assess whether further action is needed. 
2. **Stay hydrated.** Drink plenty of fluids like water, tea, and broth. This helps your body recover from illness.
3. **Rest.** Give yourself time to recover and avoid strenuous activities.  
4. **Monitor your symptoms.** Keep track of how you feel, including any changes in your condition. 
5. **Follow your doctor's instructions.** They may provide additional advice or treatment based on their assessme

In [2]:
from peft import PeftModel

base_model_name = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained("./final_model")

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(model, "./final_model")

model = model.merge_and_unload()

test_prompt = """<|im_start|>system
If you are a doctor, please answer the medical questions based on the patient's description.<|im_end|>
<|im_start|>user
I have a headache and fever for 2 days. What should I do?<|im_end|>
<|im_start|>assistant
"""

inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

response = tokenizer.decode(outputs[0], skip_special_tokens=False)
print(response)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



<|im_start|><|im_start|>system
If you are a doctor, please answer the medical questions based on the patient's description.<|im_end|>
<|im_start|>user
I have a headache and fever for 2 days. What should I do?<|im_end|>
<|im_start|>assistant
I understand you're feeling awful with a headache and fever for two days.  It sounds like you need some help to feel better.

Here is what you can do:

* **Hydrate:** Drink plenty of fluids. Water is best, but you can also try juice or broth.
* **Rest:** Get plenty of rest. Your body needs time to recover. 
* **Over-the-counter medications:** Consider taking over-the-counter pain relievers such as ibuprofen or acetaminophen.  Always follow the directions on the packaging.
* **Monitor your symptoms:** Pay close attention to how you're feeling. If you notice any changes in your symptoms, consult a doctor. 


Let me know if you have any other questions! 
<end_of_turn><eos>    
<end_of_turn><eos><eos><end_of_turn><eos><eos><eos><eos><end_of_turn><eos><e