In [3]:
!pip install -q  torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Hub id of the base checkpoint, and the local directory name used for the
# fine-tuned adapter later in the notebook.
base_model_name = "llSourcell/medllama2_7b"
refined_model = "opdx"

# Tokenizer: loaded from the same repo as the model.
llama_tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)
# Pad on the right ("Fix for fp16" per the original author) and reuse the
# end-of-sequence token as the padding token.
llama_tokenizer.padding_side = "right"  # Fix for fp16
llama_tokenizer.pad_token = llama_tokenizer.eos_token

# 4-bit NF4 quantization with float16 compute and no double quantization.
quant_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Base model: quantized on load; the "" key of `device_map` maps the root
# module, so the whole model is placed on device index 0.
model_load_kwargs = {
    "quantization_config": quant_config,
    "device_map": {"": 0},
}
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, **model_load_kwargs)

# Config overrides applied before training.
base_model.config.pretraining_tp = 1
base_model.config.use_cache = False

Downloading shards:   0%|          | 0/2 [01:29<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Dataset: the "train" split of the conversations corpus on the Hub.
data_name = "satyam-03/ddx-conversations-10k"
training_data = load_dataset(data_name, split="train")

# LoRA adapter hyper-parameters for causal language modelling.
peft_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Training hyper-parameters, grouped by concern and merged into a single
# TrainingArguments object below.
batching_args = dict(
    per_device_train_batch_size=6,
    gradient_accumulation_steps=1,
    group_by_length=True,
)
optimizer_args = dict(
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
)
schedule_args = dict(
    num_train_epochs=2,
    max_steps=-1,
    warmup_ratio=0.03,
    lr_scheduler_type="linear",
)
precision_args = dict(
    fp16=True,
    bf16=False,
    gradient_checkpointing=True,
)
bookkeeping_args = dict(
    output_dir="./results_modified",
    save_steps=25,
    logging_steps=2,
    report_to="tensorboard",
)
train_params = TrainingArguments(
    **bookkeeping_args,
    **schedule_args,
    **batching_args,
    **optimizer_args,
    **precision_args,
)

# Supervised fine-tuning trainer: reads the "text" column of the dataset and
# is given the LoRA config alongside the quantized base model.
fine_tuning = SFTTrainer(
    model=base_model,
    args=train_params,
    train_dataset=training_data,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    peft_config=peft_parameters,
)

# Run training, then write the trained model out under `refined_model`.
fine_tuning.train()
fine_tuning.model.save_pretrained(refined_model)

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/1090 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
2,1.2508
4,1.3012
6,1.3964
8,1.3824
10,1.4838
12,1.5211
14,1.5184
16,1.4125
18,1.4722
20,1.3836




In [None]:
fine_tuning.model.save_pretrained('opdx')
!ls

opdx  results_modified	sample_data


## Convert with llama.cpp — convert-lora-to-ggml.py

In [3]:
# Fetch the llama.cpp sources into ./llama.cpp. No branch, tag or commit is
# given, so this checks out whatever the default branch holds at clone time.
# NOTE(review): the conversion step further down expects
# convert-lora-to-ggml.py to exist in this checkout — consider pinning a
# known-good commit so the notebook stays reproducible.
!git clone https://github.com/ggerganov/llama.cpp.git

Cloning into 'llama.cpp'...
Updating files:  81% (505/618)
Updating files:  82% (507/618)
Updating files:  83% (513/618)
Updating files:  84% (520/618)
Updating files:  85% (526/618)
Updating files:  86% (532/618)
Updating files:  87% (538/618)
Updating files:  88% (544/618)
Updating files:  89% (551/618)
Updating files:  90% (557/618)
Updating files:  91% (563/618)
Updating files:  92% (569/618)
Updating files:  93% (575/618)
Updating files:  94% (581/618)
Updating files:  95% (588/618)
Updating files:  96% (594/618)
Updating files:  97% (600/618)
Updating files:  98% (606/618)
Updating files:  99% (612/618)
Updating files: 100% (618/618)
Updating files: 100% (618/618), done.


In [9]:
# Change the kernel's working directory into the cloned repository so the
# commands in the following cells run from there.
# NOTE(review): the path is relative, so re-running this cell while already
# inside llama.cpp will look for a nested llama.cpp directory.
%cd llama.cpp

c:\Users\asus\Desktop\Dl Project\DL-Final\temp\finetuning\llama.cpp


In [16]:
!pip install -r requirements.txt

Collecting numpy~=1.24.4 (from -r ./requirements/requirements-convert.txt (line 1))
  Downloading numpy-1.24.4-cp310-cp310-win_amd64.whl.metadata (5.6 kB)
Collecting sentencepiece~=0.1.98 (from -r ./requirements/requirements-convert.txt (line 2))
  Downloading sentencepiece-0.1.99-cp310-cp310-win_amd64.whl.metadata (8.3 kB)
Collecting transformers<5.0.0,>=4.35.2 (from -r ./requirements/requirements-convert.txt (line 3))
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
     ---------------------------------------- 0.0/138.0 kB ? eta -:--:--
     -- ------------------------------------- 10.2/138.0 kB ? eta -:--:--
     ---------- -------------------------- 41.0/138.0 kB 487.6 kB/s eta 0:00:01
     ----------------------------- ------ 112.6/138.0 kB 939.4 kB/s eta 0:00:01
     ------------------------------------ 138.0/138.0 kB 905.9 kB/s eta 0:00:00
Collecting gguf>=0.1.0 (from -r ./requirements/requirements-convert.txt (line 4))
  Using cached gguf-0.6.0-py3-none-any

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llava 0.1.0 requires tokenizers==0.12.1, but you have tokenizers 0.19.1 which is incompatible.
torchaudio 2.0.1+cu117 requires torch==2.0.0, but you have torch 2.1.2 which is incompatible.
torchvision 0.15.1+cu117 requires torch==2.0.0, but you have torch 2.1.2 which is incompatible.


In [17]:
# !python convert-lora-to-ggml.py [path to opdx folder]   # pass the adapter folder itself, not .../adapter_config.json — the script appends the file names to this path

Traceback (most recent call last):
  File "c:\Users\asus\Desktop\Dl Project\DL-Final\temp\finetuning\llama.cpp\convert-lora-to-ggml.py", line 68, in <module>
    model = load_file(input_model, device="cpu")
  File "c:\Users\asus\anaconda3\envs\llava-med\lib\site-packages\safetensors\torch.py", line 308, in load_file
    with safe_open(filename, framework="pt", device=device) as f:
FileNotFoundError: No such file or directory: "../opdx/adapter_config.json\\adapter_model.safetensors"


In [None]:
# ollama pull tinyllama
# touch ModelfileTinyllama

In [None]:
# Debug aid: list the processes visible from the notebook's shell.
!ps

    PID TTY          TIME CMD
      1 ?        00:00:00 docker-init
      6 ?        00:00:08 node
     10 ?        00:00:01 oom_monitor.sh
     12 ?        00:00:00 run.sh
     14 ?        00:00:02 kernel_manager_
     35 ?        00:00:00 tail
     58 ?        00:00:05 python3 <defunct>
     59 ?        00:00:00 colab-fileshim.
    105 ?        00:00:08 jupyter-noteboo
    106 ?        00:00:05 dap_multiplexer
    238 ?        00:00:09 python3
  13965 ?        00:00:02 python3
  13994 ?        00:00:00 python3
  14029 ?        00:00:00 language_servic
  14034 ?        00:00:10 node
  14104 ?        00:00:00 sleep
  14105 ?        00:00:00 ps


In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer

# Base checkpoint on the Hub and the local directory holding the LoRA adapter
# saved by the training cells.
model_name = "llSourcell/medllama2_7b"
adapters_name = 'opdx'

# Choose the device once and use it for both the model and the inputs.
# Previously the model was always mapped to GPU index 0 while `device` could
# fall back to CPU, so the CPU fallback could never actually be used.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    # The "" key maps the root module, i.e. the whole model goes to one device.
    device_map={"": 0} if device.type == "cuda" else {"": "cpu"}
)
# Wrap the base model with the fine-tuned adapter weights.
m = PeftModel.from_pretrained(m, adapters_name)
m = m.to(device)
# Inference only: make sure dropout layers are disabled.
m.eval()

# Load the tokenizer
tok = LlamaTokenizer.from_pretrained(model_name)
tok.bos_token_id = 1

# Tokenize the prompt and place the tensors on the same device as the model
# (the earlier detour through the CPU was a no-op and has been dropped).
inputs = tok("Today was an amazing day because", return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate text: sampling without beam search, capped at 100 new tokens.
# Output differs from run to run because sampling is enabled and no seed is set.
with torch.no_grad():
    outputs = m.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)

decoded_outputs = tok.batch_decode(outputs, skip_special_tokens=True)
print(decoded_outputs)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



['Today was an amazing day because: I woke up feeling more energized and alert than I have been. I have been experiencing shortness of breath and fatigue. I feel like I am constantly in a haze or surrounded by fog. I was able to get out of bed and perform daily activities with less pain and difficulty compared to yesterday.\nWhat I ate and my whereabouts immediately prior to the incident are: I ate a fruit salad with a sweetener and white wine. I also did']


In [None]:
# Serialize the state dict of `m` to ./opdx-full.
# Requires `torch` and `m` from the inference cell to be present in the
# current kernel session; the NameError recorded below this cell came from
# running it in a session where `torch` had not been imported.
full_state = m.state_dict()
torch.save(full_state, './opdx-full')

NameError: name 'torch' is not defined

In [None]:
# Ask the adapter-wrapped model a domain question. Relies on `tok`, `m`,
# `device` and `torch` from the inference cell.
prompt = "What are the symptoms of Bronchitis ?"

# Tokenize and place the tensors on the model's device in one step; the
# previous move to the CPU followed immediately by a move to `device` was
# redundant.
inputs = tok(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate text: sampling without beam search, capped at 100 new tokens.
with torch.no_grad():
    outputs = m.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)

decoded_outputs = tok.batch_decode(outputs, skip_special_tokens=True)
print(decoded_outputs)

['What are the symptoms of Bronchitis ?\nMajor symptoms of bronchitis include a cough producing mucus (sputum), coughing up blood, wheezing sounds when breathing out, shortness of breath, pain when breathing in deeply, and a pinkish tint to the nose.\nWhat are the causes of chronic bronchitis?\nThe causes of chronic bronchitis are not clearly identified, but smoking, exposure to secondhand smoke, environmental poll']
