In [1]:
# modified from: https://www.datacamp.com/tutorial/llama3-fine-tuning-locally
# read also: https://huggingface.co/google/gemma-7b/blob/main/examples/example_sft_qlora.py
# read also: https://huggingface.co/google/gemma-2b-it

# pip install bitsandbytes==0.43.2


In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [3]:
#from huggingface_hub import login
#from kaggle_secrets import UserSecretsClient
#user_secrets = UserSecretsClient()

#hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
#login(token = hf_token)

#wb_token = user_secrets.get_secret("wandb")

#wandb.login(key=wb_token)
#run = wandb.init(
#    project='Fine-tune Llama 3 8B on Medical Dataset', 
#    job_type="training", 
#    anonymous="allow"
#)

In [4]:
base_model = "google/gemma-2b-it"
#"google/codegemma-1.1-7b-it"
#"google/gemma-2b"
#"google/codegemma-2b"
#"google/gemma-2b-it"
new_model = base_model.replace("google/", "haesleinhuepf/") + "-bia"

In [5]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
model, tokenizer = setup_chat_format(model, tokenizer)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [8]:
tokenizer.padding_side = 'right'

In [9]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [10]:
qa_jsonl_filename = "questions_answers.jsonl"

In [11]:
import json

data = []
with open(qa_jsonl_filename, 'r') as file:
    for line in file:
        json_object = json.loads(line.strip())
        data.append(json_object)


In [12]:
# Convert text to list of dictionaries
training_data = []

for d in data:
    question = d["question"]
    answer =   d["answer"]
    
    training_data.append(
        {
            "messages": [
                # {"role": "system", "content": """Enter a smart system message here."""},
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer}
            ]
        })

training_data[0]

{'messages': [{'role': 'user',
   'content': 'How can we calculate the average values along the first axis or ```axis=0``` in Python code?'},
  {'role': 'assistant',
   'content': '\nThis code imports the numpy library and creates two numpy arrays: image1 and image2. Image1 is initialized with all elements as 1, while image2 is filled with random numbers between 0 and 1. The np.mean function is then used on image2 with the axis parameter set to 0 to calculate the mean along each column of the array.\n\n```python\n\nimport numpy as np\n\nimage1 = np.ones((3,5))\nimage1\n\nimage2 = np.random.random((3,5))\nimage2\n\nnp.mean(image2, axis=0)\n\n```\n'}]}

In [13]:
dataset = [tokenizer.apply_chat_template(row_json["messages"], tokenize=False) for row_json in training_data]
dataset = {"text":dataset}
dataset["text"][0]

'<|im_start|>user\nHow can we calculate the average values along the first axis or ```axis=0``` in Python code?<|im_end|>\n<|im_start|>assistant\n\nThis code imports the numpy library and creates two numpy arrays: image1 and image2. Image1 is initialized with all elements as 1, while image2 is filled with random numbers between 0 and 1. The np.mean function is then used on image2 with the axis parameter set to 0 to calculate the mean along each column of the array.\n\n```python\n\nimport numpy as np\n\nimage1 = np.ones((3,5))\nimage1\n\nimage2 = np.random.random((3,5))\nimage2\n\nnp.mean(image2, axis=0)\n\n```\n<|im_end|>\n'

In [14]:
from datasets import Dataset
import pandas as pd

# Convert the array into a pandas DataFrame
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,text
0,<|im_start|>user\nHow can we calculate the ave...
1,<|im_start|>user\nHow can I write Python code ...
2,<|im_start|>user\nHow can we obtain the precis...
3,<|im_start|>user\nHow can we use indices in Py...
4,<|im_start|>user\nHow can we write Python code...


In [15]:
# Create a Hugging Face dataset from the DataFrame
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['text'],
    num_rows: 130
})

In [16]:
#dataset

In [17]:
dataset = dataset.train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 91
    })
    test: Dataset({
        features: ['text'],
        num_rows: 39
    })
})

In [18]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=10,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [19]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

In [20]:
trainer.train()

wandb: Currently logged in as: haesleinhuepf (haesleinhuepf-leipzig-university). Use `wandb login --relogin` to force relogin


Step,Training Loss,Validation Loss
90,0.776,1.193855
180,0.1931,1.523187
270,0.0947,1.793938
360,0.0603,2.079079
450,0.0376,2.227421




TrainOutput(global_step=450, training_loss=0.38125387470341393, metrics={'train_runtime': 219.034, 'train_samples_per_second': 4.155, 'train_steps_per_second': 2.054, 'total_flos': 1963683800358912.0, 'train_loss': 0.38125387470341393, 'epoch': 9.89010989010989})

In [21]:
trainer.save_model(new_model)



## merging

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
)

base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(base_model_reload, new_model)

model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Test model

In [23]:
messages = [{"role": "user", "content": """
Write Python code to load the image ../11a_prompt_engineering/data/blobs.tif,
segment the nuclei in it and
show the result
"""}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

outputs = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

  attn_output = torch.nn.functional.scaled_dot_product_attention(


<|im_start|>user

Write Python code to load the image ../11a_prompt_engineering/data/blobs.tif,
segment the nuclei in it and
show the result
<|im_end|>
<|im_start|>assistant
The code uses the pyclesperanto_prototype library to load an image from the file "../11a_prompt_engineering/data/blobs.tif", segment the nuclei in it, and display the result.

```python
import pyclesperanto_prototype as cle

image = cle.load_image("../../data/blobs.tif")
nuclei = cle.segment_nuclei(image)
cle.imshow(nuclei)
```
The pyclesperanto_prototype library provides a high-level interface for image processing tasks in Python. It simplifies the


In [24]:
model.save_pretrained(new_model + "_ft")

In [25]:
trainer.model.push_to_hub(new_model + "_ft", use_temp_dir=False)

HfHubHTTPError:  (Request ID: Root=1-66a797c6-66cd874414133ec00516a708;c98d2e3a-4e8e-4e24-86dc-7dbe404ae210)

403 Forbidden: You don't have the rights to create a model under the namespace "haesleinhuepf".
Cannot access content at: https://huggingface.co/api/repos/create.
If you are trying to create or update content, make sure you have a token with the `write` role.