## Login to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token,  # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


In [2]:
model_name = "Waktaverse-Llama-3-KO-8B-Instruct-ORPO"  # ADD YOUR MODEL NAME HERE
username = "PathFinderKR"  # ADD YOUR USERNAME HERE
repo_id = f"{username}/{model_name}"  # repository id

## Login to Weights & Biases

In [3]:
import wandb

api_key = os.getenv("WANDB_API_KEY")
wandb.login(
    key=api_key  # ADD YOUR API KEY HERE
)
wandb.init(project=model_name)

[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m ([33mwaktaverse[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pathfinder/.netrc


True

## Downloads

In [4]:
#!pip install huggingface_hub
#!pip install wandb
#!pip install transformers
#!pip install bitsandbytes
#!pip install peft
#!pip install trl
#!pip install accelerate
#!pip install datasets
#!pip install scikit-learn
#!pip install packaging
#!pip install ninja
#!pip install flash-attn --no-build-isolation

## Imports

In [5]:
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from peft import LoraConfig, PeftModel
from trl import ORPOConfig, ORPOTrainer

# datasets
from datasets import load_dataset

## Device

In [6]:
# Device setup
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    "mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cuda:0


In [7]:
# Flash Attention Implementation
if device == "cuda:0":
    if torch.cuda.get_device_capability()[0] >= 8: # Ampere, Ada, or Hopper GPUs
        attn_implementation = "flash_attention_2"
        torch_dtype = torch.bfloat16
    else:
        attn_implementation = "eager"
        torch_dtype = torch.float16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float32
print(f"Attention Implementation = {attn_implementation}")

Attention Implementation = flash_attention_2


## Hyperparameters

In [8]:
################################################################################
# seed
################################################################################
seed=42
torch.manual_seed(seed)

################################################################################
# Tokenizer parameters
################################################################################
max_tokens=1024
padding="do_not_pad" # "max_length", "longest", "do_not_pad"
truncation=True

################################################################################
# Generation parameters
################################################################################
num_return_sequences=1
min_new_tokens=1
max_new_tokens=1024
do_sample=True
temperature=0.6
top_k=40
top_p=0.9
repetition_penalty=1.1

################################################################################
# Dataset parameters
################################################################################
validation_size=0.1

################################################################################
# bitsandbytes parameters
################################################################################
load_in_4bit=True
bnb_4bit_compute_dtype=torch_dtype
bnb_4bit_quant_type="nf4" # "nf4", #fp4"
bnb_4bit_use_double_quant=True

################################################################################
# LoRA parameters
################################################################################
task_type="CAUSAL_LM"
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
r=8
lora_alpha=16
lora_dropout=0.05
bias="none"

################################################################################
# ORPO parameters
################################################################################
output_dir="./results"
logging_dir="./logs"
save_strategy="epoch" # "steps", "epoch"
logging_strategy="steps" # "steps", "epoch"
if logging_strategy == "steps":
    logging_steps=10
else:
    logging_steps=None
evaluation_strategy="steps" # "steps", "epoch"
if evaluation_strategy == "steps":
    eval_steps=10
else:
    eval_steps=None
save_total_limit=1
report_to="wandb"

learning_rate=8e-6
beta=0.1
max_length=1024
max_prompt_length=512
num_train_epochs=1
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps=1
optim="adamw_torch" # "sgd", "adamw_torch"
weight_decay=0.1
lr_scheduler_type="cosine" # "constant", "linear", "cosine"
warmup_steps=10
warmup_ratio=0.1

## Model

In [9]:
# Model List

# gemma variants
# "google/gemma-1.1-7b-it"
# "google/codegemma-7b-it"

# llama2 variants
# "meta-llama/Meta-Llama-3-8B-Instruct" // downloaded
# "codellama/CodeLlama-7b-Instruct-hf"
# "PathFinderKR/Waktaverse-Llama-3-KO-8B-Instruct"

# mistral variants
# "mistralai/Mistral-7B-Instruct-v0.2"

# solar variants
# "upstage/SOLAR-10.7B-Instruct-v1.0" // downloaded
# "PathFinderKR/Waktaverse-SOLAR-KO-10.7B-Instruct"

In [10]:
# Model ID for base model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [11]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # add padding token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant
)

In [13]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    attn_implementation=attn_implementation,
    torch_dtype=torch_dtype,
    quantization_config=quantization_config
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
# display the model architecture
display(Markdown(f'```{model}```'))

```LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)```

## Dataset

In [15]:
# Dataset ID
dataset_id = "orpo-explorers/OpenHermesPreferences-10k"

In [16]:
# Load dataset
dataset = load_dataset(dataset_id)

In [17]:
# Dataset information
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'category', 'prompt', 'candidates_completions', 'candidate_policies', 'ranks', 'rank_str', 'chosen_policy', 'chosen', 'rejected_policy', 'rejected'],
        num_rows: 10000
    })
})

In [18]:
# Dataset example
dataset["train"][0]

{'source': None,
 'category': None,
 'prompt': 'You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old. Read this: Southeast Tucson continues to experience rapid residential development. The area includes Davis-Monthan Air Force Base. The area is considered to be south of Golf Links Road. It is the home of Santa Rita High School, Chuck Ford Park (Lakeside Park), Lakeside Lake, Lincoln Park (upper and lower), The Lakecrest Neighborhoods, and Pima Community College East Campus. The Atterbury Wash with its access to excellent bird watching is also located in the Southeast Tucson area. The suburban community of Rita Ranch houses many of the military families from Davis-Monthan, and is near the southeastern-most expansion of the current city limits. Close by Rita Ranch and also within the city limits lies Civano, a planned development meant to showcase ecologically sound building practices and lifestyles.\n\nWhat is the name of the High 

## Preprocessing

In [19]:
# ORPO dataset format: 
# {"prompt": [str], "chosen": [str], "rejected": [str]} in chat template
def preprocess_function(example):
    example["chosen"] = tokenizer.apply_chat_template(example["chosen"], tokenize=False)
    example["rejected"] = tokenizer.apply_chat_template(example["rejected"], tokenize=False)
    return example

dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [20]:
# Split the dataset into a training and a validation dataset
dataset = dataset["train"].train_test_split(test_size=validation_size, seed=seed)

# Number of questions in the train, validation dataset
print(f"Number of questions in the train dataset: {len(dataset['train'])}")
print(f"Number of questions in the validation dataset: {len(dataset['test'])}")

Number of questions in the train dataset: 9000
Number of questions in the validation dataset: 1000


In [21]:
# Display the first example
dataset["train"][0]

{'source': None,
 'category': None,
 'prompt': "User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer. Given the task definition and input, reply with output. In this task, you're given four sentences of a story written in natural language, and one last sentence (Sentence5). Your job is to classify whether the last sentence completes the rest of the story coherently or not by providing 'Yes' or 'No'.\n\nSentence1: Ann and Tim loved astrology. Sentence2: In fact, the met while they were both in astronomy class in college! Sentence3: When they married and became pregnant, they had a dilemma. Sentence4: They couldn't decide on a meaningful name for their baby daughter. \n Sentence5: Tim and Anna decided they would ask their veterinarian.",
 'candidates_completions': [' No, the last sentence does not complete the rest of the story coherently. The story is about the main c

In [22]:
# Dataset example
print(f"Chosen: {dataset['train'][0]['chosen']}")
print(f"Rejected: {dataset['train'][0]['rejected']}")

Chosen: <|begin_of_text|><|start_header_id|>user<|end_header_id|>

User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer. Given the task definition and input, reply with output. In this task, you're given four sentences of a story written in natural language, and one last sentence (Sentence5). Your job is to classify whether the last sentence completes the rest of the story coherently or not by providing 'Yes' or 'No'.

Sentence1: Ann and Tim loved astrology. Sentence2: In fact, the met while they were both in astronomy class in college! Sentence3: When they married and became pregnant, they had a dilemma. Sentence4: They couldn't decide on a meaningful name for their baby daughter. 
 Sentence5: Tim and Anna decided they would ask their veterinarian.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

No, the last sentence does not complete the rest of the story c

In [23]:
print(f"Chosen: {dataset['test'][0]['chosen']}")
print(f"Rejected: {dataset['test'][0]['rejected']}")

Chosen: <|begin_of_text|><|start_header_id|>user<|end_header_id|>

How can I create a two-dimensional array of size 5 by 5 in C language?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

You can create a two-dimensional array of size 5 by 5 in C language using the following code:

```C
#include<stdio.h>

int main() {
    int arr[5][5];

    for(int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            arr[i][j] = 0;
        }
    }

    return 0;
}
```

In this code, we declare a two-dimensional array `arr` of size 5 by 5 using the syntax `int arr[5][5];`. This creates a 5x5 matrix with 25 elements.

To initialize all the elements of the array to 0, we use nested `for` loops. The outer loop iterates over the rows, and the inner loop iterates over the columns. Inside the nested loops, we assign the value 0 to each element of the array using the statement `arr[i][j] = 0;`.

After executing this code, you will have a two-dimensional array `arr` of size 5 by 5, with

In [24]:
# Train on only a subset of the dataset for demonstration purposes
dataset["train"] = dataset["train"].select(range(100))
dataset["test"] = dataset["test"].select(range(10))

## Inference before Training

In [25]:
def generate_response(system ,user):
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=False
    )
    
    input_ids = tokenizer.encode(
        prompt,
        max_length=max_tokens,
        padding=padding,
        truncation=truncation,
        add_special_tokens=True,
        return_tensors="pt"
    ).to(device)
    
    outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=num_return_sequences,
        min_new_tokens=min_new_tokens,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=False)

In [26]:
system_prompt = "You are a helpful assistant. Respond to the following user prompt. Use Korean only. 한국어만 사용하세요."

In [27]:
#user_prompt = "Write me a poem about Machine Learning."
user_prompt = "머신러닝에 대한 시를 써주세요."

In [28]:
response = generate_response(system_prompt, user_prompt)
print(response)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant. Respond to the following user prompt. Use Korean only. 한국어만 사용하세요.<|eot_id|><|start_header_id|>user<|end_header_id|>

머신러닝에 대한 시를 써주세요.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

머신 러닝의 노래

머신 러닝의 꿈은 하늘 높이
데이터를 모으면 새로운 세상
알고리즘의 춤을 추는 밤
인공지능의 새벽을 맞이할 것

자료의 흐름 속에서 우리는 찾는다
패턴과 상관관계를 파악하여
새로운 지식을 얻는 데 성공하라
인간의 삶을 더 잘 이해할 수 있게

머신 러닝의 노래는 끝나지 않아
계속적으로 발전하고 있는 그릇
인류의 미래를 예측하는 데 도움이 되리
새로운 세상을 창조하는 데 일조하게

(Note: This is a poem written in Korean about machine learning.)<|eot_id|>


##  Odds Ratio Preference Optimization 

In [29]:
lora_config = LoraConfig(
    task_type=task_type,
    target_modules=target_modules,
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias
)

In [30]:
orpo_args = ORPOConfig(
    output_dir=output_dir,
    logging_dir=logging_dir,
    save_strategy=save_strategy,
    logging_strategy=logging_strategy,
    evaluation_strategy=evaluation_strategy,
    save_total_limit=save_total_limit,
    report_to=report_to,
    
    learning_rate=learning_rate,
    beta=beta,
    max_length=max_length,
    max_prompt_length=max_prompt_length,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    weight_decay=weight_decay,
    lr_scheduler_type=lr_scheduler_type,
    warmup_steps=warmup_steps,
    warmup_ratio=warmup_ratio,
    seed=seed
)

In [31]:
trainer = ORPOTrainer(
    model=model,
    tokenizer=tokenizer,
    args=orpo_args,
    peft_config=lora_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"]
)



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [32]:
trainer.train()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112820344447352, max=1.0…

../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [132,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [132,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [132,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [132,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [132,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [132,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [132,

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
wandb.finish()
trainer.save_model(model_name)

## Inference after Training

In [None]:
system_prompt = "You are a helpful assistant. Respond to the following user prompt. Use Korean only. 한국어만 사용하세요."

In [None]:
#user_prompt = "Write me a poem about Machine Learning."
user_prompt = "머신러닝에 대한 시를 써주세요."

In [None]:
response = generate_response(system_prompt, user_prompt)
print(response)

## Upload Model

In [None]:
# Flush memory
import gc

del trainer, model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float16
)
model = PeftModel.from_pretrained(base_model, model_name)
model = model.merge_and_unload()

In [None]:
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Push model and tokenizer to Hugging Face Hub
model.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)
tokenizer.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)