In [1]:
# !pip install peft trl wandb

In [2]:
# !pip install -U bitsandbytes

In [3]:
# ! pip install --force-reinstall typing-extensions==4.5.0

In [4]:
# ! pip install datasets

In [5]:
# ! pip3 install -U transformers

In [6]:
# ! pip3 install evaluate

In [7]:
# ! pip3 install accelerate

In [32]:
# pip install -U flash-attn --no-build-isolation

In [33]:
# pip freeze | grep flash-attn

flash-attn==2.5.7
Note: you may need to restart the kernel to use updated packages.


## dataset train_test_split

In [9]:
import gc
import os
import torch
import pandas as pd
import numpy as np
import multiprocessing

from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import EarlyStoppingCallback
from transformers import BitsAndBytesConfig
from transformers import pipeline

from datasets import load_dataset

from trl import ORPOConfig, ORPOTrainer, setup_chat_format

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Notice! 
-> if you have some problem with datasets library or transformers library
---
Problem 1. error "no module named dataset" <br>
solution 1. !pip3 install datasets<br>
---
Problem 2. huggingface_hub Error <br>
Solution 2. ! pip3 install -U transformers <br>

--- 
are those errors belong GPU session was closed,
all the installation information was formatting 
so, if you restart GPU session, you must reinstall all the library, when install library or file before closed session
not Kernel restart only GPU session restart
---
and also follow this solution
1. pip3 install -r requirements.txt

In [10]:
# !pip freeze >> requirements.txt

### before Starting make a simple function 
#### this function use for working clock to parameter value

## CUDA load

In [11]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else :
    device = torch.device("cpu")
device

device(type='cuda')

In [12]:
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype=torch.bfloat16
else :
    attn_implementation = "eager"
    torch_dtype = torch.float16

In [13]:
attn_implementation

'flash_attention_2'

In [14]:
torch_dtype

torch.bfloat16

## Dataset load

In [15]:
ds = load_dataset("iamtarun/python_code_instructions_18k_alpaca",split="train")

In [16]:
dataset_name = "iamtarun/python_code_instructions_18k_alpaca"

In [17]:
access_token = "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"

In [18]:
model_name = "google/gemma-2b-it"
tokenizer_name = "google/gemma-2b-it"

## Checking Datasets Type and features

In [19]:
ds

Dataset({
    features: ['instruction', 'input', 'output', 'prompt'],
    num_rows: 18612
})

In [20]:
ds.shape

(18612, 4)

### using dataset's train_test_split function

In [21]:
ds.train_test_split(test_size=0.3)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 13028
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 5584
    })
})

## using sklearn.model_selection's train_test_split() 

In [22]:
# from sklearn.model_selection import train_test_split

In [23]:
# ds_sklearn = load_dataset(dataset_name)

In [24]:
# ds_sklearn

In [25]:
# train_set, test_set = train_test_split(ds_sklearn["train"], test_size=.3, random_state=1832)

In [26]:
# train_set

## Config

In [27]:
bnb_config=BitsAndBytesConfig(load_in_4bit=True,
                             bnb_4bit_quant_type="nf4",
                             bnb_4bit_compute_dtype=torch_dtype,
                             bnb_4bit_use_double_quant=True)

In [28]:
peft_config = LoraConfig(r=16,
                        lora_alpha=32,
                        lora_dropout=.05,
                        bias="none",
                        task_type="CAUSAL_LM",
                        target_modules=["up_proj","donw_proj","gate_proj","k_proj",
                                       "q_proj","v_proj","o_proj"])

## model & tokenizer load

In [29]:
model=AutoModelForCausalLM.from_pretrained(model_name,
                                          token=access_token,
                                          device_map="auto",
                                            attn_implementation=attn_implementation,
                                           quantization_config=bnb_config
                                          )

ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: you need flash_attn package version to be greater or equal than 2.1.0. Detected version 0.2.8.dev0. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,
                                         token=access_token,
                                         truncation=True,
                                         padding=True,
                                         max_length=100,
                                         )
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
! nvidia-smi

## processing Function

In [None]:
ds = ds.train_test_split(test_size=.3)

In [None]:
ds

In [None]:
train_set = ds["train"]

In [None]:
train_set

In [None]:
test_set = ds["test"]
test_set

In [None]:
def process(row):
    return tokenizer(row["instruction"],row["input"],row["output"],row["prompt"], return_tensors="pt", truncation=True, padding=True, max_length=100)

In [None]:
import multiprocessing

In [None]:
ds = ds.map(process,
           num_proc = multiprocessing.cpu_count(),
           load_from_cache_file=False,
           batched=True)
train_dataset = ds["train"]
test_dataset = ds["test"]

In [None]:
train_dataset

In [None]:
test_dataset

## model & Trainer arguments

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [None]:
training_args = TrainingArguments(output_dir="./eval_results_4",
                                 num_train_epochs=3,
                                 per_device_train_batch_size=2,
                                 per_device_eval_batch_size=2,
                                 weight_decay=0.01,
                                 logging_dir="./eval_logs_4",
                                 logging_steps=500,
                                 warmup_steps=300,
                                 dataloader_num_workers=4,
                                 eval_accumulation_steps=1,
                                 gradient_accumulation_steps=2,
                                 optim="adamw_torch",
                                 evaluation_strategy="steps",
                                 save_strategy="steps",
                                 do_eval=True,
                                 load_best_model_at_end=True)

In [None]:
import numpy as np
import evaluate

In [None]:
# ! pip3 install evaluate

In [None]:
acc_metrix = evaluate.load("accuracy")

In [None]:
def compute_matrix(eval_pred):
    logit, labels =eval_pred
    predict = np.argmax(logit, axis=-1)
    return acc_metrix.compute(predictions=predictions, references=labels)

In [None]:
model_trainer = Trainer(model,
                       args=training_args,
                       train_dataset=train_dataset,
                       eval_dataset=test_dataset,
                       tokenizer=tokenizer,
                       compute_metrics=compute_matrix,
#                        callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
                       )

In [None]:
model_trainer.train()

In [None]:
torch.cuda.is_available()