## dataset train_test_split

In [1]:
import multiprocessing
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import EarlyStoppingCallback

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# ! pip install datasets

In [3]:
# ! pip3 freeze | grep transformers

In [4]:
# ! pip3 install -U transformers

In [5]:
# ! pip3 install evaluate

## Notice! 
-> If you run into problems with the `datasets` or `transformers` libraries, try the fixes below.
---
Problem 1. Error "No module named datasets" <br>
Solution 1. `!pip3 install datasets` <br>
---
Problem 2. huggingface_hub Error <br>
Solution 2. ! pip3 install -U transformers <br>

--- 
These errors occur because the GPU session was closed, which wipes everything that had been installed.
So if you restart the GPU session, you must reinstall every library and file that you installed before the session closed.
This applies only to a GPU session restart, not to a kernel restart.
---
You can also reinstall everything at once:
1. `pip3 install -r requirements.txt`

In [6]:
# !pip freeze >> requirements.txt

In [7]:
# ! pip3 install accelerate

### before Starting make a simple function 
#### this function use for working clock to parameter value

## CUDA load

In [8]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare last expression so the notebook displays the selected device

device(type='cuda')

## Dataset load

In [9]:
ds = load_dataset("iamtarun/python_code_instructions_18k_alpaca",split="train")

In [10]:
dataset_name = "iamtarun/python_code_instructions_18k_alpaca"

In [11]:
# Hugging Face access token, read from the HF_TOKEN environment variable so
# the secret is never stored in the notebook. Export HF_TOKEN before starting
# the kernel. When it is unset this is None, and the `token=None` passed to
# `from_pretrained` is expected to fall back to locally cached credentials
# (e.g. from `huggingface-cli login`) -- confirm for your hub version.
# NOTE: the token that used to be hardcoded here has been exposed and should
# be revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN")

In [12]:
model_name = "google/gemma-2b-it"
tokenizer_name = "google/gemma-2b-it"

## Checking Datasets Type and features

In [13]:
ds

Dataset({
    features: ['instruction', 'input', 'output', 'prompt'],
    num_rows: 18612
})

In [14]:
ds.shape

(18612, 4)

### Using the `datasets` library's `train_test_split` method

In [15]:
ds.train_test_split(test_size=0.3)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 13028
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 5584
    })
})

## Using `sklearn.model_selection.train_test_split()`

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
ds_sklearn = load_dataset(dataset_name)

In [18]:
ds_sklearn

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 18612
    })
})

In [19]:
# Alternative 70/30 split done with scikit-learn and a fixed random_state.
# NOTE(review): `train_set` and `test_set` are reassigned further down from
# the `datasets` split, so this result is not what the training cells consume.
train_set, test_set = train_test_split(ds_sklearn["train"], test_size=.3, random_state=1832)

In [20]:
# train_set

## model & tokenizer load

In [21]:
# Load the causal-LM checkpoint named by `model_name`, authenticating with
# `access_token`, using device_map="auto" and float32 weights.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    device_map="auto",
    torch_dtype=torch.float32,
)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# Load the tokenizer named by `tokenizer_name`, authenticating with `access_token`.
# Truncation, padding and max_length are per-call options of `tokenizer(...)`,
# not loading options, so they are supplied where the text is actually
# tokenized rather than being passed to `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=access_token)
# Use the end-of-sequence token id as the padding id.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [23]:
! nvidia-smi

Mon Apr 22 07:10:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  CUDA GPU                       On  | 00000000:E3:00.0 Off |                    0 |
| N/A   44C    P0              73W / 300W |  10024MiB / 81074MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## processing Function

In [24]:
# 70/30 train/test split. The fixed seed makes the split identical on every
# Restart & Run All; the value matches the random_state used for the
# scikit-learn split earlier in the notebook.
ds = ds.train_test_split(test_size=.3, seed=1832)

In [25]:
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 13028
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 5584
    })
})

In [26]:
train_set = ds["train"]

In [27]:
train_set

Dataset({
    features: ['instruction', 'input', 'output', 'prompt'],
    num_rows: 13028
})

In [28]:
test_set = ds["test"]
test_set

Dataset({
    features: ['instruction', 'input', 'output', 'prompt'],
    num_rows: 5584
})

In [29]:
def process(row, max_length=100):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    The tokenizer's positional parameters are ``text``, ``text_pair``,
    ``text_target`` and ``text_pair_target``. Passing the four dataset columns
    positionally therefore tokenized ``output``/``prompt`` as a separate target
    sequence, producing ``labels`` that are not aligned with ``input_ids``.
    A causal LM needs labels aligned with its inputs, so the text is tokenized
    once and the labels are derived from the resulting ``input_ids``.

    NOTE(review): assumes the ``prompt`` column already holds the full
    Alpaca-formatted text (instruction + input + output) -- confirm against
    the dataset card.

    Args:
        row: a batch from ``Dataset.map(..., batched=True)``; ``row["prompt"]``
            is expected to be a list of strings.
        max_length: every sequence is truncated/padded to exactly this many
            tokens, so all examples have the same length when collated.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` as lists
        (no ``return_tensors``: ``map`` stores plain lists anyway).
    """
    encoded = tokenizer(
        row["prompt"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # Next-token targets are the inputs themselves. Padded positions
    # (attention_mask == 0) get -100, the index the loss ignores. The mask is
    # used rather than the pad id because the pad id is shared with EOS here.
    labels = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
    }

In [30]:
import multiprocessing

In [31]:
# Tokenize both splits in batches, one worker per CPU core, recomputing
# instead of loading a previously cached result.
ds = ds.map(
    process,
    batched=True,
    load_from_cache_file=False,
    num_proc=multiprocessing.cpu_count(),
)
train_dataset, test_dataset = ds["train"], ds["test"]

Map (num_proc=4):   0%|          | 0/13028 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5584 [00:00<?, ? examples/s]

In [32]:
train_dataset

Dataset({
    features: ['instruction', 'input', 'output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 13028
})

In [33]:
test_dataset

Dataset({
    features: ['instruction', 'input', 'output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5584
})

## model & Trainer arguments

In [34]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [35]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./eval_results_4",
    logging_dir="./eval_logs_4",
    # Schedule and optimisation.
    num_train_epochs=3,
    warmup_steps=300,
    weight_decay=0.01,
    optim="adamw_torch",
    # Batching and data loading.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
    dataloader_num_workers=4,
    # Step-based logging, evaluation and checkpointing.
    logging_steps=500,
    evaluation_strategy="steps",
    save_strategy="steps",
    do_eval=True,
    load_best_model_at_end=True,
)

In [36]:
import numpy as np
import evaluate

In [37]:
# ! pip3 install evaluate

In [38]:
acc_metrix = evaluate.load("accuracy")

In [39]:
def compute_matrix(eval_pred, metric=None):
    """Token-level next-token accuracy for ``Trainer(compute_metrics=...)``.

    Fixes the original ``NameError`` (``predict`` was assigned but
    ``predictions`` was passed on) and makes the inputs suitable for a flat
    accuracy metric: positions are shifted so the prediction at step t is
    compared with the token at step t+1, positions labelled -100 are dropped,
    and the remainder is flattened.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be raw
            logits with one more dimension than ``labels`` (argmax is taken
            over the last axis) or already-selected token ids shaped like
            ``labels``. A tuple of model outputs is accepted; its first
            element is used.
        metric: object exposing ``compute(predictions=..., references=...)``.
            Defaults to the notebook-level ``acc_metrix``.

    Returns:
        The result of ``metric.compute`` on the kept positions, or
        ``{"accuracy": nan}`` when no position is left to score.
    """
    logit, labels = eval_pred
    if isinstance(logit, tuple):
        logit = logit[0]
    logit = np.asarray(logit)
    labels = np.asarray(labels)

    # Raw logits carry an extra vocabulary axis; token ids do not.
    predict = np.argmax(logit, axis=-1) if logit.ndim == labels.ndim + 1 else logit

    if labels.ndim >= 2:
        # Align "prediction at position t" with "label at position t + 1".
        predict = predict[..., :-1]
        labels = labels[..., 1:]

    keep = labels != -100  # -100 marks positions excluded from the loss
    if not keep.any():
        return {"accuracy": float("nan")}

    if metric is None:
        metric = acc_metrix
    return metric.compute(predictions=predict[keep], references=labels[keep])

In [40]:
# Build the Trainer from the model, arguments, datasets and metric function
# defined above. The collator created earlier in the notebook is now passed
# explicitly instead of being left unused.
# NOTE(review): evaluation hands the model's full outputs to `compute_metrics`;
# if that exhausts memory, consider `preprocess_logits_for_metrics`.
model_trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_matrix,
    # Early stopping is currently disabled; to enable it pass
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)].
)

In [None]:
model_trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mrlfdnjs9839[0m ([33mlineworld[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


wandb: Network error (ReadTimeout), entering retry loop.


In [None]:
torch.cuda.is_available()