## dataset train_test_split

In [1]:
import multiprocessing
import os

import numpy as np
import pandas as pd
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# ! pip install datasets

In [3]:
# ! pip3 freeze | grep transformers

In [4]:
# ! pip3 install -U transformers

## Notice!
If you run into problems with the `datasets` or `transformers` libraries, try the fixes below.

---
Problem 1. Error: "No module named 'datasets'" <br>
Solution 1. `!pip3 install datasets`<br>

---
Problem 2. `huggingface_hub` error <br>
Solution 2. `!pip3 install -U transformers` <br>

--- 
These errors occur because the GPU session was closed, which wipes everything that had been installed.
So if you restart the GPU session, you must reinstall every library or file you installed before the session was closed.
This applies only to a GPU session restart, not to a kernel restart.

---
You can also restore all packages at once:
1. `pip3 install -r requirements.txt`

In [5]:
# !pip freeze >> requirements.txt

In [6]:
# ! pip3 install accelerate

### Before starting, define a simple helper function
#### This function is used to clock (time) the work for a given parameter value

## Dataset load

In [7]:
# Load the single "train" split of the Alpaca-style Python instruction corpus
# as a flat Dataset (not a DatasetDict).
ds = load_dataset(
    path="iamtarun/python_code_instructions_18k_alpaca",
    split="train",
)

In [8]:
# Hub id of the Python instruction dataset; reused by later load_dataset calls.
dataset_name = "iamtarun/python_code_instructions_18k_alpaca"

In [9]:
# Hugging Face access token for the gated Gemma checkpoint. It is read from the
# HF_TOKEN environment variable instead of being stored in the notebook; when
# the variable is unset this is None and the Hub client falls back to the
# credentials saved by `huggingface-cli login`.
# SECURITY: a literal token used to be committed in this cell. Treat it as
# leaked and revoke it at https://huggingface.co/settings/tokens.
access_token = os.environ.get("HF_TOKEN")

In [10]:
# The model weights and the tokenizer are both pulled from the same Hub
# repository (instruction-tuned Gemma 2B), so one literal feeds both names.
model_name = tokenizer_name = "google/gemma-2b-it"

## Checking Datasets Type and features

In [11]:
ds

Dataset({
    features: ['instruction', 'input', 'output', 'prompt'],
    num_rows: 18612
})

In [12]:
ds.shape

(18612, 4)

### using dataset's train_test_split function

In [13]:
# Preview a 70/30 split using the Dataset's own method. The result is only
# displayed, not stored. A fixed seed makes the shuffle reproducible across
# kernel restarts (same value as the scikit-learn split further down).
ds.train_test_split(test_size=0.3, seed=1832)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 13028
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 5584
    })
})

## using sklearn.model_selection's train_test_split() 

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Without `split=`, load_dataset returns a DatasetDict keyed by split name
# (only "train" here), so the rows are reached via ds_sklearn["train"].
ds_sklearn = load_dataset(dataset_name)

In [16]:
ds_sklearn

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 18612
    })
})

In [17]:
# scikit-learn splits by indexing its input with integer arrays; indexing a
# Hugging Face Dataset that way returns a plain dict of column lists rather
# than a Dataset. Split the row indices instead and materialise each side with
# `.select()`, so both results remain Dataset objects. The same test_size and
# random_state are used, so the same rows are chosen.
full_train = ds_sklearn["train"]
train_idx, test_idx = train_test_split(
    np.arange(len(full_train)),
    test_size=.3,
    random_state=1832,
)
train_set, test_set = full_train.select(train_idx), full_train.select(test_idx)

In [20]:
# train_set

## model & tokenizer load

In [23]:
# Load the causal-LM weights in bfloat16 and let `device_map="auto"` decide
# where the weights are placed; the access token is forwarded to the Hub.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=access_token,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
# Load the tokenizer that matches the model checkpoint.
# NOTE(review): truncation / padding / max_length are per-call tokenizer
# options; passed at load time they are presumably just stored as init kwargs
# and do not set defaults for later calls. Confirm, and drop them if so.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name,
    max_length=100,
    padding=True,
    truncation=True,
    token=access_token,
)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [25]:
# IPython shell escape: print the GPU status table (memory in use, utilisation)
# at this point in the notebook, i.e. after the model has been loaded.
! nvidia-smi

Sun Apr 21 09:42:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  CUDA GPU                       On  | 00000000:E3:00.0 Off |                    0 |
| N/A   45C    P0              73W / 300W |  10168MiB / 81074MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## processing Function

In [30]:
# Build the 70/30 train/test DatasetDict used by the rest of the notebook.
# The split is taken from a fresh (cached) load of the source data rather than
# from `ds` itself: once `ds` has been rebound to a DatasetDict it no longer
# has `train_test_split`, so `ds = ds.train_test_split(...)` fails when the
# cell is re-run. The fixed seed keeps the split identical across runs.
ds = load_dataset(dataset_name, split="train").train_test_split(test_size=.3, seed=1832)

In [31]:
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 13028
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 5584
    })
})

In [32]:
# Training portion of the split (13,028 rows in the recorded run). This rebinds
# `train_set`, replacing the value produced by the scikit-learn experiment.
train_set = ds["train"]

In [33]:
train_set

Dataset({
    features: ['instruction', 'input', 'output', 'prompt'],
    num_rows: 13028
})

In [34]:
# Held-out portion of the split (5,584 rows in the recorded run); the bare
# name on the last line makes the notebook display its summary.
test_set = ds["test"]
test_set

Dataset({
    features: ['instruction', 'input', 'output', 'prompt'],
    num_rows: 5584
})

In [40]:
def process(row):
    """Tokenize the ``prompt`` text and attach aligned labels for causal-LM training.

    The tokenizer's first four positional parameters are ``text``,
    ``text_pair``, ``text_target`` and ``text_pair_target``. Passing the four
    dataset columns positionally therefore encodes instruction/input as a
    sentence pair and output/prompt as a separately padded target, producing
    ``labels`` that do not line up with ``input_ids``. A causal LM needs
    ``labels`` aligned token-for-token with ``input_ids``, so only ``prompt``
    is encoded and the labels are derived from that encoding.

    NOTE(review): this assumes the ``prompt`` column already holds the full
    Alpaca-formatted text (instruction, input and output); confirm against
    the dataset card. ``max_length=100`` is kept from the original call and
    will truncate most prompts.

    Args:
        row: One example (``prompt`` is a ``str``) or, with ``batched=True``,
            a batch of examples (``prompt`` is a list of ``str``). Only the
            ``"prompt"`` field is read.

    Returns:
        A mapping with ``input_ids``, ``attention_mask`` and ``labels``. Every
        sequence is truncated or padded to exactly 100 tokens, and padded
        positions in ``labels`` are set to ``-100`` so the loss ignores them.
    """
    prompts = row["prompt"]
    is_single = isinstance(prompts, str)
    encoded = tokenizer(
        [prompts] if is_single else list(prompts),
        truncation=True,
        # Fixed-length padding keeps every stored row the same size, whichever
        # map() batch it came from.
        padding="max_length",
        max_length=100,
    )
    # Labels mirror input_ids; -100 marks padding so it does not contribute to the loss.
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    if is_single:
        # Undo the temporary batch dimension added for a single example.
        return {name: values[0] for name, values in encoded.items()}
    return encoded

In [41]:
import multiprocessing

In [42]:
# Tokenize both splits in batches with one worker per CPU core. The on-disk
# cache is bypassed, so the mapping is recomputed on every run.
ds = ds.map(
    process,
    batched=True,
    load_from_cache_file=False,
    num_proc=multiprocessing.cpu_count(),
)
train_dataset, test_dataset = ds["train"], ds["test"]

Map (num_proc=4):   0%|          | 0/13028 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5584 [00:00<?, ? examples/s]

In [43]:
train_dataset

Dataset({
    features: ['instruction', 'input', 'output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 13028
})