In [1]:
from tqdm import tqdm

In [6]:
progress_bar = tqdm(range(1000))


3000it [00:37, 80.67it/s]                                                                                         | 0/1000 [00:00<?, ?it/s][A


In [10]:
for i in range(1000):
    progress_bar.update({'step':1})

TypeError: '<' not supported between instances of 'dict' and 'int'

999

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer
import os

import warnings
from typing import Dict, List, Optional, Union

import numpy as np

try:
    from datasets import (
        Dataset,
        DatasetDict,
        Features,
        Sequence,
        Value,
        concatenate_datasets,
        load_dataset,
    )
except ImportError:
    warnings.warn("Datasets not installed, you'll be unable to use these dataset processing functions.")

# Import SFT processing functions for backward compatibility


def clm_process(
    raw_dataset: "Dataset",
    tokenizer,
    text_column_name: str,
    dataset_processing_num_proc_per_process: int,
    dataset_overwrite_cache: bool,
    sequence_length: int,
):
    """
    Concatenate all texts from raw_dataset and generate chunks of `sequence_length + 1`,
    where chunks overlap by a single token.

    Args:
        raw_dataset: Dataset containing raw text
        tokenizer: HuggingFace tokenizer
        text_column_name: Name of the column containing text data
        dataset_processing_num_proc_per_process: Number of processes for parallelization
        dataset_overwrite_cache: Whether to overwrite the cache
        sequence_length: Maximum sequence length

    Returns:
        Processed dataset with tokenized sequences
    """
    # Adapted from https://github.com/huggingface/transformers/blob/47e1676255e5dd86b9541f734cd4f4bdcbb50f4a/examples/pytorch/language-modeling/run_clm.py#L391-L439

    def group_texts(examples: Dict[str, List[np.ndarray]]) -> Dict[str, List[np.ndarray]]:
        # Concatenate all texts.
        concatenated_examples = {k: np.concatenate(v) for k, v in examples.items()}
        total_length = len(concatenated_examples[next(iter(examples.keys()))])
        # WARNING: We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= sequence_length + 1:
            total_length = ((total_length - 1) // sequence_length) * sequence_length + 1
        # Split by chunks of sequence_length.
        result = {
            k: [
                t[i : i + sequence_length + 1] for i in range(0, total_length - (sequence_length + 1), sequence_length)
            ]
            for k, t in concatenated_examples.items()
        }
        return result

    def _tokenize_and_group_texts(texts: List[str]) -> Dict[str, List[np.ndarray]]:
        # add_eos_token
        texts = [i+tokenizer.eos_token for i in texts]
        # print(texts[0])
        tokenized_batch = tokenizer.batch_encode_plus(texts, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) # add_special_tokens = False
        tokenized_batch = {k: [np.array(tokenized_texts) for tokenized_texts in v] for k, v in tokenized_batch.items()}
        return group_texts(tokenized_batch)

    train_dataset = raw_dataset.map(
        _tokenize_and_group_texts,
        input_columns=text_column_name,
        remove_columns=raw_dataset.column_names,
        features=Features({"input_ids": Sequence(feature=Value(dtype="int64"), length=sequence_length + 1)}),
        batched=True,
        num_proc=dataset_processing_num_proc_per_process,
        load_from_cache_file=not dataset_overwrite_cache,
        desc=f"Grouping texts in chunks of {sequence_length+1}",
    )
    return train_dataset



In [4]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

text_column_name = 'text'

num_proc = 1

from datasets import load_dataset
# get Croatian data
ds = load_dataset("HuggingFaceFW/fineweb-2", name="kor_Hang", streaming=True, split='test')

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

In [5]:
from torch.utils.data import DataLoader

In [6]:
dataloader = DataLoader(ds, batch_size = 1)

In [8]:
for i in tqdm(dataloader):
    i

0it [00:07, ?it/s]


KeyboardInterrupt: 

In [19]:
tokenizer.save_pretrained('./data/tiny_llama')

('./data/tiny_llama\\tokenizer_config.json',
 './data/tiny_llama\\special_tokens_map.json',
 './data/tiny_llama\\tokenizer.model',
 './data/tiny_llama\\added_tokens.json',
 './data/tiny_llama\\tokenizer.json')

In [14]:
data = []
for i in ds:
    if len(data)==1000:
        break
    data.append(i)

In [15]:
from datasets import Dataset

In [16]:
ds = Dataset.from_list(data)

In [17]:
# For pretraining, use existing CLM processing
train_dataset = clm_process(
    raw_dataset = ds,
    tokenizer = tokenizer,
    text_column_name = 'text',
    dataset_processing_num_proc_per_process = num_proc,
    dataset_overwrite_cache = True,
    sequence_length = 1024,
)

# # For pretraining, use existing CLM processing
# test_dataset = clm_process(
#     raw_dataset = ds['test'],
#     tokenizer = tokenizer,
#     text_column_name = 'text',
#     dataset_processing_num_proc_per_process = num_proc,
#     dataset_overwrite_cache = True,
#     sequence_length = 1024('./data/tokenized/fineweb-2-ko/train')('./data/tokenized/fineweb-2-ko/train'),
# )

# # train-test split for english 
# en_train, en_test = ds_en['train'].train_test_split(test_size=0.001)



# # For pretraining, use existing CLM processing
# en_train_dataset = clm_process(
#     raw_dataset = ds_en['train'],
#     tokenizer = tokenizer,
#     text_column_name = 'text',
#     dataset_processing_num_proc_per_process = num_proc,
#     dataset_overwrite_cache = True,
#     sequence_length = 1024,
# )

Grouping texts in chunks of 1025:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [18]:
train_dataset.save_to_disk('./data/tokenized/fineweb-2-ko/test') #('./data/tokenized/fineweb-2-ko/train')

Saving the dataset (0/1 shards):   0%|          | 0/1151 [00:00<?, ? examples/s]

In [20]:
from datasets import load_from_disk

In [None]:
load_from_disk(

In [None]:
train_dataset.save_to_disk('