In [1]:
!pip install huggingface_hub transformers accelerate torch datasets

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill,

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

In [3]:
model_name = "faizalnf1800/QuantumQuill-300m-GPT2-Medium"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

In [8]:
dataset = load_dataset("pavlichenko/WizardLM_evol_instruct_70k_train_val_split")
# dataset['train'] = dataset['test']
# del dataset["test"]

max_seq_length = 128
num_proc = 4

# def tokenize_function(examples):
#     examples["instruction"] = [line for line in examples["instruction"] if len(line) > 0 and not line.isspace()]
#     examples["output"] = [line for line in examples["output"] if len(line) > 0 and not line.isspace()]

#     tokenized_inputs = tokenizer(
#         examples['instruction'],
#         examples['output'],
#         padding=True,
#         truncation=True,
#         max_length=max_seq_length,
#         return_tensors='pt'
#     )
#     return tokenized_inputs

def tokenize_function(examples):
    examples["instruction"] = [line for line in examples["instruction"] if len(line) > 0 and not line.isspace()]
    examples["output"] = [line for line in examples["output"] if len(line) > 0 and not line.isspace()]

    # Ensure both input and output lists have the same length
    min_length = min(len(examples['instruction']), len(examples['output']))

    tokenized_inputs = tokenizer(
        examples['instruction'][:min_length],
        examples['output'][:min_length],
        padding=True,
        truncation=True,
        max_length=max_seq_length,
        return_tensors='pt'
    )
    return tokenized_inputs


tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=num_proc,
    remove_columns=["instruction", "output"]
)

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}

    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop,
    # you can customize this part to your needs.
    total_length = (total_length // max_seq_length) * max_seq_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_examples.items()
    }
    return result

tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=num_proc,
)


Map (num_proc=4):   0%|          | 0/65000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/64999 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

In [9]:
output = "QuantumQuill-300m-GPT2-Medium-Instruct"

In [10]:
# Define data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output,
    overwrite_output_dir=True,
    num_train_epochs=4,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    save_steps=180,
    save_total_limit=5,
    learning_rate=3e-5,
    warmup_ratio=0.3,
    max_steps=180,
    logging_steps=10,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
)

# Fine-tune the model
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.7106
20,2.7062
30,2.5164
40,2.454
50,2.3924
60,2.2902
70,2.2887
80,2.2431
90,2.2852
100,2.2013


TrainOutput(global_step=180, training_loss=2.305574427710639, metrics={'train_runtime': 172.1309, 'train_samples_per_second': 8.366, 'train_steps_per_second': 1.046, 'total_flos': 334332250030080.0, 'train_loss': 2.305574427710639, 'epoch': 0.02})

In [11]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [12]:
trainer.push_to_hub(output)
tokenizer.push_to_hub(output)

events.out.tfevents.1702398119.1fe2e85fa015.239.0:   0%|          | 0.00/7.77k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/faizalnf1800/QuantumQuill-300m-GPT2-Medium-Instruct/commit/9e607eb1bc599c390671f439a6853c67f1264bcd', commit_message='Upload tokenizer', commit_description='', oid='9e607eb1bc599c390671f439a6853c67f1264bcd', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
model.save_pretrained(output)
tokenizer.save_pretrained(output)

('QuantumQuill-300m-GPT2-Medium-Instruct/tokenizer_config.json',
 'QuantumQuill-300m-GPT2-Medium-Instruct/special_tokens_map.json',
 'QuantumQuill-300m-GPT2-Medium-Instruct/vocab.json',
 'QuantumQuill-300m-GPT2-Medium-Instruct/merges.txt',
 'QuantumQuill-300m-GPT2-Medium-Instruct/added_tokens.json',
 'QuantumQuill-300m-GPT2-Medium-Instruct/tokenizer.json')

In [None]:
from huggingface_hub import create_repo
create_repo(f"faizalnf1800/{output}")

In [None]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path=f"/content/{output}",
    repo_id=f"faizalnf1800/{output}",
    repo_type="model",
)

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

optimizer.pt:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

'https://huggingface.co/faizalnf1800/QuantumQuill-300m-GPT2-Medium-Instruct/tree/main/'

In [None]:
from google.colab import runtime
runtime.unassign()