In [None]:
!pip install huggingface_hub transformers accelerate torch datasets

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

In [None]:
# Checkpoint name shared by the tokenizer and the model weights.
model_name = "gpt2-medium"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

In [5]:
# Preprocessing settings shared by the tokenization and grouping steps of this cell.
max_seq_length = 128  # truncation limit and size of each grouped chunk, in tokens
num_proc = 4          # worker processes passed to the map() calls

text_task = list()

dataset = load_dataset("faizalnf1800/bard-scifi-story-instruct")
# Disabled alternative: use the "test" split as the training split.
# dataset['train'] = dataset['test']
# del dataset["test"]

def tokenize_function(examples):
    """Tokenize a batch of prompt/completion pairs.

    Args:
        examples: Batch mapping with "prompt" and "completion" keys, each a
            list of strings where row i of one belongs to row i of the other.
            The mapping is not modified.

    Returns:
        The result of calling the module-level ``tokenizer`` on the kept
        prompts and completions, truncated to ``max_seq_length`` tokens.
        A pair is dropped when its prompt or its completion is missing,
        empty or whitespace-only, so the result may hold fewer rows than
        the input batch.
    """
    def _has_text(value):
        # True for a non-empty string that is not purely whitespace.
        return bool(value) and not value.isspace()

    # Filter both columns together. Filtering them independently (as before)
    # leaves the two lists with different lengths, or silently pairs a prompt
    # with another row's completion, whenever only one side of a row is blank.
    kept_pairs = [
        (prompt, completion)
        for prompt, completion in zip(examples["prompt"], examples["completion"])
        if _has_text(prompt) and _has_text(completion)
    ]
    prompts = [prompt for prompt, _ in kept_pairs]
    completions = [completion for _, completion in kept_pairs]

    # No padding here: group_texts concatenates the rows and re-splits them
    # into fixed-size chunks, so padding at this stage would only place pad
    # tokens inside those chunks.
    return tokenizer(
        prompts,
        completions,
        truncation=True,
        max_length=max_seq_length,
    )

# Run tokenize_function over the dataset in batches, dropping the raw text columns.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    remove_columns=["prompt", "completion"],
    num_proc=num_proc,
    batched=True,
)

def group_texts(examples):
    """Concatenate a batch of token lists and re-split it into fixed-size chunks.

    Args:
        examples: Batch mapping from column name to a list of per-row lists
            (for example token ids). All columns are expected to flatten to
            the same total length; the first column determines it.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        exactly ``max_seq_length`` items. The trailing remainder that does
        not fill a whole chunk is dropped. An empty batch yields ``{}``.
    """
    from itertools import chain

    # An empty mapping has no first column to measure.
    if not examples:
        return {}

    # chain.from_iterable flattens in linear time; sum(rows, []) copies the
    # growing accumulator once per row.
    concatenated = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }

    # Round the length of the first column down to a whole number of chunks.
    total_length = len(next(iter(concatenated.values())))
    total_length = (total_length // max_seq_length) * max_seq_length

    return {
        key: [
            tokens[start : start + max_seq_length]
            for start in range(0, total_length, max_seq_length)
        ]
        for key, tokens in concatenated.items()
    }

# Repack the tokenized rows into max_seq_length-sized chunks, batch by batch.
tokenized_datasets = tokenized_datasets.map(function=group_texts, batched=True, num_proc=num_proc)

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# Display the processed dataset object to inspect the result of the two map() passes.
tokenized_datasets

In [None]:
# Directory name for checkpoints and the final model; later cells reuse it.
output = "clm_gpt2_medium_scifi_story_instruct"

# Collator for language modeling with masked-LM mode switched off.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output, checkpointing and logging
    output_dir=output,
    overwrite_output_dir=True,
    save_steps=180,
    save_total_limit=5,
    logging_steps=10,
    # Run length
    # NOTE(review): num_train_epochs and max_steps are both set; per the
    # TrainingArguments docs a positive max_steps takes precedence - confirm
    # which one is intended.
    num_train_epochs=4,
    max_steps=180,
    # Batch size
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # Learning-rate schedule
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.3,
)

# Wire model, configuration, data and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    data_collator=data_collator,
)

# Run the fine-tuning loop.
trainer.train()

In [7]:
# Interactive Hugging Face Hub login via the CLI; it prompts for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# The first positional parameter of Trainer.push_to_hub is the commit message,
# not a repository name, so passing `output` only mislabelled the commit.
# Pass an explicit commit message instead.
trainer.push_to_hub(commit_message="End of training")

In [None]:
# Save the trained model through the Trainer. No path is passed here, so the
# destination is whatever the Trainer defaults to (presumably output_dir - confirm).
trainer.save_model()

In [None]:
# Write the tokenizer files into the `output` directory.
tokenizer.save_pretrained(output)

In [None]:
from huggingface_hub import create_repo

# exist_ok=True keeps this cell re-runnable: without it create_repo raises when
# the repository already exists (e.g. from an earlier run or an earlier push).
create_repo(f"faizalnf1800/{output}", exist_ok=True)

In [None]:
from huggingface_hub import HfApi

api = HfApi()

# folder_path is a local directory. The tokenizer is saved under `output` and
# the Trainer uses `output` as its output_dir, so upload that directory; the
# previous value f"faizalnf1800/{output}" is the Hub repo id, not a local path.
api.upload_folder(
    folder_path=output,
    repo_id=f"faizalnf1800/{output}",
    repo_type="model",
)