In [None]:
!pip install huggingface_hub transformers accelerate torch datasets

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset,Dataset

In [None]:
model_name = "gpt2-medium"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
from datasets import load_dataset, concatenate_datasets

# Load datasets
dataset1 = load_dataset("faizalnf1800/Science-OA-STM-Corpus")
dataset2 = load_dataset("faizalnf1800/SciFi_Stories_Text_Corpus")
dataset3 = load_dataset("faizalnf1800/scifi_webnovel_preprocessed")

# Function to split text into chunks of 128 words
def split_text_into_chunks(text, chunk_size=128):
    words = text.split()
    chunks = [words[i:i+chunk_size] for i in range(0, len(words), chunk_size)]
    return [" ".join(chunk) for chunk in chunks]

# Initialize an empty list to store text chunks
text_chunks_list = []

# Iterate through the dataset and split text into chunks
for i in range(len(dataset1["train"])):
    text = dataset1["train"]["Title"][i] + ":\n\n " + dataset1["train"]["Text"][i]
    text_chunks = split_text_into_chunks(text)
    text_chunks_list.extend(text_chunks)

# Create a dictionary for the dataset
dataset_dict = {"text": text_chunks_list}

# Create a Hugging Face dataset
dataset1 = Dataset.from_dict(dataset_dict)

# # Create a new dataset with merged title and text columns
merged_dataset = concatenate_datasets([dataset1, dataset2["train"], dataset3["train"]])

# Preprocessing
max_seq_length = 128
num_proc = 4

def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples['text'],
        padding=True,
        truncation=True,
        max_length=max_seq_length
    )
    return tokenized_inputs

tokenized_datasets = merged_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=num_proc,
    remove_columns=["text"]
)

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // max_seq_length) * max_seq_length
    result = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_examples.items()
    }
    return result

tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=num_proc,
)


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.87M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading readme:   0%|          | 0.00/281 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/95.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/207993 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/156k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/312 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/214157 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/214157 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

In [5]:
output = "QuantumQuill-300m-GPT2-Medium"

In [6]:
# Define data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output,
    overwrite_output_dir=True,
    num_train_epochs=4,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    save_steps=180,
    save_total_limit=5,
    learning_rate=5e-5,
    warmup_ratio=0.3,
    max_steps=180,
    logging_steps=10,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

# Fine-tune the model
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,4.0798
20,3.9523
30,4.0728
40,3.959
50,3.873
60,4.0526
70,3.8779
80,3.9803
90,3.984
100,3.8827


TrainOutput(global_step=180, training_loss=3.938369984096951, metrics={'train_runtime': 174.9585, 'train_samples_per_second': 8.231, 'train_steps_per_second': 1.029, 'total_flos': 334332250030080.0, 'train_loss': 3.938369984096951, 'epoch': 0.01})

In [7]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
trainer.push_to_hub(output)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

events.out.tfevents.1702392679.e63fb228842c.298.0:   0%|          | 0.00/7.68k [00:00<?, ?B/s]

'https://huggingface.co/faizalnf1800/QuantumQuill-300m-GPT2-Medium/tree/main/'

In [9]:
tokenizer.push_to_hub(output)

CommitInfo(commit_url='https://huggingface.co/faizalnf1800/QuantumQuill-300m-GPT2-Medium/commit/e9109485e09bedc266ab1c8ce44acfbe9e227557', commit_message='Upload tokenizer', commit_description='', oid='e9109485e09bedc266ab1c8ce44acfbe9e227557', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
model.save_pretrained(output)
tokenizer.save_pretrained(output)

('QuantumQuill-300m-GPT2-Medium/tokenizer_config.json',
 'QuantumQuill-300m-GPT2-Medium/special_tokens_map.json',
 'QuantumQuill-300m-GPT2-Medium/vocab.json',
 'QuantumQuill-300m-GPT2-Medium/merges.txt',
 'QuantumQuill-300m-GPT2-Medium/added_tokens.json',
 'QuantumQuill-300m-GPT2-Medium/tokenizer.json')

In [None]:
from huggingface_hub import create_repo

create_repo(f"faizalnf1800/{output}")

In [11]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path=f"/content/{output}",
    repo_id=f"faizalnf1800/{output}",
    repo_type="model",
)

optimizer.pt:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

'https://huggingface.co/faizalnf1800/QuantumQuill-300m-GPT2-Medium/tree/main/'

In [None]:
from google.colab import runtime
runtime.unassign()