In [5]:
!pip install --upgrade --no-cache-dir "transformers[torch]>=4.40.0" datasets accelerate





In [1]:
import transformers
print(transformers.__version__)


4.53.2


In [2]:
from datasets import Dataset
from transformers import GPT2Tokenizer

# Load text
with open("Sample_Text.txt", "r") as f:
    lines = [line.strip() for line in f if line.strip()]
dataset = Dataset.from_dict({"text": lines})

# Tokenize
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [3]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_ds = split_dataset["train"]
val_ds = split_dataset["test"]


In [6]:
from transformers import GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Data collator for auto-regressive language modeling (no masking)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training arguments (this is the exact place)
training_args = TrainingArguments(
    output_dir="./gpt2-custom",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_steps=50,
    save_steps=500,
    logging_dir="./logs",
    do_eval=True,  # replaces evaluation_strategy
)


# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [10]:
import os
os.environ["WANDB_DISABLED"] = "true"

# Fully unhook WandB from Trainer integrations
from transformers.integrations import WandbCallback
trainer.remove_callback(WandbCallback)


In [11]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=15, training_loss=2.4348159790039063, metrics={'train_runtime': 14.7062, 'train_samples_per_second': 3.672, 'train_steps_per_second': 1.02, 'total_flos': 3527442432000.0, 'train_loss': 2.4348159790039063, 'epoch': 3.0})

In [14]:
from transformers import pipeline

# Load generator pipeline with your fine-tuned model
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a custom prompt
output = generator("The future of AI is", max_new_tokens=50, num_return_sequences=1)

# Show result
print(output[0]["generated_text"])


Device set to use cuda:0


The future of AI is not yet written, but will come.

Will it happen?


I'm not sure, but it's likely.

The data is what we see. We can write it.

The future is not yet written,


In [16]:
model.save_pretrained("./gpt2-custom")
tokenizer.save_pretrained("./gpt2-custom")
trainer.save_model("./gpt2-custom")