In [1]:
!pip install -q transformers==4.41.2 datasets==2.20.0 torch==2.3.1 accelerate==0.31.0
!pip install -U transformers==4.44.2 accelerate==0.34.2
# !pip install -U "numpy<2" datasets==2.20.0
!pip install -U "numpy<2" torch==2.3.1 transformers==4.44.2 datasets==2.20.0 accelerate==0.34.2


Collecting transformers==4.44.2
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting accelerate==0.34.2
  Using cached accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Using cached accelerate-0.34.2-py3-none-any.whl (324 kB)
Installing collected packages: transformers, accelerate
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.31.0
    Uninstalling accelerate-0.31.0:
      Successfully uninstalled accelerate-0.31.0
Successfully installed accelerate-0.34.2 transformers-4.44.2


In [2]:



from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# 1️⃣ Load tokenizer first
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# 2️⃣ Load model
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))

# 3️⃣ Tiny dataset for testing (replace with yours)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
tokenized_ds = dataset.map(tokenize_function, batched=True)
tokenized_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])

# 4️⃣ Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 5️⃣ Training args
training_args = TrainingArguments(
    output_dir="./distilgpt2-finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,  # smaller batch
    num_train_epochs=1,
    save_strategy="no",
    logging_steps=10,
    report_to="none"
)

# 6️⃣ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=data_collator
)

# 7️⃣ Train
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

Step,Training Loss
10,4.0546
20,4.0637
30,4.0186
40,4.3859
50,2.4676
60,4.3541
70,4.3305
80,4.6234
90,2.4029
100,3.961


TrainOutput(global_step=367, training_loss=3.575228153197577, metrics={'train_runtime': 875.3145, 'train_samples_per_second': 0.419, 'train_steps_per_second': 0.419, 'total_flos': 11986988433408.0, 'train_loss': 3.575228153197577, 'epoch': 1.0})

In [5]:

# # 9️⃣ Evaluate
# print("\n📊 Evaluating...")
# eval_results = trainer.evaluate()
# perplexity = math.exp(eval_results["eval_loss"])
# print(f"Perplexity: {perplexity:.2f}")

# 🔟 Save locally
print("\n💾 Saving model...")
trainer.save_model("./distilgpt2-finetuned")
tokenizer.save_pretrained("./distilgpt2-finetuned")


💾 Saving model...


('./distilgpt2-finetuned/tokenizer_config.json',
 './distilgpt2-finetuned/special_tokens_map.json',
 './distilgpt2-finetuned/vocab.json',
 './distilgpt2-finetuned/merges.txt',
 './distilgpt2-finetuned/added_tokens.json')

In [7]:

# 11️⃣ Generate sample text
prompt = "In the near future, Egypt will eat"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=60,
    temperature=0.8,
    top_p=0.9,
    do_sample=True
)
print("\n📝 Generated text:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📝 Generated text:
 In the near future, Egypt will eat the land of the Pharaoh, a land that was previously ruled by the Egyptian Pharaohs, and the Sinai, as the latter was a land that was previously ruled by the Egyptian rulers. The land of the Pharaoh, Egypt, was originally a land of land of the Egyptian rulers, and was not a


In [None]:

# 12️⃣ Optional: Upload to Hugging Face Hub
upload = input("\n🌐 Do you want to upload this model to Hugging Face? (y/n): ").strip().lower()
if upload == "y":
    login()  # paste your token
    trainer.push_to_hub("your-username/gpt2-finetuned-demo")
    tokenizer.push_to_hub("your-username/gpt2-finetuned-demo")
    print("✅ Uploaded to Hugging Face Hub!")
else:
    print("❎ Skipping upload. Model saved locally.")