In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch

# --- Step 1: Define your tiny training data ---
texts = [
    "Aruniel is a city floating on clouds, known for its silver rivers.",
    "In Aruniel, every night the towers glow with soft blue light.",
    "Travelers say Aruniel's air smells like citrus and stardust."
]
dataset = Dataset.from_dict({"text": texts})

# --- Step 2: Tokenize ---
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT2 has no pad_token by default

def tokenize_fn(examples):
    # For causal LM, label is the same as input_ids
    encodings = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    encodings["labels"] = encodings["input_ids"].copy()
    return encodings

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# --- Step 3: Load model ---
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# --- Workaround for MPS RuntimeError ---
device = torch.device("mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu"))
model.to(device)

# The default Trainer will use the device set by `torch.device` if CUDA is available.
# On macOS with MPS, we need to always make sure tensors for generation and testing are also on the right device.

# --- Step 4: Train ---
args = TrainingArguments(
    output_dir="./aruniel-model",
    per_device_train_batch_size=2,
    num_train_epochs=30,
    logging_steps=1,
    save_steps=5,
    save_total_limit=1,
    learning_rate=5e-5,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

# --- Step 5: Quick test generation ---
prompt = "Aruniel is"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.inference_mode():
    outputs = model.generate(**inputs, max_length=30, num_return_sequences=1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Map: 100%|██████████| 3/3 [00:00<00:00, 940.43 examples/s]


Step,Training Loss
1,5.2007
2,5.0657
3,4.1856
4,3.7631
5,3.3837
6,3.205
7,2.8541
8,2.5342
9,2.3965
10,2.2347


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Aruniel is a city floating on clouds, known for its silver rivers. Its silver rivers form rivers. Its bright blue rivers, known for its


In [9]:
prompt = "Explain heat death.\n\nSure! Light death is"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.inference_mode():
    outputs = model.generate(**inputs, max_length=30, num_return_sequences=1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Explain heat death.

Sure! Light death is a sight, but it's not like it's bright. It's soft, it's


In [11]:
# --- HF Upload Demo ---
from huggingface_hub import HfApi, login
import os

# Login to HF (you'll need to get a token from https://huggingface.co/settings/tokens)
# login()  # Uncomment and run this first to authenticate

api = HfApi()

# 1. Upload the trained model
model_id = "rohans02/aruniel-model-demo"
print(f"Uploading model to {model_id}...")

# Create repo and push model
api.create_repo(repo_id=model_id, private=True, repo_type="model", exist_ok=True)
api.upload_folder(
    folder_path="./aruniel-model",
    repo_id=model_id,
    repo_type="model"
)
print("✅ Model uploaded successfully!")

# 2. Upload the dataset
dataset_id = "rohans02/aruniel-dataset-demo"
print(f"Uploading dataset to {dataset_id}...")

# Create repo and push dataset
api.create_repo(repo_id=dataset_id, private=True, repo_type="dataset", exist_ok=True)

# Upload the actual dataset (not the model files)
dataset.push_to_hub(dataset_id)
print("✅ Dataset uploaded successfully!")

print(f"\n🎉 Demo complete! Check your models at:")
print(f"Model: https://huggingface.co/{model_id}")
print(f"Dataset: https://huggingface.co/{dataset_id}")


Uploading model to rohans02/aruniel-model-demo...


Processing Files (5 / 5): 100%|██████████|  983MB /  983MB,  526MB/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
No files have been modified since last commit. Skipping to prevent empty commit.


✅ Model uploaded successfully!
Uploading dataset to rohans02/aruniel-dataset-demo...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 522.78ba/s]
Processing Files (1 / 1): 100%|██████████| 1.25kB / 1.25kB,  0.00B/s  
New Data Upload: 100%|██████████| 1.25kB / 1.25kB,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.47 shards/s]


✅ Dataset uploaded successfully!

🎉 Demo complete! Check your models at:
Model: https://huggingface.co/rohans02/aruniel-model-demo
Dataset: https://huggingface.co/rohans02/aruniel-dataset-demo


In [12]:
from datasets import load_dataset

# Download the dataset from HF Hub
dataset = load_dataset("rohans02/aruniel-dataset-demo", split="train")

# Print some example entries from the dataset
for i, example in enumerate(dataset.select(range(3))):
    print(f"Example {i}:\n{example}\n")

Generating train split: 100%|██████████| 3/3 [00:00<00:00, 572.94 examples/s]

Example 0:
{'text': 'Aruniel is a city floating on clouds, known for its silver rivers.'}

Example 1:
{'text': 'In Aruniel, every night the towers glow with soft blue light.'}

Example 2:
{'text': "Travelers say Aruniel's air smells like citrus and stardust."}




