In [1]:
# Smoke test: prints a fixed string, presumably to confirm the kernel executes cells.
print("test")

test


Generate the training data and write it to `tiny.txt`

# Five-sentence toy corpus, one sentence per line.
text = """Hello world.
I love cats.
Cats are cute.
I love dogs.
Dogs are friendly.
"""
# Write through a context manager so the handle is flushed and closed before
# any later cell reads tiny.txt, and pin UTF-8 so the bytes on disk do not
# depend on the platform's default encoding.
with open("tiny.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.write(text)

Start training

In [2]:
import torch
from transformers import AutoTokenizer, GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


#### 1. Tokenizer (we'll reuse GPT-2's tokenizer for simplicity)

In [3]:
# Reuse the pretrained GPT-2 tokenizer instead of training a new one.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# The end-of-text token doubles as the padding token so batches can be
# padded; pad_id is that token's integer id.
pad_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

#### 2. Load dataset

In [4]:
def tokenize(batch):
    """Tokenize a batch of text rows, truncating each row to at most 32 tokens.

    `batch` is indexed by the "text" key; the tokenizer's output is returned
    unchanged.
    """
    return tokenizer(batch["text"], max_length=32, truncation=True)


# Read tiny.txt as the "train" split, then replace the raw "text" column with
# the columns produced by the tokenizer.
dataset = load_dataset(
    "text",
    data_files={"train": "tiny.txt"},
)["train"]
dataset = dataset.map(tokenize, remove_columns=["text"], batched=True)

In [5]:

# Defensive pass: keep only rows whose input_ids list is non-empty.
dataset = dataset.filter(
    lambda example: len(example["input_ids"]) > 0
)

In [6]:
# mlm=False selects the causal (next-token) objective rather than masked-LM.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [7]:
# Show which columns the first tokenized example carries.
print(dataset[0].keys())

dict_keys(['input_ids', 'attention_mask'])



#### 3. Model config — small version

In [8]:
# Seed PyTorch before the model is built. The weights are randomly initialised
# on this line, before any Trainer exists, so without a seed every fresh
# kernel starts from different weights and the logged losses differ per run.
torch.manual_seed(42)

# Deliberately tiny GPT-2: 2 layers, 2 heads, 64-dim embeddings and 32
# positions (the same 32 used as max_length when tokenizing).
config = GPT2Config(
    # len(tokenizer) also counts added tokens, which is the quantity the
    # reload cell later compares the embedding matrix against. For the stock
    # GPT-2 tokenizer it equals tokenizer.vocab_size.
    vocab_size=len(tokenizer),
    n_positions=32,
    n_ctx=32,  # NOTE(review): newer transformers releases may ignore n_ctx; confirm
    n_embd=64,
    n_layer=2,
    n_head=2,
    pad_token_id=pad_id,
)
model = GPT2LMHeadModel(config)


In [9]:
# Batch collator for causal LM (mlm=False): pads each batch and builds the
# labels. This is the same construction as the collator created earlier.
collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

#### 4. Trainer

In [10]:
# bf16 needs hardware support: having a CUDA device is not enough, and
# requesting bf16 on a GPU without it makes TrainingArguments reject the
# setup. CUDA availability is checked first so is_bf16_supported() is never
# queried on a CPU-only machine.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

args = TrainingArguments(
    output_dir="tiny-model",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    num_train_epochs=50,
    learning_rate=5e-4,
    logging_steps=10,  # log the training loss every 10 optimizer steps
    bf16=use_bf16,
    report_to=[],  # no external experiment trackers
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    data_collator=collator,
)
trainer.train()

# Save weights and tokenizer side by side so "tiny-model" can be reloaded by
# path alone in the generation cells.
model.save_pretrained("tiny-model")
tokenizer.save_pretrained("tiny-model")

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,9.7234
20,9.1085
30,8.6099
40,8.5071
50,7.9902
60,7.4633
70,7.2331
80,7.0264
90,6.5525
100,6.2677


('tiny-model/tokenizer_config.json',
 'tiny-model/special_tokens_map.json',
 'tiny-model/vocab.json',
 'tiny-model/merges.txt',
 'tiny-model/added_tokens.json',
 'tiny-model/tokenizer.json')

In [12]:
# Display the tokenizer's repr to inspect its special tokens and settings.
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [13]:
# Sanity-check the tokenized dataset: its type, the columns of the first
# example, and how many token ids that example holds.
print(type(dataset))             # expected: datasets.arrow_dataset.Dataset
print(dataset[0].keys())         # column names of the first example
print(len(dataset[0]["input_ids"]))  # token count of the first example

<class 'datasets.arrow_dataset.Dataset'>
dict_keys(['input_ids', 'attention_mask'])
3


In [14]:
from transformers import pipeline

# Choose the device at run time: GPU index 0 when CUDA is present, otherwise
# -1 (CPU). A hard-coded 0 fails on machines without a GPU.
gen_device = 0 if torch.cuda.is_available() else -1

# Text-generation pipeline backed by the weights and tokenizer saved under
# "tiny-model".
gen = pipeline(
    "text-generation",
    model="tiny-model",
    tokenizer="tiny-model",
    device=gen_device,
)
# Greedy decoding (do_sample=False) of up to 10 new tokens after the prompt.
print(gen("I love", max_new_tokens=10, do_sample=False)[0]["generated_text"])

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


I love..........


In [15]:
from transformers import AutoTokenizer, GPT2LMHeadModel
import torch

# Run on the GPU when one is present, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reload the tokenizer and weights previously saved under "tiny-model".
tok = AutoTokenizer.from_pretrained("tiny-model")
model = GPT2LMHeadModel.from_pretrained("tiny-model")

# Guard: resize the embedding matrix if its row count disagrees with the
# tokenizer's length.
embedding_rows = model.get_input_embeddings().weight.size(0)
if embedding_rows != len(tok):
    model.resize_token_embeddings(len(tok))

# Padding uses the end-of-text id, mirroring the training setup. Set it on the
# model config and, when the attribute exists, on the generation config too.
model.config.pad_token_id = tok.eos_token_id
if hasattr(model, "generation_config"):
    model.generation_config.pad_token_id = tok.eos_token_id

# Greedy decoding (do_sample=False) of up to 10 new tokens from the prompt.
model.to(device)
model.eval()
inputs = tok("I love", return_tensors="pt").to(device)
with torch.no_grad():
    out = model.generate(max_new_tokens=10, do_sample=False, **inputs)
print(tok.decode(out[0], skip_special_tokens=True))

I love..........


In [16]:
# Sampling is stochastic; seed PyTorch so this cell prints the same text on
# every run of the notebook.
torch.manual_seed(42)

prompt = tok("Kitty", return_tensors="pt").to(device)
out = model.generate(
    **prompt,
    max_new_tokens=20,
    do_sample=True,          # use sampling instead of greedy decoding
    temperature=0.8,         # lower = safer, higher = more random
    top_p=0.9,               # nucleus sampling
    repetition_penalty=1.2,  # discourage loops
    no_repeat_ngram_size=2,  # avoid repeating short n-grams
    eos_token_id=tok.eos_token_id,
    pad_token_id=tok.eos_token_id,
)
print(tok.decode(out[0], skip_special_tokens=True))

Kitty Legionsleesoster suddenlyapache ghostfoundation harmful Armoryfail transistor Broken scriptingAUD Ming franchiversal enthusiasm eclips slave
