In [1]:
import pandas as pd

In [2]:
import os

In [3]:
os.chdir('/kaggle/')

In [4]:
data = pd.read_csv('input/name-bio/name_bio.csv').fillna('')
data = data[data['bio'] != '']

## 1. Preprocess data

In [5]:
from transformers import GPT2Tokenizer
from datasets import DatasetDict, Dataset

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

In [8]:
def bio_length(bio):
    return len(tokenizer(bio)["input_ids"])

In [9]:
from tqdm import tqdm

In [10]:
bio_lengths = {}

In [11]:
for bio in tqdm(data["bio"]):
    bio_lengths[bio] = bio_length(bio)

100%|██████████| 63871/63871 [00:25<00:00, 2459.03it/s]


In [12]:
token_threshold = 50

In [13]:
data_short = data[data["bio"].apply(lambda x: bio_lengths[x] < token_threshold)]
data_long = data[~data["bio"].apply(lambda x: bio_lengths[x] < token_threshold)]

In [14]:
len(data_long)

20584

In [15]:
long_bio_lengths = [bio_lengths[bio] for bio in data_long['bio']]

In [16]:
import numpy as np

In [17]:
percentiles = np.percentile(long_bio_lengths, [90, 95, 99])

print(f"90th percentile: {percentiles[0]}")
print(f"95th percentile: {percentiles[1]}")
print(f"99th percentile: {percentiles[2]}")
print(f"Max length: {max(long_bio_lengths)}")

90th percentile: 155.0
95th percentile: 193.0
99th percentile: 288.16999999999825
Max length: 616


In [18]:
data_short.head()

Unnamed: 0,name,bio
4,Ayaka Hanamura,The older sister.
5,Bay,A rear admiral of the Free Planets Alliance wh...
6,Enishi,Miyagi's childhood friend.
8,Farang,(Source: Wiki)
13,Hanae Wada,An eccentric but kind old woman with gray hair...


In [19]:
data_long.head()

Unnamed: 0,name,bio
37,Lupin III,"When not involved in criminal activities, Lupi..."
40,Mignon,"Mignon is a young, up-and-rising cage fighter...."
51,Murozono,Murozono was the girl who discovered Ikeda Gor...
59,Niikura,"Supposedly Nagumo's best friend, Niikura spend..."
62,Odawara,"Leader of ""14 Orchestra of Evil."" Well known a..."


In [20]:
max_length = 200 # roughly 95th percentile

In [21]:
def chunk_text(text, tokenizer, max_length=200, stride=50):
    """Splits text into overlapping chunks while preserving context."""
    tokens = tokenizer(text)["input_ids"]
    chunks = [tokens[i : i + max_length] for i in range(0, len(tokens), max_length - stride)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

In [22]:
def format_for_gpt2(name, bio):
    bio_chunks = chunk_text(bio, tokenizer, max_length)
    return [f"[CHARACTER] {name}\n[BIO] {chunk} [END]" for chunk in bio_chunks]

In [23]:
import random
myrandom = random.Random(42)

In [24]:
preprocessed_texts = []

In [25]:
for _, row in tqdm(data_long.iterrows(), total=len(data_long)):
    name = row['name']
    bio = row['bio']
    preprocessed_texts += format_for_gpt2(row['name'], row['bio'])

100%|██████████| 20584/20584 [00:53<00:00, 383.48it/s]


In [26]:
print(len(data_long))
print(len(preprocessed_texts))

20584
23039


In [28]:
myrandom.shuffle(preprocessed_texts)

In [30]:
split_ratio = 0.9
split_idx = int(len(preprocessed_texts) * split_ratio)

train_texts = preprocessed_texts[:split_idx]
eval_texts = preprocessed_texts[split_idx:]

In [31]:
import pickle

In [32]:
with open('working/gpt2_dataset.pkl', 'wb') as a:
    pickle.dump({'train': train_texts, 'eval': eval_texts}, a)

In [33]:
print(train_texts[:2])

['[CHARACTER] Seiji Isami\n[BIO] Seiji is an NPC in the "Tokimeki Utopia" online virtual reality dating sim, with the role of "school prince". He competes with Haru for Ichi\'s love. Despite this, he is only in love with Chihiro, another male NPC. [END]', '[CHARACTER] Harry Champ\n[BIO] He is "a man destined to be king," as he regularly states numerous times with each appearance. He is heir to half of the Champ\'s family fortunes, along with his elder sister, Mary Champ; but he couldn\'t care less about his fortune when it comes to his unrequited love for Leena Toros. Harry also has two robots named, Benjamin and Sebastian. Because of his wealth, he owns a menagerie of Zoids, claiming to have everything "from a Gojulas to Cannon Tortoises." His main Zoid is a customized Dark Horn, but he has also piloted an Iron Kong, and a Cannon Tortoise. Many more Zoids are seen in his hangar, among them a Red Horn, a Shield Liger, and a Gordos. [END]']


In [34]:
dataset = DatasetDict({
    "train": Dataset.from_dict({"text": train_texts}),
    "eval": Dataset.from_dict({"text": eval_texts})
})

In [35]:
print(f"Train size: {len(dataset['train'])}, Eval size: {len(dataset['eval'])}")

Train size: 20735, Eval size: 2304


In [37]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

In [38]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/20735 [00:00<?, ? examples/s]

Map:   0%|          | 0/2304 [00:00<?, ? examples/s]

## 2. Load model

In [39]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

In [40]:
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

In [41]:
import torch

In [42]:
print("CUDA available:", torch.cuda.is_available())

CUDA available: True


In [43]:
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

GPU 0: Tesla T4
GPU 1: Tesla T4


In [44]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [56]:
training_args = TrainingArguments(
    output_dir="./working/gpt2-bio-generator_checkpoints",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",  # Evaluate every epoch
    save_strategy="epoch",  # Save checkpoint every epoch
    save_total_limit=3,  # Keep only the last 3 checkpoints
    load_best_model_at_end=True,  # Restore the best model
    metric_for_best_model="eval_loss",  # Track evaluation loss
    num_train_epochs=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    logging_strategy='steps',
    logging_steps=50,
    report_to="none",
    dataloader_pin_memory=True,
    dataloader_num_workers=8,
    ddp_find_unused_parameters=False
)

In [57]:
model = model.to("cuda")

In [58]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

In [59]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.3615,3.281799
2,3.2646,3.243477
3,3.2018,3.221042
4,3.1593,3.209684
5,3.1115,3.199245
6,3.0656,3.195789
7,3.0248,3.195221
8,3.0067,3.190681
9,3.0053,3.191773
10,2.9874,3.191575


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=6480, training_loss=3.1237910753414955, metrics={'train_runtime': 5055.6998, 'train_samples_per_second': 41.013, 'train_steps_per_second': 1.282, 'total_flos': 1.058200805376e+16, 'train_loss': 3.1237910753414955, 'epoch': 10.0})

In [60]:
model_path = 'working/gpt2-bio-generator-best'

In [61]:
model.save_pretrained('working/gpt2-bio-generator-best')
tokenizer.save_pretrained('working/gpt2-bio-generator-best')

('working/gpt2-bio-generator-best/tokenizer_config.json',
 'working/gpt2-bio-generator-best/special_tokens_map.json',
 'working/gpt2-bio-generator-best/vocab.json',
 'working/gpt2-bio-generator-best/merges.txt',
 'working/gpt2-bio-generator-best/added_tokens.json')

In [62]:
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

In [63]:
import torch

In [64]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [85]:
def generate_bio(name, max_length=450, temperature=1.3):
    """
    Generates a character bio using the fine-tuned GPT-2 model.
    
    Args:
        name (str): Character name.
        max_length (int): Maximum output length.
        temperature (float): Controls randomness (higher = more creative).

    Returns:
        str: Generated bio.
    """
    prompt = f"[CHARACTER] {name}\n[BIO]"
    
    prompt_tokens = tokenizer(prompt, return_tensors="pt")
    input_ids = prompt_tokens.input_ids.to(model.device)
    attention_mask = prompt_tokens.attention_mask.to(model.device)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2
    )

    bio = tokenizer.decode(output[0], skip_special_tokens=True)

    # Manually cut off text at "[END]" if GPT-2 generates extra tokens
    if "[END]" in bio:
        bio = bio.split("[END]")[0].strip()

    return bio

In [115]:
print(generate_bio("Nikita Mzhelsky"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[CHARACTER] Nikita Mzhelsky
[BIO] A young officer assigned to the Energetic Development Team in order for Katri Kuga, who has been in a close friendship with Konno before. Mikami calls him “Maz-kun” and is his most faithful friend (though perhaps not truly understanding). He loves Kotoharu. Despite being very polite he has known her since childhood and even helped her escape the dark of despair. While on duty, she was found crying by Kanako and the other four of them in shock because they recognized one another very closely despite their lacklustre attitude toward eachothers situation during training camp which led only Makihara at home rather than on their way home from work. They then spent much more time together after completing three months apart.
