In [3]:
from transformers import Trainer, TrainingArguments, GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset

# Load dataset
dataset = load_dataset('json', data_files='personal.json')
train_test_split = dataset['train'].train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Load tokenizer & model
model_name = 'distilgpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add padding token if not present
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
model.resize_token_embeddings(len(tokenizer))

# Reformat dataset
def preprocess_function(examples):
    text = [f"Question: {q}\nAnswer: {a}" for q, a in zip(examples["input"], examples["output"])]
    tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=100)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=50,
    learning_rate=5e-4,   # Higher because dataset is small
    per_device_train_batch_size=1,
    num_train_epochs=20,  # More epochs so it memorizes
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    save_strategy="epoch"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset
)

# Train model
trainer.train()

# Save final model & tokenizer
tokenizer.save_pretrained('./results')
model.save_pretrained('./results')


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
50,0.202,1.557153
100,0.099,1.632788
150,0.058,1.670857
200,0.0322,1.782033




In [4]:
# testing
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = './results'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()

def generate_response(question, max_length=50):
    prompt = f"Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_p=0.95,
            top_k=50,
            pad_token_id=tokenizer.pad_token_id
        )

    # Remove the prompt from the generated text
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response.replace(prompt, "").strip()

# Example
question = "What is your name?"
print("Q:", question)
print("A:", generate_response(question))


The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Q: What is your name?
A: My name is Sarmad Rj. AI, and building useful projects like school management systems and crypto converters. AI.lit, while staying close to my values and faith.to
