In [24]:
import torch
import transformers
import datasets


In [25]:
from datasets import load_dataset

# Load your text files - think of these as your training materials
dataset = load_dataset('text', data_files={'train': 'train.txt', 'valid': 'valid.txt'})

In [26]:
from transformers import GPT2Tokenizer

# Load the same tokenizer that GPT-2 was originally trained with
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # This is crucial - I'll explain why below!

def tokenize_function(examples):
    return tokenizer(examples['text'], 
                   truncation=True,      # Cut off text that's too long
                   max_length=512,       # Maximum length we can handle
                   padding='max_length') # Make all examples the same length

# Apply tokenization to our entire dataset
tokenized_data = dataset.map(tokenize_function, batched=True)

In [27]:
from transformers import GPT2LMHeadModel

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',              # Where to save checkpoints
    num_train_epochs=15,                  # How many times to read through all data
    per_device_train_batch_size=4,       # How many examples to study at once
    gradient_accumulation_steps=8,       # Memory trick - I'll explain below
    learning_rate=2e-5,                  # How big steps to take when learning
    fp16=True                           # Use less memory (if you have a modern GPU)
)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_data['train'],
#     eval_dataset=tokenized_data['valid']
# )

SyntaxError: unmatched ')' (1894747246.py, line 17)

In [None]:
# Start the training process
# To fix the ValueError (model did not return a loss), ensure 'labels' are included during training.
# We'll create a data collator that sets 'labels' = 'input_ids' for language modeling.

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're using causal LM like GPT-2, not masked LM
)

# Re-create trainer to include the correct data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['valid'],
    data_collator=data_collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=3, training_loss=5.135722478230794, metrics={'train_runtime': 4.8912, 'train_samples_per_second': 12.267, 'train_steps_per_second': 0.613, 'total_flos': 15677521920000.0, 'train_loss': 5.135722478230794, 'epoch': 3.0})

In [None]:
# Check how well our model performs on the validation set
results = trainer.evaluate()
print(f"Validation Loss: {results['eval_loss']}")

# Save our fine-tuned model - this is your trained specialist!
model.save_pretrained('fine-tuned-gpt2')
tokenizer.save_pretrained('fine-tuned-gpt2')

Validation Loss: 4.752215385437012


('fine-tuned-gpt2/tokenizer_config.json',
 'fine-tuned-gpt2/special_tokens_map.json',
 'fine-tuned-gpt2/vocab.json',
 'fine-tuned-gpt2/merges.txt',
 'fine-tuned-gpt2/added_tokens.json')