In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load dataset
dataset = load_dataset("phosseini/multimodal_satire")   # Images/Text/URL's
dataset = dataset.select_columns("headline")            # We select this as the other columns are not text

dataset = dataset["train"].train_test_split(test_size=0.1)

In [6]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)

In [7]:

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Filter out empty texts
dataset = dataset.filter(lambda example: len(example['headline'].strip()) > 0)

# Then tokenize
def tokenize_function(examples):
    inputs = tokenizer(examples['headline'], truncation=True, padding='max_length', max_length=128)
    inputs['labels'] = inputs['input_ids'] 
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs'
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Train the model
# Uncomment the line below if you want to resume from a checkpoint
# trainer.train(resume_from_checkpoint='./results/checkpoint-15')
trainer.train()

# save the model and tokenizer explicitly
model_output_dir = './results/model'

model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,0.5322,0.511942


('./results/model/tokenizer_config.json',
 './results/model/special_tokens_map.json',
 './results/model/vocab.json',
 './results/model/merges.txt',
 './results/model/added_tokens.json',
 './results/model/tokenizer.json')

In [74]:
# Path to the latest checkpoint
latest_checkpoint = './results/checkpoint-2250'

# Resume training from the latest checkpoint
trainer.train(resume_from_checkpoint=latest_checkpoint)

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=2250, training_loss=0.0, metrics={'train_runtime': 0.0053, 'train_samples_per_second': 1705540.867, 'train_steps_per_second': 426385.217, 'total_flos': 587907072000000.0, 'train_loss': 0.0, 'epoch': 1.0})

In [116]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    return total_params

def generate_headline(subject, model_path="./results/model"):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Style prompt that conditions the model
    prompt = f"Satirical headline: {subject}"

    inputs = tokenizer(prompt, return_tensors="pt")

    outputs = model.generate(
        **inputs,
        max_new_tokens=30,
        min_length=inputs["input_ids"].shape[1] + 20,
        do_sample=True,
        temperature=1.1,
        top_p=0.95,
        top_k=50,
        pad_token_id=tokenizer.eos_token_id,
    )

    # decode full output (including prompt)
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return full_text

# Example usage in notebook:
input_prompt = "Micheal B Jordan"
result = generate_text(input_prompt)
print(result)

 Reveals He 'wasn't aware' the death of his former girlfriend was caused by cocaine
