In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# import kagglehub
# import shutil
# import os

# # Download latest version
# path = kagglehub.dataset_download("rmisra/news-headlines-dataset-for-sarcasm-detection")

# print("Path to dataset files:", path)


# dataset = load_dataset("phosseini/multimodal_satire")   # Images/Text/URL's
# dataset = dataset.select_columns("headline")            # We select this as the other columns are not text


# # where KaggleHub downloaded the dataset
# src = path    

# # where YOU want the dataset to go
# dst = "./kaggleds/sarcasm"   # <-- create your own folder
# os.makedirs(dst, exist_ok=True)

# # Copy entire folder tree
# shutil.copytree(src, dst, dirs_exist_ok=True)
# print("Dataset copied to:", dst)

Path to dataset files: /home/sebas/.cache/kagglehub/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection/versions/2


In [2]:
from datasets import load_dataset, concatenate_datasets
import os

# Load original satire dataset (phosseini)
satire_ds = load_dataset("phosseini/multimodal_satire")
satire_ds = satire_ds.select_columns("headline")
satire_train = satire_ds["train"]

# Load Kaggle sarcasm JSON
sarcasm_json_path = "kaggleds/sarcasm/Sarcasm_Headlines_Dataset_v2.json"
sarcasm_ds = load_dataset("json", data_files=sarcasm_json_path)["train"]

# Filter for sarcastic entries only (is_sarcastic == 1)
sarcasm_ds = sarcasm_ds.filter(lambda x: x["is_sarcastic"] == 1)

# Keep only headline column
sarcasm_headlines = sarcasm_ds.select_columns("headline")

# Combine datasets
combined_raw = concatenate_datasets([satire_train, sarcasm_headlines])

# Shuffle + train/test split
combined = combined_raw.shuffle(seed=42).train_test_split(test_size=0.1)

train_ds = combined["train"]
eval_ds = combined["test"]

print(train_ds)
print(eval_ds)


Dataset({
    features: ['headline'],
    num_rows: 21270
})
Dataset({
    features: ['headline'],
    num_rows: 2364
})


In [None]:
# # Load dataset
# dataset = load_dataset("phosseini/multimodal_satire")   # Images/Text/URL's
# dataset = dataset.select_columns("headline")            # We select this as the other columns are not text

# dataset = dataset["train"].train_test_split(test_size=0.1)

In [None]:
# combined = combined_raw.shuffle(seed=42).train_test_split(test_size=0.1)

# train_ds = combined["train"]
# eval_ds  = combined["test"]

In [3]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)

In [10]:
# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Filter out empty texts
train_ds = train_ds.filter(lambda example: len(example['headline'].strip()) > 0)
eval_ds = eval_ds.filter(lambda example: len(example['headline'].strip()) > 0)

# Then tokenize
def tokenize_function(examples):
    inputs = tokenizer(examples['headline'], truncation=True, padding='max_length', max_length=128)
    inputs['labels'] = inputs['input_ids'] 
    return inputs

train_tokenized_datasets = train_ds.map(tokenize_function, batched=True)
eval_tokenized_datasets = eval_ds.map(tokenize_function, batched=True)

Filter:   0%|          | 0/21270 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2364 [00:00<?, ? examples/s]

Map:   0%|          | 0/21270 [00:00<?, ? examples/s]

Map:   0%|          | 0/2364 [00:00<?, ? examples/s]

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',       
    save_strategy='epoch',             # recommended
    num_train_epochs=5,                # better than 1
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    warmup_steps=50,                   # realistically sized
    weight_decay=0.01,
    logging_dir='./logs',
    report_to="none",
    gradient_accumulation_steps=4,                  # avoids TB errors
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=eval_tokenized_datasets,
)

trainer.train()

# save final model + tokenizer
model_output_dir = './results/model'
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [74]:
# Path to the latest checkpoint
latest_checkpoint = './results/checkpoint-2250'

# Resume training from the latest checkpoint
trainer.train(resume_from_checkpoint=latest_checkpoint)

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=2250, training_loss=0.0, metrics={'train_runtime': 0.0053, 'train_samples_per_second': 1705540.867, 'train_steps_per_second': 426385.217, 'total_flos': 587907072000000.0, 'train_loss': 0.0, 'epoch': 1.0})

In [126]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    return total_params

def generate_headline(subject, model_path="./results/model"):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Style prompt that conditions the model
    prompt = f"Satirical headline: {subject}"

    inputs = tokenizer(prompt, return_tensors="pt")

    outputs = model.generate(
        **inputs,
        max_new_tokens=30,
        min_length=inputs["input_ids"].shape[1] + 20,
        do_sample=True,
        temperature=1.1,
        top_p=0.95,
        top_k=50,
        pad_token_id=tokenizer.eos_token_id,
    )

    # decode full output (including prompt)
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return full_text

# Example usage in notebook:
input_prompt = "Micheal B Jordan"
result = generate_text(input_prompt)
print(result)

 was a senior at St. Luke University
