In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load dataset
ds = load_dataset("phosseini/multimodal_satire")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Filter out empty texts
dataset = dataset.filter(lambda example: len(example['text'].strip()) > 0)

# Then tokenize
def tokenize_function(examples):
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
    inputs['labels'] = inputs['input_ids'] 
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs'
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
# Uncomment the line below if you want to resume from a checkpoint
# trainer.train(resume_from_checkpoint='./results/checkpoint-15')
trainer.train()

# save the model and tokenizer explicitly
model_output_dir = './results/model'

model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

Using device: cuda


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,1.956,1.937084


('./results/model/tokenizer_config.json',
 './results/model/special_tokens_map.json',
 './results/model/vocab.json',
 './results/model/merges.txt',
 './results/model/added_tokens.json',
 './results/model/tokenizer.json')

In [5]:
# Path to the latest checkpoint
latest_checkpoint = './results/checkpoint-5942'

# Resume training from the latest checkpoint
trainer.train(resume_from_checkpoint=latest_checkpoint)

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=5942, training_loss=0.0, metrics={'train_runtime': 0.0039, 'train_samples_per_second': 6107089.577, 'train_steps_per_second': 1526836.633, 'total_flos': 1552531931136000.0, 'train_loss': 0.0, 'epoch': 1.0})

In [6]:
model.save_pretrained('./results/model')
tokenizer.save_pretrained('./results/model')

('./results/model/tokenizer_config.json',
 './results/model/special_tokens_map.json',
 './results/model/vocab.json',
 './results/model/merges.txt',
 './results/model/added_tokens.json',
 './results/model/tokenizer.json')

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    return total_params

def generate_text(input_text, model_path='./results/model', max_length=50):
    # Load the tokenizer and model from the saved directory
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Calculate the Number of Parameters in the model being used for inference
    total_params = get_model_parameters(model)
    print(f"Total number of parameters: {total_params:,}")

    # Prepare the input text you want to generate predictions for
    inputs = tokenizer(input_text, return_tensors='pt')

    # Generate Text
    outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1)

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text

# Example usage in notebook:
input_prompt = "The history of satire"
result = generate_text(input_prompt)
print(result)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Total number of parameters: 124,439,808
The history of satire in the United States is largely based on the work of the late American satirist William Faulkner . Faulkner was a prolific satirist , and his work was often criticized for its lack of satire . Faulkner '
