In [1]:
# 1. Import necessary libraries
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments

# 2. Load dataset: the CNN/DailyMail summarization corpus (config "3.0.0"), used here for demonstration
dataset = load_dataset("cnn_dailymail", "3.0.0")

# 3. Load the pre-trained BART tokenizer and model from one shared checkpoint id
checkpoint = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

# 4. Tokenize the dataset (for both source text and summaries)
def tokenize_function(batch):
    """Tokenize a batch of articles and their reference summaries.

    Articles are truncated/padded to 512 tokens and summaries to 150 tokens
    using the module-level ``tokenizer``.

    Args:
        batch: Mapping with an 'article' and a 'highlights' entry. Both are
            handed directly to the tokenizer (this function is applied via
            ``dataset.map(..., batched=True)``, so presumably each entry is a
            list of strings -- confirm against the dataset schema).

    Returns:
        The same ``batch`` object, with three entries added:
        'input_ids' and 'attention_mask' for the articles, and 'labels'
        holding the summary token ids with padding positions set to -100.
    """
    inputs = tokenizer(batch['article'], max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    outputs = tokenizer(batch['highlights'], max_length=150, truncation=True, padding="max_length", return_tensors="pt")

    # Summaries are padded up to max_length with the pad token. If those ids
    # stay in the labels, the loss is also computed on padding positions.
    # -100 is the index the model's cross-entropy loss ignores, so padding is
    # replaced with it; real summary tokens are left untouched.
    summary_ids = outputs['input_ids']
    labels = summary_ids.masked_fill(summary_ids == tokenizer.pad_token_id, -100)

    batch['input_ids'] = inputs['input_ids']
    batch['attention_mask'] = inputs['attention_mask']
    batch['labels'] = labels
    return batch

# Apply tokenize_function to the dataset in batched mode
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

# 5. Define training arguments
# The keyword that schedules evaluation was renamed from `evaluation_strategy`
# to `eval_strategy` in newer transformers releases. Use whichever name the
# installed TrainingArguments dataclass declares, so the script does not fail
# with an unexpected-keyword error on either side of the rename.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",           # where checkpoints are written
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    **{eval_strategy_key: "epoch"},   # evaluate once per epoch
)

# 6. Set up the Trainer for fine-tuning
# Pick out the two splits the Trainer needs before constructing it.
train_split = tokenized_dataset['train']
validation_split = tokenized_dataset['validation']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

# 7. Start training
trainer.train()


ModuleNotFoundError: No module named 'datasets'