# In [None]:
import pandas as pd
from datasets import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import train_test_split

# Load the dataset
data_path = "train_data.csv"  # Path to your training data
df = pd.read_csv(data_path)

# pandas reads empty CSV cells as NaN (a float). Such rows cannot be stored in
# a string column or tokenized, so drop every example missing either field.
df = df.dropna(subset=["input", "summary"])

# Split into train and validation sets (80/20, fixed seed for reproducibility)
train_texts, val_texts, train_summaries, val_summaries = train_test_split(
    df["input"], df["summary"], test_size=0.2, random_state=42
)

# Convert to Hugging Face Dataset format.
# The split returns pandas Series that still carry the shuffled original
# index; pass plain Python lists so only the values end up in the Dataset.
train_dataset = Dataset.from_dict(
    {"input": train_texts.tolist(), "summary": train_summaries.tolist()}
)
val_dataset = Dataset.from_dict(
    {"input": val_texts.tolist(), "summary": val_summaries.tolist()}
)

# Load BART tokenizer and model
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of source texts and summaries for seq2seq training.

    Args:
        examples: A batch mapping with an "input" list of source texts and a
            "summary" list of target texts.

    Returns:
        The tokenizer output for the source texts (truncated/padded to 1024
        tokens) with an added "labels" entry holding the summary token ids
        (truncated/padded to 150 tokens), where every padding position is
        replaced by -100.
    """
    model_inputs = tokenizer(
        examples["input"], max_length=1024, truncation=True, padding="max_length"
    )
    # `text_target` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=150,
        truncation=True,
        padding="max_length",
    )
    # Label positions that are only padding must be -100 so the loss ignores
    # them; otherwise the model is also trained to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run the batched tokenizer over the training split, then the validation split
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Configuration for the fine-tuning run
training_args = Seq2SeqTrainingArguments(
    # Output locations
    output_dir="./bart-finetuned",
    logging_dir="./logs",
    # Checkpoint frequency and retention
    save_steps=10_000,
    save_total_limit=2,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluation settings
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

# Wire the model, tokenizer and tokenized splits into the trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model first, then its tokenizer, to the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./bart-finetuned")
