In [1]:
import pandas as pd
from datasets import Dataset

# Load the dialogue/summary pairs from a local CSV file.
# The file must provide at least the "dialogue" and "summary" columns.
raw_data = pd.read_csv("samsum-test.csv")

# Keep only the two text columns and drop incomplete rows: pandas reads empty
# CSV fields as NaN (a float), which the tokenizer cannot process as text, so a
# single such row would make the preprocessing step fail later on.
data = raw_data[["dialogue", "summary"]].dropna().reset_index(drop=True)

# Convert to a Hugging Face Dataset. preserve_index=False keeps the pandas
# index from being stored as an extra column.
dataset = Dataset.from_pandas(data, preserve_index=False)

# Split into train (80%) and test (20%) sets; the fixed seed keeps the split reproducible
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

print(f"Training samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")

  from .autonotebook import tqdm as notebook_tqdm


Training samples: 655, Test samples: 164


In [2]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Hub checkpoint used as the starting point for fine-tuning.
model_name = "philschmid/bart-large-cnn-samsum"

# Model and tokenizer are loaded from the same checkpoint so their
# vocabularies match.
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

In [3]:
def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of dialogues and their reference summaries.

    Args:
        examples: Batch mapping with "dialogue" and "summary" keys, each
            holding a list of strings (the shape `Dataset.map(batched=True)`
            passes in).
        max_input_length: Dialogues are truncated to this many tokens.
        max_target_length: Summaries are truncated to this many tokens.

    Returns:
        The tokenizer output for the dialogues, extended with a "labels"
        entry holding the token ids of the summaries.

    No padding is applied here. Sequences keep their natural length and are
    padded per batch by the data collator, which also fills label padding
    with -100 so it is ignored by the loss. Padding every example to the
    maximum length up front makes each training step far more expensive
    than necessary.
    """
    inputs = examples["dialogue"]
    targets = examples["summary"]

    # Tokenize inputs (dialogues)
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize targets (summaries). `text_target` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Raw text columns; only needed as input to the tokenization step.
text_columns = ["dialogue", "summary"]

# Tokenize each split in batches, then drop the original text to save memory
tokenized_train = train_dataset.map(preprocess_function, batched=True).remove_columns(text_columns)
tokenized_test = test_dataset.map(preprocess_function, batched=True).remove_columns(text_columns)

Map: 100%|██████████| 655/655 [00:03<00:00, 177.01 examples/s]
Map: 100%|██████████| 164/164 [00:00<00:00, 166.69 examples/s]


In [4]:
from transformers import DataCollatorForSeq2Seq

# Batches tokenized examples for the seq2seq model during training and evaluation.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [5]:
import torch

# Record the installed PyTorch build alongside the results.
torch_version = torch.__version__
print(torch_version)

2.6.0+cpu


In [6]:
import transformers

# Record the installed transformers release alongside the results.
transformers_version = transformers.__version__
print(transformers_version)

4.49.0


In [7]:
import accelerate

# Record the installed accelerate release alongside the results.
accelerate_version = accelerate.__version__
print(accelerate_version)

1.4.0


In [8]:
from transformers import TrainingArguments

# Hyper-parameters collected in one mapping so they are easy to scan and tweak.
training_config = dict(
    output_dir="./bart-large-cnn-samsum-finetuned",
    num_train_epochs=3,
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
)
training_args = TrainingArguments(**training_config)

print("TrainingArguments initialized successfully!")

TrainingArguments initialized successfully!


In [None]:
from transformers import Trainer

# Wire the model, hyper-parameters, tokenized splits and collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_test,
    train_dataset=tokenized_train,
)

# Run the fine-tuning loop
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
import torch

# Evaluate: with a plain Trainer and no compute_metrics this reports the loss
# on the held-out split.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Compute ROUGE scores
# NOTE(review): `load_metric` is deprecated and no longer shipped in recent
# `datasets` releases. If this import fails, install the `evaluate` package and
# use `evaluate.load("rouge")` instead.
from datasets import load_metric

rouge = load_metric("rouge")

# ROUGE compares generated text with the references. `trainer.predict` on a
# plain Trainer returns the raw model outputs (logits) rather than generated
# token ids, and its label ids contain -100, which is not a decodable token,
# so neither can be passed to `batch_decode`. Summaries are therefore
# generated here and scored against the original reference texts.
generation_batch_size = 4  # small batches keep memory usage modest
dialogues = list(test_dataset["dialogue"])
decoded_labels = list(test_dataset["summary"])

model.eval()
decoded_preds = []
for start in range(0, len(dialogues), generation_batch_size):
    batch = tokenizer(
        dialogues[start:start + generation_batch_size],
        return_tensors="pt",
        max_length=1024,
        truncation=True,
        padding=True,  # pad only to the longest dialogue in this batch
    ).to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(
            **batch,  # passes the attention mask so padding is ignored
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )
    decoded_preds.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
print("ROUGE Scores:", rouge_result)

In [None]:
# Model weights and tokenizer files go to the same directory so the pair can
# be reloaded together from a single path.
save_dir = "./bart-large-cnn-samsum-finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Reload the saved model and tokenizer from disk
finetuned_dir = "./bart-large-cnn-samsum-finetuned"
fine_tuned_tokenizer = BartTokenizer.from_pretrained(finetuned_dir)
fine_tuned_model = BartForConditionalGeneration.from_pretrained(finetuned_dir)

# Sample dialogue used as a quick smoke test
sample_dialogue = "Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye"

# Encode the dialogue as PyTorch tensors, truncated to 1024 tokens
inputs = fine_tuned_tokenizer(
    sample_dialogue,
    truncation=True,
    max_length=1024,
    return_tensors="pt",
)

# Beam-search settings for decoding the summary
generation_options = {"max_length": 128, "num_beams": 4, "early_stopping": True}
summary_ids = fine_tuned_model.generate(inputs["input_ids"], **generation_options)

# Turn the first (only) generated sequence back into text
summary = fine_tuned_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", summary)