In [5]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Step 1: Load the dataset
data_path = "samsum-train.csv"  # Update with your file path
df = pd.read_csv(data_path)

# Handle missing values in the 'dialogue' column
df['dialogue'] = df['dialogue'].fillna("")  # Replace NaN with an empty string

# Step 2: Preprocess the data for extractive summarization
def preprocess_data(row):
    sentences = row['dialogue'].split('. ')  # Split dialogues into sentences
    data = []
    for idx, sentence in enumerate(sentences):
        # Mark the first sentence as a summary (label=1), others as non-summary (label=0)
        data.append({"text": sentence.strip(), "label": 1 if idx == 0 else 0})
    return data

# Apply preprocessing to the dataset
processed_data = []
for _, row in df.iterrows():
    processed_data.extend(preprocess_data(row))

# Convert processed data into a DataFrame
processed_df = pd.DataFrame(processed_data)

# Step 3: Split the data into train and evaluation sets
train_df, eval_df = train_test_split(processed_df, test_size=0.1, random_state=42)

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Step 4: Tokenize the data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenize the training and evaluation datasets
train_tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
train_tokenized_dataset = train_tokenized_dataset.rename_column("label", "labels")
train_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

eval_tokenized_dataset = eval_dataset.map(tokenize_function, batched=True)
eval_tokenized_dataset = eval_tokenized_dataset.rename_column("label", "labels")
eval_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Step 5: Fine-tune the BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert_summarization",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_steps=100,
    logging_steps=10,
    save_total_limit=2,
    fp16=True,  # Enable mixed precision for faster training on GPUs
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=eval_tokenized_dataset,  # Add evaluation dataset
    tokenizer=tokenizer
)

# Step 6: Train the model
print("Starting training...")
trainer.train()

# Step 7: Save the fine-tuned model
print("Saving the model...")
trainer.save_model("./bert_summarization_finetuned")
tokenizer.save_pretrained("./bert_summarization_finetuned")
print("Model saved to ./bert_summarization_finetuned")


Map:   0%|          | 0/40227 [00:00<?, ? examples/s]

Map:   0%|          | 0/4470 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.1329,0.081033
2,0.0389,0.095157
3,0.0008,0.131915


Saving the model...
Model saved to ./bert_summarization_finetuned


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [11]:
!pip install rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=719ba0d78871956d6f1531498d4a79eefa07c88f13871d3082736ee17b7778b5
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [16]:
import torch
import numpy as np  # Import numpy
from sklearn.metrics import accuracy_score
import evaluate
from transformers import Trainer, TrainingArguments

# Step 1: Calculate final training loss

# Get the trainer's final training loss
train_loss = trainer.state.best_metric  # This is the best validation loss tracked by Trainer
print(f"Final training loss: {train_loss}")

# Step 2: Evaluation on the validation dataset

# Define a function to compute metrics during evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Convert logits and labels to torch tensors if they are not already
    logits = torch.tensor(logits) if isinstance(logits, np.ndarray) else logits
    labels = torch.tensor(labels) if isinstance(labels, np.ndarray) else labels

    # Calculate predictions using argmax
    predictions = torch.argmax(logits, dim=-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels.numpy(), predictions.numpy())

    # ROUGE scores (for summarization tasks)
    rouge = evaluate.load("rouge")
    references = [str(label) for label in labels.numpy()]
    predicted_summaries = [str(prediction) for prediction in predictions.numpy()]
    result = rouge.compute(predictions=predicted_summaries, references=references)
    return {
        'accuracy': accuracy,
        'rouge1': result['rouge1'],
        'rouge2': result['rouge2'],
        'rougeL': result['rougeL'],
    }

# Set up evaluation arguments
eval_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    do_eval=True
)

# Run evaluation
trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=eval_tokenized_dataset,
    compute_metrics=compute_metrics
)

eval_results = trainer.evaluate()

# Print evaluation results
print("Evaluation results: ")
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"ROUGE-1: {eval_results['eval_rouge1']:.4f}")
print(f"ROUGE-2: {eval_results['eval_rouge2']:.4f}")
print(f"ROUGE-L: {eval_results['eval_rougeL']:.4f}")

# Step 3: Save evaluation metrics
with open("./bert_summarization_finetuned/evaluation_metrics.txt", "w") as f:
    f.write(f"Final training loss: {train_loss}\n")
    f.write(f"Evaluation Accuracy: {eval_results['eval_accuracy']:.4f}\n")
    f.write(f"ROUGE-1: {eval_results['eval_rouge1']:.4f}\n")
    f.write(f"ROUGE-2: {eval_results['eval_rouge2']:.4f}\n")
    f.write(f"ROUGE-L: {eval_results['eval_rougeL']:.4f}\n")

print("Evaluation metrics saved to ./bert_summarization_finetuned/evaluation_metrics.txt")


Final training loss: None


Evaluation results: 
Accuracy: 0.9723
ROUGE-1: 0.9725
ROUGE-2: 0.0000
ROUGE-L: 0.9720
Evaluation metrics saved to ./bert_summarization_finetuned/evaluation_metrics.txt


In [21]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
import random

# Step 1: Load the fine-tuned model and tokenizer
model_path = "./bert_summarization_finetuned"  # Path to your fine-tuned model
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Step 2: Load a sample text from the dataset
data_path = "/content/samsum-train.csv"  # Update with the path to your dataset
df = pd.read_csv(data_path)

# Take a random sample from the dataset
random_row = random.choice(df.index)  # Select a random index from the dataset
sample_text = df.iloc[random_row]['dialogue']  # Get the 'dialogue' column from the random row

print(f"Original Text: {sample_text}")

# Step 3: Preprocess the sample text for classification (for BERT)
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Step 4: Make a prediction
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1).item()

# Step 5: Show the predicted summary (you can define how to interpret the output)
if prediction == 1:
    print("Predicted Summary: First sentence of the dialogue")
else:
    print("Predicted Summary: Not the first sentence")

# If you want to generate actual summaries (like GPT or T5), you'd need to use a generative model


Original Text: Maria: We've passed the security and we're waiting for the boarding
Aldona: no problems?
Philip: not at all, was smooth
Aldona: I told you
Maria: I know, but I am traumatised after the stopped me the last time
Maria: it was so stressful and horrible, quite humiliating
Aldona: I know, but it should not happen again
Philip: She is panicking all the time
Philip: it would be even funny, if it wasn't so annoying
Maria: I'm sorry, it's kind of a phobia I guess
Aldona: no, it will pass, you just had very bad experiences
Maria: maybe you're right
Maria: I feel already better, thanks guys for your support
Aldona: 👍
Predicted Summary: First sentence of the dialogue
