In [1]:
!pip install datasets



In [2]:
# from datasets import load_dataset

# dataset = load_dataset("kritsadaK/EDGAR-CORPUS-Financial-Summarization")

In [3]:
# %%
!pip install transformers datasets evaluate sentence_transformers rouge_score bert_score scikit-learn streamlit

# %%
import torch
import os
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on: {device}")

# Load tokenizer and model
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

# Load dataset
dataset = load_dataset("kritsadaK/EDGAR-CORPUS-Financial-Summarization")

# Convert dataset to Pandas DataFrame
df = dataset["train"].to_pandas()

# Rename columns (Ensure they match the dataset's column names)
df = df.rename(columns={"article": "input", "summary": "summary"})

# Split dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Wrap datasets inside a DatasetDict
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Preprocessing function
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input"], max_length=1024, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"], max_length=256, truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
tokenized_train_dataset = dataset_dict["train"].map(preprocess_function, batched=True)
tokenized_test_dataset = dataset_dict["test"].map(preprocess_function, batched=True)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./bart_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./bart_logs",
    save_total_limit=2,
    report_to="none",
    fp16=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Train model
trainer.train()

# Save model & tokenizer
model.save_pretrained("./fine_tuned_bart_model")
tokenizer.save_pretrained("./fine_tuned_bart_model")

print("Training complete and model saved.")


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m82.5 

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.42k [00:00<?, ?B/s]

EDGAR-CORPUS-Financial-Summarization.csv:   0%|          | 0.00/794M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10610 [00:00<?, ? examples/s]

Map:   0%|          | 0/9549 [00:00<?, ? examples/s]



Map:   0%|          | 0/1061 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.2449,1.197466
2,1.0481,1.102999
3,0.9139,1.074477




Training complete and model saved.


In [4]:
# %%
!pip install transformers datasets evaluate sentence_transformers rouge_score bert_score scikit-learn nltk

# %%
import torch
import os
import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Evaluating on: {device}")

# Load tokenizer and model
model_path = "./fine_tuned_bart_model"
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)

# Load dataset
dataset = load_dataset("kritsadaK/EDGAR-CORPUS-Financial-Summarization")

# Convert dataset to Pandas DataFrame
df = dataset["train"].to_pandas()

# Rename columns
df = df.rename(columns={"article": "input", "summary": "summary"})

# Split dataset into test set (use only test set for evaluation)
_, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_df)

# Preprocessing function
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input"], max_length=1024, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"], max_length=256, truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize test dataset
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Load evaluation metrics
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Function to generate predictions
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = model.generate(inputs.input_ids, max_length=256, min_length=50, num_beams=4, length_penalty=2.0)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Evaluate model
references = []
predictions = []

for example in test_df.sample(100).itertuples():  # Sample 100 examples for faster evaluation
    input_text = example.input
    reference_summary = example.summary
    generated_summary = generate_summary(input_text)
    
    references.append([reference_summary])  # BLEU requires list of lists
    predictions.append(generated_summary)

# Compute ROUGE Score
rouge_scores = rouge.compute(predictions=predictions, references=[ref[0] for ref in references])
print("ROUGE Scores:", rouge_scores)

# Compute BERTScore
bertscore_results = bertscore.compute(predictions=predictions, references=[ref[0] for ref in references], lang="en")
print("BERTScore (F1 Mean):", np.mean(bertscore_results["f1"]))

# Compute BLEU Score
bleu_score = corpus_bleu(references, predictions)
print("BLEU Score:", bleu_score)


Evaluating on: cuda




Map:   0%|          | 0/1061 [00:00<?, ? examples/s]



Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]



ROUGE Scores: {'rouge1': 0.22935471156412018, 'rouge2': 0.0872917637946399, 'rougeL': 0.13106643254722683, 'rougeLsum': 0.20915650158887916}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore (F1 Mean): 0.5319436860084533
BLEU Score: 0.26002396504062025


In [5]:
!zip -r fine_tuned_bart_model.zip ./fine_tuned_bart_model

  adding: fine_tuned_bart_model/ (stored 0%)
  adding: fine_tuned_bart_model/merges.txt (deflated 53%)
  adding: fine_tuned_bart_model/vocab.json (deflated 68%)
  adding: fine_tuned_bart_model/model.safetensors (deflated 7%)
  adding: fine_tuned_bart_model/tokenizer_config.json (deflated 75%)
  adding: fine_tuned_bart_model/special_tokens_map.json (deflated 85%)
  adding: fine_tuned_bart_model/generation_config.json (deflated 47%)
  adding: fine_tuned_bart_model/config.json (deflated 61%)


In [6]:
from IPython.display import FileLink
FileLink(r'fine_tuned_bart_model.zip')