In [1]:
!pip install datasets rouge opencv-python-headless evaluate rouge_score

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [2]:
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset


In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [4]:
# ds = load_dataset("microsoft/MeetingBank-QA-Summary")
# df = pd.read_parquet("hf://datasets/microsoft/MeetingBank-QA-Summary/data/test-00000-of-00001.parquet")

In [5]:
df = pd.read_parquet("/content/0000.parquet")
df.drop(["idx","QA_pairs","summary"],axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,prompt,gpt4_summary
0,"The report of the Civil Rights, Utilities, Eco...","The Civil Rights, Utilities, Economic Developm..."
1,"Madam Court, could you please read docket 1239...",The Committee on Government Operations discuss...
2,"Item 15, report from City Manager Recommendati...",The City Manager recommended the adoption of t...
3,"Item five, proposed ordinance 2016 0392. This ...",The meeting discussed proposed ordinance 2016 ...
4,Very good. Any comments? Those in favor of con...,The meeting discussed the confirmation of an a...


In [7]:
def tokenize_function(examples):
    inputs = tokenizer(examples["prompt"], truncation=True, max_length=1024, padding="max_length")
    targets = tokenizer(examples["gpt4_summary"], truncation=True, max_length=512, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

dataset = Dataset.from_pandas(df)


sample_text = df.iloc[0]["prompt"]
sample_description = df.iloc[0]["gpt4_summary"]
print("Sample Input:", sample_text)
print("Sample Target:", sample_description)

tokenized_sample = tokenizer(sample_text, truncation=True, max_length=128, padding="max_length")
print("Tokenized Input:", tokenized_sample)

tokenized_target = tokenizer(sample_description, truncation=True, max_length=128, padding="max_length")
print("Tokenized Target:", tokenized_target)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "gpt4_summary"])

Sample Input: The report of the Civil Rights, Utilities, Economic Development and Arts Committee Agenda Item three Resolution 31669 Encouraging as a best practice the use of an individualized tenant assessment using the Fair Housing Act's discriminatory effect standards to avoid Fair Housing Act violations when criminal history is used as a screening criterion in the Landlord Screening Process, Committee recommends that the resolution be adopted as amended grade. I move to amend Resolution 31669 by substituting D four for version D three, which includes a new attachment. A And I understand Councilmember Bagshaw also has an amendment, but let's first, if we could, let me just go through the changes to the resolution since the last committee meeting. The changes are found in two recitals, as well as sections one and five are the primary changes. We added a recital that again lifts up the HUD guidance to show that a criminal history screening policy is next must serve a substantial, legit

Map:   0%|          | 0/862 [00:00<?, ? examples/s]

In [8]:
tokenized_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 862
})

In [9]:
train_test_split = tokenized_datasets.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
valid_dataset = train_test_split["test"]
print(f"Train Size: {len(train_test_split['train'])}, Test Size: {len(train_test_split['test'])}")

Train Size: 775, Test Size: 87


In [10]:
from evaluate import load
rouge = load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: round(v * 100, 2) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
#training arguments
training_args = TrainingArguments(
    output_dir="./BART-finetuned-meetings",
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    learning_rate=5e-5,
    logging_steps=100,
    gradient_accumulation_steps=4,
    dataloader_num_workers=2,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=tokenizer,
)
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshalavyaagrawal[0m ([33mshalavyaagrawal-vellore-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,1.749029
2,No log,0.545974
3,3.220300,0.467662
4,0.477500,0.452154


TrainOutput(global_step=240, training_loss=1.6097103595733642, metrics={'train_runtime': 295.2073, 'train_samples_per_second': 13.126, 'train_steps_per_second': 0.813, 'total_flos': 2319437487144960.0, 'train_loss': 1.6097103595733642, 'epoch': 4.907216494845361})

In [12]:
trainer.evaluate()

{'eval_loss': 0.4521537721157074,
 'eval_runtime': 2.1625,
 'eval_samples_per_second': 40.231,
 'eval_steps_per_second': 10.173,
 'epoch': 4.907216494845361}

In [13]:
trainer.save_model("./saved-bart-meeting-summary")

