In [1]:
import pandas as pd

# 1. Read the dataset
# Load the dataset index from a parquet file at a path relative to the
# notebook's working directory. The rendered output below shows the columns
# infrastructure, session_id, event_id, transcript_path and minutes_items.
df = pd.read_parquet("cdp-minutes-item-generation-dataset/dataset.parquet")
df

Unnamed: 0,infrastructure,session_id,event_id,transcript_path,minutes_items
0,cdp-seattle-21723dcf,001f1a4e4e3c,d916e9acd175,cdp-minutes-item-generation-dataset/transcript...,"[ADOPTION OF OTHER RESOLUTIONS, PRESENTATIONS,..."
1,cdp-seattle-21723dcf,008f4e8d253c,75bb0f7c2ba4,cdp-minutes-item-generation-dataset/transcript...,"[CB 120084, CB 120083, Call To Order, Adjournm..."
2,cdp-seattle-21723dcf,015dd602acce,30411cba563e,cdp-minutes-item-generation-dataset/transcript...,"[CB 120265, Inf 2011, Public Comment, CB 12026..."
3,cdp-seattle-21723dcf,01a6d09dd442,9f581faa5ece,cdp-minutes-item-generation-dataset/transcript...,"[Inf 1736, Inf 1735, President's Report, Appro..."
4,cdp-seattle-21723dcf,01e75165fea1,9486903291a7,cdp-minutes-item-generation-dataset/transcript...,"[Approval of the Minutes, Inf 1972, Inf 1961, ..."
...,...,...,...,...,...
516,cdp-seattle-21723dcf,fd5d2f907449,34f160b4a508,cdp-minutes-item-generation-dataset/transcript...,"[ADJOURNMENT, ADOPTION OF OTHER RESOLUTIONS, C..."
517,cdp-seattle-21723dcf,fd7f4922bcc3,d0d75a1259aa,cdp-minutes-item-generation-dataset/transcript...,"[Inf 1878, Inf 1876, Session I - 9:30 a.m., In..."
518,cdp-seattle-21723dcf,fe09e3d1564f,9fc2e3743166,cdp-minutes-item-generation-dataset/transcript...,"[Approval of the Agenda, Inf 1812, Public Comm..."
519,cdp-seattle-21723dcf,fe7c8aa0dd58,7b13838e9e3e,cdp-minutes-item-generation-dataset/transcript...,"[Preview of Today’s City Council Actions, Coun..."


In [2]:
# Draw a reproducible 200-row random subset of the dataset (fixed seed).
sample = df.sample(n=200, random_state=1)
# Collapse each row's collection of minutes items into one ";"-delimited string.
sample["minutes_items"] = sample["minutes_items"].apply(";".join)
sample

Unnamed: 0,infrastructure,session_id,event_id,transcript_path,minutes_items
273,cdp-seattle-21723dcf,884c13780b1d,7f5c5388633c,cdp-minutes-item-generation-dataset/transcript...,Approval of the Minutes;President's Report;Sig...
272,cdp-seattle-21723dcf,880bc2244617,bfc50250df4b,cdp-minutes-item-generation-dataset/transcript...,CB 119831;Public Comment;Adjournment;Approval ...
329,cdp-seattle-21723dcf,a7d9aa54081e,5d144a76c5fb,cdp-minutes-item-generation-dataset/transcript...,Inf 1662;Public Comment;Session I - 10:00 a.m....
481,cdp-seattle-21723dcf,f16d100e510a,af7492eec004,cdp-minutes-item-generation-dataset/transcript...,CB 119827;Inf 1677;Call To Order;Public Commen...
173,cdp-seattle-21723dcf,59e82e951afe,924690a901d9,cdp-minutes-item-generation-dataset/transcript...,Inf 1962;Approval of the Agenda;Inf 1963;Publi...
...,...,...,...,...,...
59,cdp-seattle-21723dcf,1ca25fd1350f,ee7672ff5985,cdp-minutes-item-generation-dataset/transcript...,Inf 1834;Public Comment;Adjournment;Approval o...
120,cdp-seattle-21723dcf,3f46d6319e6f,b542150ecb3e,cdp-minutes-item-generation-dataset/transcript...,Appt 02183;Public Comment;Adjournment;Inf 2040...
12,cdp-seattle-21723dcf,062f6b2b2003,79fb834ac65c,cdp-minutes-item-generation-dataset/transcript...,Res 32029;Approval of the Agenda;Call To Order...
507,cdp-seattle-21723dcf,f9bbf34ef7fe,4fa4155c9d49,cdp-minutes-item-generation-dataset/transcript...,Appt 01533;CALL TO ORDER;Appt 01534;Appt 01535...


In [9]:
# 3. Split the selected subset into train, validation, and test splits
from sklearn.model_selection import train_test_split

In [22]:
# X: one string per sampled meeting, built by reading the transcript CSV and
# joining its `text` column with single spaces.
X = [
    pd.read_csv(transcript_path)["text"].str.cat(sep=" ")
    for transcript_path in sample["transcript_path"]
]
# y: the matching ";"-joined minutes items, in the same row order as X.
y = sample["minutes_items"].tolist()

In [23]:
# Keep 80% of the examples for training; the remaining 20% (X_rem / y_rem) is
# split again in the next cell. The seed is fixed so the split is repeatable.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Split the held-out remainder evenly into validation and test sets.
# `test_size` is passed through (it was previously assigned but unused), and
# the split is seeded so that Restart & Run All reproduces the same partitions.
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=test_size, random_state=42
)

In [37]:
# Per-split containers: each split maps to [sentences frame, minutes_items frame].
# The validation split is keyed "validation" because every later cell that
# reads it indexes `data["validation"]` / `tokenized_datasets["validation"]`.
temp = {"train": [pd.DataFrame({'sentences': X_train}), pd.DataFrame({'minutes_items': y_train})],
        "validation": [pd.DataFrame({'sentences': X_valid}), pd.DataFrame({'minutes_items': y_valid})],
        "test": [pd.DataFrame({'sentences': X_test}), pd.DataFrame({'minutes_items': y_test})]}

In [38]:
# Wrap the per-split dict in a pandas DataFrame (one column per split).
# NOTE(review): later cells call `data.map(..., batched=True)` and
# `data["train"].column_names`, which look like a dataset-dictionary API rather
# than pandas — confirm `data` is meant to be a pandas DataFrame here.
data = pd.DataFrame(temp)

In [None]:
# NOTE(review): bare reference to the built-in with no call — looks like a
# leftover scratch cell; confirm it can be removed.
print

In [6]:
# 4. Training using mT5

## Preprocessing the data

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer here and the model load later
# in the notebook, so both come from the same pretrained checkpoint.
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Token budget for each transcript passed to the tokenizer
max_input_length = 512

# Token budget for each target string of minutes items
max_target_length = 30


def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs plus label ids.

    Args:
        examples: mapping with a "sentences" entry (source texts) and a
            "minutes_items" entry (target texts).

    Returns:
        The tokenizer output for the source texts (truncated to
        ``max_input_length``), extended with a "labels" entry that holds the
        ``input_ids`` of the target texts (truncated to ``max_target_length``).
    """
    encoded = tokenizer(
        examples["sentences"], max_length=max_input_length, truncation=True
    )
    target_ids = tokenizer(
        examples["minutes_items"],
        max_length=max_target_length,
        truncation=True,
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded

In [None]:
# Apply preprocess_function to `data` in batches (batched=True), so the
# function receives lists of texts rather than single examples.
# NOTE(review): assumes `data` exposes a dataset-style `.map(fn, batched=True)` — confirm.
tokenized_datasets = data.map(preprocess_function, batched=True)

## Metrics for text summarization

In [29]:
import evaluate

# Load the ROUGE metric once; it is reused for the baseline and for the
# fine-tuned model's evaluation further down.
rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## Creating a strong baseline

In [31]:
import nltk

# Fetch the "punkt" tokenizer data (the output below shows it unzipping to the
# local nltk_data directory); presumably required by sent_tokenize used later.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ethanzhuang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one per line.

    Sentences are found with ``sent_tokenize``; texts with fewer than three
    sentences yield however many there are.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)


# Preview the baseline on a single training transcript. The helper expects one
# text, so pick the first entry instead of passing the whole "sentences" column.
print(three_sentence_summary(data["train"]["sentences"][0]))

In [None]:
def evaluate_baseline(dataset, metric):
    """Score the three-sentence baseline against the reference minutes items.

    Args:
        dataset: mapping-like split with a "sentences" column (source texts)
            and a "minutes_items" column (reference strings).
        metric: object exposing ``compute(predictions=..., references=...)``.

    Returns:
        Whatever ``metric.compute`` returns for the baseline summaries.
    """
    summaries = [three_sentence_summary(text) for text in dataset["sentences"]]
    # The reference column is named "minutes_items" everywhere else in the
    # notebook; the previous "minute_items" key did not exist.
    return metric.compute(predictions=summaries, references=dataset["minutes_items"])

In [None]:
import pandas as pd

# Baseline ROUGE on the validation split, reported as percentages.
score = evaluate_baseline(data["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# Depending on the metric implementation, each entry is either an aggregate
# object exposing `.mid.fmeasure` or already a plain float; accept both so the
# cell does not fail with an AttributeError on float results.
rouge_dict = {
    rn: round(
        (score[rn].mid.fmeasure if hasattr(score[rn], "mid") else score[rn]) * 100,
        2,
    )
    for rn in rouge_names
}
rouge_dict

## Fine-tuning mT5 with Keras

In [None]:
# load the pretrained model from the mt5-small checkpoint
from transformers import TFAutoModelForSeq2SeqLM

# Instantiate the TensorFlow seq2seq model from the same checkpoint string the
# tokenizer was loaded from.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# dynamically pad the inputs and the labels
from transformers import DataCollatorForSeq2Seq

# Collator bound to this tokenizer and model, configured to emit TensorFlow
# tensors; it is handed to prepare_tf_dataset as `collate_fn` below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# remove the columns with strings
# Drop the original (pre-tokenization) columns, identified by the column names
# of the untokenized training split, keeping only the tokenizer's outputs.
raw_text_columns = data["train"].column_names
tokenized_datasets = tokenized_datasets.remove_columns(raw_text_columns)

In [None]:
# convert our datasets to tf.data
# Settings shared by both splits: same collator, same batch size.
_batching = {"collate_fn": data_collator, "batch_size": 8}

# Training batches are shuffled; validation batches keep their order.
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"], shuffle=True, **_batching
)
tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"], shuffle=False, **_batching
)

In [None]:
# define our training hyperparameters and compile
from transformers import create_optimizer
import tensorflow as tf

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_epochs = 8
num_train_steps = len(tf_train_dataset) * num_train_epochs
# Last path component of the checkpoint id (e.g. "mt5-small"); used to name the output directory later.
model_name = model_checkpoint.split("/")[-1]

# Optimizer plus learning-rate schedule sized to the total step count, with no
# warmup steps and a weight-decay rate of 0.01.
optimizer, schedule = create_optimizer(
    init_lr=5.6e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# No loss is passed to compile; presumably the model falls back to its own
# internal loss — TODO confirm.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): the global policy is set after the model has been created and
# compiled — confirm it still takes effect for the already-built layers.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# fit the model
from transformers.keras_callbacks import PushToHubCallback

# Callback that saves the model and tokenizer to `output_dir` and pushes them
# to the Hub during training.
# NOTE(review): the "-finetuned-amazon-en-es" suffix does not describe this
# minutes-items dataset — confirm the intended repository name.
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-amazon-en-es", tokenizer=tokenizer
)

# Train for `num_train_epochs` rather than a separate literal: the learning-rate
# schedule was sized from that same value, so the two must stay in sync.
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_train_epochs,
)

In [None]:
# build some lists of labels and predictions for the ROUGE metric to compare
from tqdm import tqdm
import numpy as np

# Separate collator for generation that pads to a multiple of 320 tokens —
# presumably to keep input shapes stable for the XLA-compiled function below
# (TODO confirm).
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=320
)

# Validation batches for generation, in original order.
# NOTE(review): drop_remainder=True presumably discards a final partial batch,
# so those examples would not be scored — confirm this is acceptable.
tf_generate_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=generation_data_collator,
    shuffle=False,
    batch_size=8,
    drop_remainder=True,
)


@tf.function(jit_compile=True)
def generate_with_xla(batch):
    """Generate up to 32 new tokens for a batch using its input ids and attention mask."""
    return model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=32,
    )


# Accumulators for decoded predictions and references across all batches.
all_preds = []
all_labels = []
for batch, labels in tqdm(tf_generate_dataset):
    predictions = generate_with_xla(batch)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = labels.numpy()
    # -100 marks label positions to ignore; swap in the pad token id so the
    # tokenizer can decode the sequence.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Put one sentence per line in both predictions and references.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # decoded_preds / decoded_labels are overwritten on every iteration; only
    # all_preds / all_labels hold the full set once the loop finishes.
    all_preds.extend(decoded_preds)
    all_labels.extend(decoded_labels)

In [None]:
# Compute the ROUGE score over every decoded validation batch.
# `all_preds` / `all_labels` accumulate across the whole generation loop,
# whereas `decoded_preds` / `decoded_labels` only hold the final batch, so
# scoring those would ignore most of the validation set.
result = rouge_score.compute(
    predictions=all_preds, references=all_labels, use_stemmer=True
)
# Depending on the metric implementation, values are either aggregate objects
# exposing `.mid.fmeasure` or plain floats; accept both and report percentages.
result = {
    key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
    for key, value in result.items()
}
{k: round(v, 4) for k, v in result.items()}

## Using fine-tuned model

In [None]:
from transformers import pipeline

# Build a summarization pipeline from a model hosted on the Hub.
# NOTE(review): this id lives under the `huggingface-course` namespace, not the
# output directory used by the training callback above — confirm it is the
# model fine-tuned in this notebook and not a different checkpoint.
hub_model_id = "huggingface-course/mt5-small-finetuned-amazon-en-es"
summarizer = pipeline("summarization", model=hub_model_id)

In [None]:
def print_summary(idx):
    """Print one test example's transcript, reference items, and model summary.

    Args:
        idx: position of the example within the test split.

    The summary is generated from the transcript text (not from the reference
    minutes items), and the first result returned by the pipeline is used.
    """
    # Select the single example at `idx`; previously the whole columns were
    # used and `idx` was ignored.
    sentences = data["test"]["sentences"][idx]
    minutes_items = data["test"]["minutes_items"][idx]
    # The summarizer returns a list with one result dict per input text.
    summary = summarizer(sentences)[0]["summary_text"]
    print(f"'>>> Transcripts: {sentences}'")
    print(f"\n'>>> Minute_items: {minutes_items}'")
    print(f"\n'>>> Summary: {summary}'")