# Training Whole Comment Seg Classification Model

First, we need to create a local copy of the sampled Seattle transcripts in the `seattle-transcripts/` directory.

In [1]:
from cdp_data import datasets, CDPInstances
import numpy as np
from pathlib import Path
import pandas as pd

# Fix NumPy's global seed before sampling
np.random.seed(60)

# Pull a sample of 200 Seattle sessions (2020-01-01 to 2024-01-01),
# asking cdp_data to also store each transcript as a CSV
ds = datasets.get_session_dataset(
    CDPInstances.Seattle,
    store_transcript=True,
    store_transcript_as_csv=True,
    start_datetime="2020-01-01",
    end_datetime="2024-01-01",
    sample=200,
)

# Folder that receives the trimmed per-session transcript copies
storage_dir = Path("seattle-transcripts/")
storage_dir.mkdir(exist_ok=True)

# Write one trimmed CSV per session, named after the session id
for session_id, source_csv_path in zip(ds["id"], ds["transcript_as_csv_path"]):
    # Load the stored transcript, keep only the sentence index and text,
    # and expose the index under the name `sentence_index`
    transcript = (
        pd.read_csv(source_csv_path)[["index", "text"]]
        .rename(columns={"index": "sentence_index"})
    )

    # Tag every sentence with the session and council it belongs to
    transcript["session_id"] = session_id
    transcript["council"] = CDPInstances.Seattle

    # Persist the trimmed copy without the pandas row index
    transcript.to_csv(storage_dir / f"{session_id}.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm
Fetching each model attached to event_ref: 100%|██████████| 200/200 [00:02<00:00, 93.23it/s] 
Fetching transcripts: 100%|██████████| 200/200 [00:01<00:00, 107.76it/s]
Converting transcripts: 100%|██████████| 200/200 [00:00<00:00, 6387.72it/s]


# Data Preprocessing

We need to convert the annotations into training examples. Each example is the text of a context window (one, three, or five sentences) centered on an annotated or randomly sampled sentence.

In [2]:
import pandas as pd
import numpy as np

# Fix NumPy's global seed so the random negative sampling below is repeatable
np.random.seed(60)

# Load the comment period annotations
annotations = pd.read_csv("training-data/whole-period-seg-seattle.csv")

# How many random negative samples to take from the same session each time
n_random_samples = 3

# One accumulator per context window size:
#   single sentence -> window of 1 (just the sentence itself)
#   three sentence  -> window of 3 (1 before, 1 after)
#   five sentence   -> window of 5 (2 before, 2 after)
single_sentence_examples, three_sentence_examples, five_sentence_examples = (
    [],
    [],
    [],
)

def get_context_windows(transcript, center_index) -> tuple[str, str, str]:
    """Build the 1-, 3- and 5-sentence context windows around one sentence.

    Parameters
    ----------
    transcript:
        DataFrame with a ``"text"`` column holding one sentence per row.
        Rows are addressed by position (``iloc``), so ``center_index`` is
        treated as a row position. Values must be strings; a NaN would make
        the join fail, so fill them beforehand.
    center_index:
        Position of the sentence the windows are centered on.

    Returns
    -------
    tuple[str, str, str]
        ``(single_sentence, three_sentence, five_sentence)``. Multi-sentence
        windows are joined with a single space, and every result is stripped
        of leading/trailing whitespace. Windows are truncated at the
        transcript boundaries rather than padded.

    Raises
    ------
    IndexError
        If ``center_index`` is outside the transcript.
    """
    n_sentences = len(transcript)

    def _window(radius: int) -> str:
        # Clamp the window to the transcript. The slice end is EXCLUSIVE, so
        # the upper clamp must be len(transcript), not len(transcript) - 1,
        # otherwise the last sentence of the transcript can never be included.
        start = max(center_index - radius, 0)
        stop = min(center_index + radius + 1, n_sentences)
        return " ".join(transcript.iloc[start:stop]["text"]).strip()

    # the sentence on its own
    single_sentence = transcript.iloc[center_index]["text"].strip()

    # one sentence of context on each side
    three_sentence = _window(1)

    # two sentences of context on each side
    five_sentence = _window(2)

    return single_sentence, three_sentence, five_sentence

# The three accumulators, in the same order that get_context_windows returns
# its windows (single, three, five). Captured here while they are still lists,
# because the names are rebound to DataFrames at the end of this cell.
_example_lists = (
    single_sentence_examples,
    three_sentence_examples,
    five_sentence_examples,
)


def _add_examples(session_id, windows, label):
    """Append one example per context window size, all sharing ``label``."""
    for examples, text in zip(_example_lists, windows):
        examples.append({
            "session_id": session_id,
            "text": text,
            "label": label,
        })


# iterate over the rows of the annotations and create the context window sets
for _, row in annotations.iterrows():
    # Use the stripped id both for the file lookup and for the stored examples
    # so the examples can always be matched back to their transcript file
    session_id = row.session_id.strip()

    # load the session transcript csv
    transcript = pd.read_csv(f"seattle-transcripts/{session_id}.csv")

    # Convert rows with text as NaN to empty string
    transcript["text"] = transcript["text"].fillna("")

    # Sentence indices that may not be used as negative samples: each
    # annotated boundary plus the two sentences before and after it
    excluded_indices = set()

    # Positive examples: the start and the end of the comment period.
    # Either index may be missing (NaN) in the annotations.
    for label, raw_index in (
        ("comment-period-start", row.period_start_sentence_index),
        ("comment-period-end", row.period_end_sentence_index),
    ):
        if np.isnan(raw_index):
            continue

        boundary_index = int(raw_index)
        _add_examples(
            session_id,
            get_context_windows(transcript, boundary_index),
            label,
        )
        excluded_indices.update(range(boundary_index - 2, boundary_index + 3))

    # Choose N random negative samples from the same session.
    # These are the sentence indices used as the center of the context
    # windows. The candidates are sorted so that, together with the fixed
    # seed, the draw does not depend on set iteration order.
    valid_indices = sorted(set(range(len(transcript))) - excluded_indices)

    # A short transcript may not have N candidates left; sampling without
    # replacement would raise in that case, so take what is available.
    n_negatives = min(n_random_samples, len(valid_indices))
    if n_negatives > 0:
        negative_samples = np.random.choice(
            valid_indices,
            n_negatives,
            replace=False,
        )
    else:
        negative_samples = []

    # process the negative samples
    for negative_sample in negative_samples:
        _add_examples(
            session_id,
            get_context_windows(transcript, negative_sample),
            "other",
        )

# create the dataframes
single_sentence_examples = pd.DataFrame(single_sentence_examples)
three_sentence_examples = pd.DataFrame(three_sentence_examples)
five_sentence_examples = pd.DataFrame(five_sentence_examples)

# save the dataframes
single_sentence_examples.to_csv("training-data/whole-comment-seg-single-sentence-examples.csv", index=False)
three_sentence_examples.to_csv("training-data/whole-comment-seg-three-sentence-examples.csv", index=False)
five_sentence_examples.to_csv("training-data/whole-comment-seg-five-sentence-examples.csv", index=False)

# preview a few of the three-sentence examples
three_sentence_examples.sample(5)

Unnamed: 0,session_id,text,label
483,19555e51df3c,"Mr. Ahn, we can continue. Okay. Four in favor,...",other
587,ab8e206e24c6,Another vendor has developed a helmet that loc...,other
585,ab8e206e24c6,Once you have completed your public comment we...,comment-period-start
711,d13fd84e7845,"And thank you, Wayne, for bringing these to us...",other
70,03e2809fffe1,"Thank you. So on slide four, the slide include...",other
