# Training Whole Comment Seg Classification Model

First we need to create a copy of the transcripts in this directory.

In [1]:
from cdp_data import datasets, CDPInstances
import numpy as np
from pathlib import Path
import pandas as pd

# Set randomness
np.random.seed(60)

for council in [
    CDPInstances.Seattle,
    CDPInstances.Oakland,
    CDPInstances.Richmond,
]:
    ds = datasets.get_session_dataset(
        council,
        store_transcript=True,
        store_transcript_as_csv=True,
        start_datetime="2020-01-01",
        end_datetime="2024-01-01",
        raise_on_error=False,
    )

    # overall directory for saving
    storage_dir = Path(f"{council}-transcripts/")
    storage_dir.mkdir(exist_ok=True)

    # iter sessions 
    for _, row in ds.iterrows():
        # create the copy path
        transcript_copy_path = storage_dir / f"{row['id']}.csv"

        # read the original transcript
        transcript = pd.read_csv(row.transcript_as_csv_path)

        # keep only the index and text columns
        transcript = transcript[[
            "index",
            "text",
        ]]

        # rename index to sentence_index
        transcript = transcript.rename(columns={"index": "sentence_index"})

        # add column for session id
        transcript["session_id"] = row["id"]

        # add column for council
        transcript["council"] = council

        # save the modified transcript
        transcript.to_csv(transcript_copy_path, index=False)

  from .autonotebook import tqdm as notebook_tqdm
Fetching each model attached to event_ref: 100%|██████████| 640/640 [00:03<00:00, 161.30it/s]
Fetching transcripts: 100%|██████████| 640/640 [00:08<00:00, 77.10it/s]
Converting transcripts: 100%|██████████| 640/640 [00:01<00:00, 389.23it/s]
Fetching each model attached to event_ref: 100%|██████████| 308/308 [00:03<00:00, 88.02it/s] 
Fetching transcripts: 100%|██████████| 308/308 [00:03<00:00, 78.17it/s]
Converting transcripts: 100%|██████████| 307/307 [00:01<00:00, 196.14it/s]
Fetching each model attached to event_ref: 100%|██████████| 544/544 [00:06<00:00, 89.91it/s] 
Fetching transcripts: 100%|██████████| 544/544 [00:08<00:00, 62.48it/s]
Converting transcripts: 100%|██████████| 544/544 [00:01<00:00, 347.64it/s]


# Data Preprocessing

We need to convert the annotations into a set of examples ready for training, with context windows.

In [2]:
import pandas as pd
import numpy as np
from cdp_data import CDPInstances
from tqdm import tqdm

np.random.seed(60)

# read all annotations
annotation_pds = []
for council_short_name, council_infra_slug in {
    "seattle": CDPInstances.Seattle,
    "oakland": CDPInstances.Oakland,
    "richmond": CDPInstances.Richmond,
}.items():
    annotations = pd.read_csv(f"training-data/whole-period-seg-{council_short_name}.csv")

    # add council column
    annotations["council"] = council_short_name
    annotations["council_infra_slug"] = council_infra_slug

    annotation_pds.append(annotations)

# Combine all annotations
annotations = pd.concat(annotation_pds, ignore_index=True)

# Convert nans in "transcript_quality" to "good-safe-use"
# Then drop any rows that aren't good-safe-use
annotations["transcript_quality"] = annotations["transcript_quality"].fillna("good-safe-use")
annotations["transcript_quality"] = annotations["transcript_quality"].replace({"good-safe-to-use": "good-safe-use"})
annotations = annotations[annotations["transcript_quality"] == "good-safe-use"]

# Convert -1 in period_start_sentence_index and period_end_sentence_index to nan
annotations["period_start_sentence_index"] = annotations["period_start_sentence_index"].replace(-1, np.nan)

# Convert nans in "comment_or_hearing" to "comment"
annotations["comment_or_hearing"] = annotations["comment_or_hearing"].fillna("comment")

# Keep only the columns we need
annotations = annotations[[
    "council",
    "council_infra_slug",
    "session_id",
    "period_start_sentence_index",
    "period_end_sentence_index",
    "comment_or_hearing",
]]

# Drop any sessions with nan session_id
annotations = annotations.dropna(subset=["session_id"])

# Ensure that all session_ids are strings
annotations["session_id"] = annotations["session_id"].astype(str)

# we will always take N random negative samples from the same session
n_random_samples = 2

# Create context window sets
# single sentence means the context window is 1 (only the sentence itself)
# three sentence means the context window is 3 (1 before and 1 after)
# five sentence means the context window is 5 (2 before and 2 after)
single_sentence_examples = []
three_sentence_examples = []
five_sentence_examples = []

def get_context_windows(transcript, center_index) -> tuple[str, str, str]:
    # calculate offsets for the three and five sentence examples
    # if the offset would go negative, we just use 0 or len(transcript)
    three_sentence_start_index = max(center_index - 1, 0)
    three_sentence_end_index = min(center_index + 2, len(transcript) - 1)
    five_sentence_start_index = max(center_index - 2, 0)
    five_sentence_end_index = min(center_index + 3, len(transcript) - 1)

    # process the single sentence example
    single_sentence = transcript.iloc[center_index]["text"].strip()

    # process the three sentence example
    three_sentence = " ".join(
        transcript.iloc[
            three_sentence_start_index:
            three_sentence_end_index
        ]["text"]
    ).strip()

    # process the five sentence example
    five_sentence = " ".join(
        transcript.iloc[
            five_sentence_start_index:
            five_sentence_end_index
        ]["text"]
    ).strip()

    return single_sentence, three_sentence, five_sentence


# iterate over the rows of the annotations and create the context window sets
for _, row in tqdm(annotations.iterrows(), total=len(annotations)):
    try:
        # load the session transcript csv
        transcript = pd.read_csv(f"{row.council_infra_slug}-transcripts/{row.session_id.strip()}.csv")

        # Convert rows with text as NaN to empty string
        transcript["text"] = transcript["text"].fillna("")

        # if we have a start sentence index, add all of the context windows samples
        if not np.isnan(row.period_start_sentence_index):
            # get the start sentence index
            period_start_sentence_index = int(row.period_start_sentence_index)

            # get the context windows
            single_sentence, three_sentence, five_sentence = get_context_windows(
                transcript,
                period_start_sentence_index,
            )

            # add all as examples
            single_sentence_examples.append({
                "council": row.council,
                "session_id": row.session_id,
                "text": single_sentence,
                "label": "comment-period-start",
                "comment_or_hearing": row.comment_or_hearing,
            })
            three_sentence_examples.append({
                "council": row.council,
                "session_id": row.session_id,
                "text": three_sentence,
                "label": "comment-period-start",
                "comment_or_hearing": row.comment_or_hearing,
            })
            five_sentence_examples.append({
                "council": row.council,
                "session_id": row.session_id,
                "text": five_sentence,
                "label": "comment-period-start",
                "comment_or_hearing": row.comment_or_hearing,
            })

        # if we have a end sentence index, add all of the context windows samples
        if not np.isnan(row.period_end_sentence_index):
            # get the end sentence index
            period_end_sentence_index = int(row.period_end_sentence_index)

            # get the context windows
            single_sentence, three_sentence, five_sentence = get_context_windows(
                transcript,
                period_end_sentence_index,
            )

            # add all as examples
            single_sentence_examples.append({
                "council": row.council,
                "session_id": row.session_id,
                "text": single_sentence,
                "label": "comment-period-end",
                "comment_or_hearing": row.comment_or_hearing,
            })
            three_sentence_examples.append({
                "council": row.council,
                "session_id": row.session_id,
                "text": three_sentence,
                "label": "comment-period-end",
                "comment_or_hearing": row.comment_or_hearing,
            })
            five_sentence_examples.append({
                "council": row.council,
                "session_id": row.session_id,
                "text": five_sentence,
                "label": "comment-period-end",
                "comment_or_hearing": row.comment_or_hearing,
            })
        
        # choose N random negative samples from the same session
        # we start by finding N random negative sentence indicies to use as
        # the center of the context windows
        # make sure we specifically exclude the start and end sentence indices
        block_length = 5
        valid_indices = list(range(len(transcript)))
        if not np.isnan(row.period_start_sentence_index):
            # remove the period start sentence index, and the block length before and after
            valid_indices = list(
                set(valid_indices) - set(
                    range(
                        int(row.period_start_sentence_index) - block_length,
                        int(row.period_start_sentence_index) + block_length + 1,
                    )
                )
            )
        if not np.isnan(row.period_end_sentence_index):
            # remove the period end sentence index, and the block length before and after
            valid_indices = list(
                set(valid_indices) - set(
                    range(
                        int(row.period_end_sentence_index) - block_length,
                        int(row.period_end_sentence_index) + block_length + 1,
                    )
                )
            )
        
        # choose N random negative samples
        negative_samples = np.random.choice(valid_indices, n_random_samples, replace=False)

        # process the negative samples
        for negative_sample in negative_samples:
            # get the context windows
            single_sentence, three_sentence, five_sentence = get_context_windows(
                transcript,
                negative_sample,
            )

            # add all as examples
            single_sentence_examples.append({
                "council": row.council,
                "session_id": row.session_id,
                "text": single_sentence,
                "label": "other",
                "comment_or_hearing": "other",
            })
            three_sentence_examples.append({
                "council": row.council,
                "session_id": row.session_id,
                "text": three_sentence,
                "label": "other",
                "comment_or_hearing": "other",
            })
            five_sentence_examples.append({
                "council": row.council,
                "session_id": row.session_id,
                "text": five_sentence,
                "label": "other",
                "comment_or_hearing": "other",
            })

    except Exception as e:
        print(f"Row errored: {row.council} - {row.session_id}")
        print(e)
        continue

# create the dataframes
single_sentence_examples = pd.DataFrame(single_sentence_examples)
three_sentence_examples = pd.DataFrame(three_sentence_examples)
five_sentence_examples = pd.DataFrame(five_sentence_examples)

# save the dataframes
single_sentence_examples.to_csv("training-data/whole-comment-seg-single-sentence-examples.csv", index=False)
three_sentence_examples.to_csv("training-data/whole-comment-seg-three-sentence-examples.csv", index=False)
five_sentence_examples.to_csv("training-data/whole-comment-seg-five-sentence-examples.csv", index=False)

three_sentence_examples.sample(10)

 82%|████████▏ | 571/694 [00:01<00:00, 514.75it/s]

Row errored: oakland - 363760c5a502
[Errno 2] No such file or directory: 'cdp-oakland-ba81c097-transcripts/363760c5a502.csv'
Row errored: oakland - 43df4943ac92
[Errno 2] No such file or directory: 'cdp-oakland-ba81c097-transcripts/43df4943ac92.csv'
Row errored: oakland - cfb96e89ee1d																				
single positional indexer is out-of-bounds
Row errored: oakland - 60ec6e138396
[Errno 2] No such file or directory: 'cdp-oakland-ba81c097-transcripts/60ec6e138396.csv'
Row errored: oakland - 60ec6e138397
[Errno 2] No such file or directory: 'cdp-oakland-ba81c097-transcripts/60ec6e138397.csv'
Row errored: oakland - 7719f0e0db102
[Errno 2] No such file or directory: 'cdp-oakland-ba81c097-transcripts/7719f0e0db102.csv'


100%|██████████| 694/694 [00:01<00:00, 504.04it/s]


Unnamed: 0,council,session_id,text,label,comment_or_hearing
1478,oakland,c1762edb4651,"In our Mid-Cycle budget in June, I did add to ...",other,other
2147,oakland,8fc230ade8f7,Madam city clerk? Through the chair to the cit...,other,other
2522,richmond,4740e71a6966,Public hearing. The committee will now hold a ...,comment-period-start,hearing
1123,oakland,6e8e3394a044,"Oh, the CEO of Unity Council. I'm not surprise...",other,other
1157,oakland,6fb6a975adeb,"I do apologize, your two Thank you to all the ...",comment-period-end,comment
1168,oakland,6fc22799cf42,"And then, as if you can stop sharing your scre...",comment-period-start,hearing
295,seattle,730ba0763750,Thank you very much. That was our final speake...,comment-period-end,comment
326,seattle,2315e3a68f6e,Okay. Okay. Okay.,other,other
2478,richmond,5a19cf40c561,So let's do that. It's open for public comment...,comment-period-start,comment
2380,richmond,acb20ef0075e,Do I hear a motion? Mr. Wheeler is so moved. A...,other,other


In [3]:
three_sentence_examples.label.value_counts()

other                   1376
comment-period-end       633
comment-period-start     619
Name: label, dtype: int64

In [4]:
three_sentence_examples.council.value_counts()

oakland     1544
seattle      688
richmond     396
Name: council, dtype: int64