In [1]:
from cdp_data import CDPInstances, datasets
import pandas as pd

RANDOM_STATE = 12

sample_dses = []
for infra in [
    CDPInstances.Seattle,
    CDPInstances.Louisville,
    CDPInstances.Oakland,
]:
    ds = datasets.get_session_dataset(
        infra,
        start_datetime="2020-01-01",
        end_datetime="2023-01-01",
        store_transcript=True,
        raise_on_error=False,
    )
    ds["muni"] = infra
    sample_dses.append(ds.sample(15, random_state=RANDOM_STATE))
    
sample_ds = pd.concat(sample_dses, ignore_index=True)

Fetching each model attached to event_ref:   0%|          | 0/563 [00:00<?, ?it/s]

Fetching transcripts:   0%|          | 0/563 [00:00<?, ?it/s]

Fetching each model attached to event_ref:   0%|          | 0/1044 [00:00<?, ?it/s]

Fetching transcripts:   0%|          | 0/1044 [00:00<?, ?it/s]

Fetching each model attached to event_ref:   0%|          | 0/275 [00:00<?, ?it/s]

Fetching transcripts:   0%|          | 0/275 [00:00<?, ?it/s]

In [2]:
SECTIONS_TO_PULL = {
    "seattle-6c40d8abf3c9": [
        45,
        64,
        85,
    ],
    "seattle-c6bbc7ceec24": [
        193,
        215,
        270,
    ],
    "seattle-d9f30ac22162": [
        39,
        51,
        85,
    ],
    "seattle-5ebad5eae48c": [
        34,
        45,
    ],
    "lousiville-29beba7b6ecd": [
        54,
        94,
        143,
    ],
    "louisville-0f76fa93d94b": [
        201,
        235,
        398,
        473,
    ],
    "oakland-d83519594701": [
        31,
        38,
        49,
    ],
    "oakland-c658b9361f53": [
        673,
        723,
    ],
    "oakland-0e66fa4c81f5": [
        41,
        56,
        66,
        128,
    ],
    "oakland-b7cdf3723be0": [
        45,
    ],
}

In [3]:
from cdp_backend.pipeline.transcript_model import Transcript

pulled_sections = []
for session_muni_id, sentence_indices in SECTIONS_TO_PULL.items():
    # Split into muni and id
    parts = session_muni_id.split("-")
    muni = parts[0]
    session_id = parts[1]
    
    # Get the session details
    selected_session = sample_ds[sample_ds["id"] == session_id].iloc[0]
    
    # Load the transcript
    with open(selected_session.transcript_path) as open_f:
        transcript = Transcript.from_json(open_f.read())
        sentences = transcript.sentences
        
    # Pull each section
    for start_i in sentence_indices:
        print(f"{muni} - {session_id} -- {start_i}")
        pulled_sections.append({
            "text": " ".join([sentences[s_i].text for s_i in range(start_i, start_i + 5)]).strip(),
            "meta": {"muni": muni, "session_id": session_id},
        })

to_annotate = pd.DataFrame(pulled_sections)
to_annotate

seattle - 6c40d8abf3c9 -- 45
seattle - 6c40d8abf3c9 -- 64
seattle - 6c40d8abf3c9 -- 85
seattle - c6bbc7ceec24 -- 193
seattle - c6bbc7ceec24 -- 215
seattle - c6bbc7ceec24 -- 270
seattle - d9f30ac22162 -- 39
seattle - d9f30ac22162 -- 51
seattle - d9f30ac22162 -- 85
seattle - 5ebad5eae48c -- 34
seattle - 5ebad5eae48c -- 45
lousiville - 29beba7b6ecd -- 54
lousiville - 29beba7b6ecd -- 94
lousiville - 29beba7b6ecd -- 143
louisville - 0f76fa93d94b -- 201
louisville - 0f76fa93d94b -- 235
louisville - 0f76fa93d94b -- 398
louisville - 0f76fa93d94b -- 473
oakland - d83519594701 -- 31
oakland - d83519594701 -- 38
oakland - d83519594701 -- 49
oakland - c658b9361f53 -- 673
oakland - c658b9361f53 -- 723
oakland - 0e66fa4c81f5 -- 41
oakland - 0e66fa4c81f5 -- 56
oakland - 0e66fa4c81f5 -- 66
oakland - 0e66fa4c81f5 -- 128
oakland - b7cdf3723be0 -- 45


Unnamed: 0,text,meta
0,"Good morning. As you said, I'm a downtown resi...","{'muni': 'seattle', 'session_id': '6c40d8abf3c9'}"
1,Down morning. I'm chair of tree pack. It's dis...,"{'muni': 'seattle', 'session_id': '6c40d8abf3c9'}"
2,"So Doug and Andrew if you are out there, call ...","{'muni': 'seattle', 'session_id': '6c40d8abf3c9'}"
3,"Thank you. Hi, I just want to bring attention ...","{'muni': 'seattle', 'session_id': 'c6bbc7ceec24'}"
4,"Yes, I'm here. I'm unmuted, it appears. Yes. O...","{'muni': 'seattle', 'session_id': 'c6bbc7ceec24'}"
5,"Good afternoon, Council. Thank you for the cha...","{'muni': 'seattle', 'session_id': 'c6bbc7ceec24'}"
6,"Good morning, Pete her. Good morning. I'm in d...","{'muni': 'seattle', 'session_id': 'd9f30ac22162'}"
7,"Jacob, good morning. Hi. I'm Jacob sheer, Orga...","{'muni': 'seattle', 'session_id': 'd9f30ac22162'}"
8,"Good morning. I am Madison, resident of distri...","{'muni': 'seattle', 'session_id': 'd9f30ac22162'}"
9,Good morning. I want to address your agenda it...,"{'muni': 'seattle', 'session_id': '5ebad5eae48c'}"


In [4]:
to_annotate.to_json("local-interest-groups-irr-annotation-set.jsonl", lines=True, orient="records")