In [1]:
from cdp_data import CDPInstances, datasets
import pandas as pd

In [2]:
# Supported city names mapped to the name of the matching CDPInstances attribute.
# Attribute *names* are stored (rather than the attributes themselves) so that the
# city argument can be validated before CDPInstances is touched.
_CDP_INSTANCE_ATTR_BY_CITY = {
    'seattle': 'Seattle',
    'louisville': 'Louisville',
    'oakland': 'Oakland',
    'alameda': 'Alameda',
}


def _load_session_sentences(sessions):
    """Read every session's transcript CSV into one frame tagged with session_id and muni."""
    frames = [
        pd.read_csv(session['transcript_as_csv_path']).assign(
            session_id=session['id'], muni=session['muni']
        )
        for _, session in sessions.iterrows()
    ]
    return pd.concat(frames, ignore_index=True)


def _chunk_sentences(sentences, chunk_size):
    """Join consecutive sentences of each session into chunks of up to chunk_size sentences."""
    records = []
    for session_id, group in sentences.groupby('session_id'):
        muni = group['muni'].iloc[0]
        texts = group['text'].astype(str).tolist()
        for start in range(0, len(texts), chunk_size):
            records.append({
                # Plain space-join keeps the punctuation (commas, apostrophes,
                # quotes, brackets) that is part of the spoken sentences.
                'text': ' '.join(texts[start:start + chunk_size]),
                'session_id': session_id,
                'muni': muni,
            })
    return pd.DataFrame(records, columns=['text', 'session_id', 'muni'])


def create_annotation_dataset(
    city,
    output_file_name,
    n_sessions=50,
    chunk_size=5,
    n_chunks=1000,
    random_state=None,
    start_datetime="2020-01-01",
    end_datetime="2023-04-01",
):
    """Build a JSON-lines annotation dataset of transcript chunks for one council.

    The function downloads the session dataset for ``city``, randomly samples
    sessions, reads their transcript CSVs, groups consecutive sentences of each
    session into chunks, randomly samples chunks, and writes them to
    ``output_file_name`` with one JSON record per line. Each record has a
    ``text`` field and a ``meta`` field holding ``muni`` and ``session_id``.

    Parameters
    ----------
    city : str
        One of 'seattle', 'louisville', 'oakland', or 'alameda'.
    output_file_name : str
        Path of the JSON-lines file to write.
    n_sessions : int, default 50
        Number of sessions to sample. If fewer sessions are available, all of
        them are used.
    chunk_size : int, default 5
        Number of consecutive sentences per chunk (the last chunk of a session
        may be shorter).
    n_chunks : int, default 1000
        Number of chunks to sample. If fewer chunks are available, all of them
        are used.
    random_state : int or None, default None
        Seed forwarded to both sampling steps; pass an int for reproducible output.
    start_datetime, end_datetime : str
        Date window (YYYY-MM-DD) forwarded to ``datasets.get_session_dataset``.

    Raises
    ------
    ValueError
        If ``city`` is not supported, ``chunk_size`` is smaller than 1, or no
        session with a transcript is available.
    """
    # Validate up front: previously an unknown city fell through every branch
    # and crashed later with an UnboundLocalError.
    if city not in _CDP_INSTANCE_ATTR_BY_CITY:
        supported = ', '.join(sorted(_CDP_INSTANCE_ATTR_BY_CITY))
        raise ValueError(f"Unsupported city {city!r}; expected one of: {supported}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    city_transcripts = datasets.get_session_dataset(
        getattr(CDPInstances, _CDP_INSTANCE_ATTR_BY_CITY[city]),  # the council we want data from
        start_datetime=start_datetime,  # YYYY-MM-DD format
        end_datetime=end_datetime,  # YYYY-MM-DD format
        store_transcript=True,  # store transcripts locally for fast file reading
        store_transcript_as_csv=True,  # store transcripts as CSVs for easy pandas reading
        raise_on_error=False,
    ).assign(muni=city)

    # NOTE(review): with raise_on_error=False some sessions can come back without a
    # converted transcript (presumably a missing CSV path -- confirm against
    # cdp_data); drop them so pd.read_csv is never handed a null path.
    city_transcripts = city_transcripts.dropna(subset=['transcript_as_csv_path'])
    if city_transcripts.empty:
        raise ValueError(f"No sessions with transcripts found for {city!r}")

    # Randomly sample sessions, capped at the number actually available.
    sampled_sessions = city_transcripts.sample(
        n=min(n_sessions, len(city_transcripts)), random_state=random_state
    )

    # All sentences of the sampled sessions, each tagged with session_id and muni.
    city_sentences = _load_session_sentences(sampled_sessions)

    # One row per chunk of consecutive sentences within a session.
    chunks_df = _chunk_sentences(city_sentences, chunk_size)

    # Randomly sample chunks, capped at the number actually available.
    sampled_chunks = chunks_df.sample(
        n=min(n_chunks, len(chunks_df)), random_state=random_state
    ).reset_index(drop=True)

    # Fold the provenance columns into a single 'meta' dict per record.
    sampled_chunks['meta'] = [
        {'muni': muni, 'session_id': session_id}
        for muni, session_id in zip(sampled_chunks['muni'], sampled_chunks['session_id'])
    ]

    # export
    sampled_chunks[['text', 'meta']].to_json(output_file_name, orient='records', lines=True)


In [3]:
# Build one annotation file per council: (city, output path) pairs, processed in order.
annotation_jobs = [
    ('seattle', 'seattle_dataset.jsonl'),
    ('louisville', 'louisville_dataset.jsonl'),
    ('oakland', 'oakland_dataset.jsonl'),
    ('alameda', 'alameda_dataset.jsonl'),
]

for city_name, jsonl_path in annotation_jobs:
    create_annotation_dataset(city_name, jsonl_path)

Fetching each model attached to event_ref:   0%|          | 0/585 [00:00<?, ?it/s]

Fetching transcripts:   0%|          | 0/585 [00:00<?, ?it/s]

Converting transcripts:   0%|          | 0/585 [00:00<?, ?it/s]

Fetching each model attached to event_ref:   0%|          | 0/1059 [00:00<?, ?it/s]

Fetching transcripts:   0%|          | 0/1059 [00:00<?, ?it/s]

  converted_transcript_infos = process_map(


Converting transcripts:   0%|          | 0/1059 [00:00<?, ?it/s]

Fetching each model attached to event_ref:   0%|          | 0/286 [00:00<?, ?it/s]

Fetching transcripts:   0%|          | 0/286 [00:00<?, ?it/s]

Converting transcripts:   0%|          | 0/285 [00:00<?, ?it/s]

Fetching each model attached to event_ref:   0%|          | 0/114 [00:00<?, ?it/s]

Fetching transcripts:   0%|          | 0/114 [00:00<?, ?it/s]

Converting transcripts:   0%|          | 0/114 [00:00<?, ?it/s]