In [58]:
### Imports
import pandas as pd
import numpy as np
import json

In [62]:
# Step 1: Load the JSON Data
with open("sessions.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Step 2: Extract Relevant Information
callouts_data = []

for session in data:
    for section in session["sections"]:
        for callout in section["callouts"]:
            if "caller" in callout:  # Ensure there's a caller identified
                callouts_data.append(
                    {
                        "date": session["date"],
                        "period": session["period"],
                        "session": session["sessionNumber"],
                        "speaker_id": section["speaker"],
                        "caller_id": callout.get("caller"),
                    }
                )

# Step 3: Create the DataFrame
df_callouts = pd.DataFrame(callouts_data)

# Display the first few rows of the DataFrame to verify
print(df_callouts)

                       date period  session speaker_id caller_id
0       2023-09-20T00:00:00  XXVII      230      14795     52687
1       2023-09-20T00:00:00  XXVII      230      14795     35514
2       2023-09-20T00:00:00  XXVII      230      14795     35514
3       2023-09-20T00:00:00  XXVII      230       6486     35520
4       2023-09-20T00:00:00  XXVII      230       6486     35520
...                     ...    ...      ...        ...       ...
136287  2002-12-20T00:00:00   XXII        1      13032      2768
136288  2002-12-20T00:00:00   XXII        1      13032      8243
136289  2002-12-20T00:00:00   XXII        1       8240      2964
136290  2002-12-20T00:00:00   XXII        1       8182      2964
136291  2002-12-20T00:00:00   XXII        1        946      2821

[136292 rows x 5 columns]


In [64]:
# Load the persons data ('persons.json')
with open("persons.json", "r", encoding="utf-8") as file:
    persons_data = json.load(file)

# Create the lookup tables
persons_lookup = {
    person["id"]: {"name": person["name"], "parties": person["parties"]}
    for person in persons_data
}

# Update the DataFrame with caller_name, speaker_name, caller_parties, and speaker_parties
df_callouts["caller_name"] = df_callouts["caller_id"].map(
    lambda x: persons_lookup.get(x, {}).get("name", "Unknown")
)
df_callouts["speaker_name"] = df_callouts["speaker_id"].map(
    lambda x: persons_lookup.get(x, {}).get("name", "Unknown")
)
df_callouts["caller_parties"] = df_callouts["caller_id"].map(
    lambda x: ", ".join(persons_lookup.get(x, {}).get("parties", []))
)
df_callouts["speaker_parties"] = df_callouts["speaker_id"].map(
    lambda x: ", ".join(persons_lookup.get(x, {}).get("parties", []))
)

# Display the updated DataFrame
print(df_callouts)

                       date period  session speaker_id caller_id  \
0       2023-09-20T00:00:00  XXVII      230      14795     52687   
1       2023-09-20T00:00:00  XXVII      230      14795     35514   
2       2023-09-20T00:00:00  XXVII      230      14795     35514   
3       2023-09-20T00:00:00  XXVII      230       6486     35520   
4       2023-09-20T00:00:00  XXVII      230       6486     35520   
...                     ...    ...      ...        ...       ...   
136287  2002-12-20T00:00:00   XXII        1      13032      2768   
136288  2002-12-20T00:00:00   XXII        1      13032      8243   
136289  2002-12-20T00:00:00   XXII        1       8240      2964   
136290  2002-12-20T00:00:00   XXII        1       8182      2964   
136291  2002-12-20T00:00:00   XXII        1        946      2821   

                     caller_name                  speaker_name caller_parties  \
0          Alois Stöger, diplômé               August Wöginger            SPÖ   
1                Wolf

In [34]:
df_callouts.to_csv(
    "df_callouts_total.tsv", sep="\t", index=False
)  # save to TSV because initial parsing takes long

In [48]:
df_callouts_XXVII = df_callouts[
    (df_callouts["period"] == "XXVII") 
]
# find people with multiple party asociations and manually correct for most current afiliation
multiple_parties = (df_callouts["caller_parties"].str.contains(",", na=False)) | (
    df_callouts["speaker_parties"].str.contains(",", na=False)
)

party_corrections = {"2345": "GRÜNE", "51577": "FPÖ", "2867": "FPÖ"}

for id_str, correct_party in party_corrections.items():
    # Check for matches in 'caller_id' and update 'caller_parties'
    match_caller = df_callouts_XXVII["caller_id"] == id_str
    df_callouts_XXVII.loc[match_caller, "caller_parties"] = correct_party

    # Check for matches in 'speaker_id' and update 'speaker_parties'
    match_speaker = df_callouts_XXVII["speaker_id"] == id_str
    df_callouts_XXVII.loc[match_speaker, "speaker_parties"] = correct_party

df_callouts_XXVII.replace("", np.nan, inplace=True)
df_callouts_XXVII

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_callouts_XXVII.replace("", np.nan, inplace=True)


Unnamed: 0,date,period,session,speaker_id,caller_id,caller_name,speaker_name,caller_parties,speaker_parties
0,2023-09-20T00:00:00,XXVII,230,14795,52687,"Alois Stöger, diplômé",August Wöginger,SPÖ,ÖVP
1,2023-09-20T00:00:00,XXVII,230,14795,35514,Wolfgang Zanger,August Wöginger,FPÖ,ÖVP
2,2023-09-20T00:00:00,XXVII,230,14795,35514,Wolfgang Zanger,August Wöginger,FPÖ,ÖVP
3,2023-09-20T00:00:00,XXVII,230,6486,35520,Herbert Kickl,MMag. Dr. Susanne Raab,FPÖ,ÖVP
4,2023-09-20T00:00:00,XXVII,230,6486,35520,Herbert Kickl,MMag. Dr. Susanne Raab,FPÖ,ÖVP
...,...,...,...,...,...,...,...,...,...
651,2023-09-20T00:00:00,XXVII,230,5652,78586,"Christian Hafenecker, MA",Mag. Ulrike Fischer,FPÖ,GRÜNE
652,2023-09-20T00:00:00,XXVII,230,5652,5678,Mag. Nina Tomaselli,Mag. Ulrike Fischer,GRÜNE,GRÜNE
653,2023-09-20T00:00:00,XXVII,230,5652,78586,"Christian Hafenecker, MA",Mag. Ulrike Fischer,FPÖ,GRÜNE
654,2023-09-20T00:00:00,XXVII,230,20281,35489,Franz Hörl,"MMag. Katharina Werner, Bakk.",ÖVP,NEOS


In [67]:
# Add a prefix to caller_id and speaker_id
df_callouts_XXVII['caller_id'] = 'c' + df_callouts_XXVII['caller_id'].astype(str)
df_callouts_XXVII['speaker_id'] = 's' + df_callouts_XXVII['speaker_id'].astype(str)

df_callouts_XXVII_grouped = (
    df_callouts_XXVII.groupby(
        [
            "caller_id",
            "speaker_id",
            "caller_name",
            "speaker_name",
            "caller_parties",
            "speaker_parties",
        ]
    )
    .size()
    .reset_index(name="counts")
)

# Assuming df_callouts_XXVII_grouped is your starting DataFrame
# First, calculate the total counts for each caller_id
total_counts_per_caller = df_callouts_XXVII_grouped.groupby('caller_id')['counts'].sum().reset_index(name='counts_total')

# Then, merge this total counts back into your original (or grouped) DataFrame
df_callouts_XXVII_grouped_with_totals = pd.merge(
    df_callouts_XXVII_grouped,
    total_counts_per_caller,
    on='caller_id',
    how='left'
)

# Sorting the DataFrame based on counts_total (you might also want to keep the original sorting as a secondary criterion)
df_sorted_by_total_counts = df_callouts_XXVII_grouped_with_totals.sort_values(by=['counts_total', 'counts'], ascending=False)


df_sorted_by_total_counts.to_csv(
    "df_callouts_XXVII.tsv", sep="\t", index=False
)  # save data as final .tsv

df_sorted_by_total_counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_callouts_XXVII['caller_id'] = 'c' + df_callouts_XXVII['caller_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_callouts_XXVII['speaker_id'] = 's' + df_callouts_XXVII['speaker_id'].astype(str)


Unnamed: 0,caller_id,speaker_id,caller_name,speaker_name,caller_parties,speaker_parties,counts,counts_total
1671,c35468,s2136,Dr. Dagmar Belakowitsch,"Karl Nehammer, MSc",FPÖ,ÖVP,146,2497
1737,c35468,s5676,Dr. Dagmar Belakowitsch,Ralph Schallmeiner,FPÖ,GRÜNE,112,2497
1770,c35468,s83101,Dr. Dagmar Belakowitsch,"Sigrid Maurer, BA",FPÖ,GRÜNE,95,2497
1638,c35468,s14795,Dr. Dagmar Belakowitsch,August Wöginger,FPÖ,ÖVP,85,2497
1759,c35468,s65321,Dr. Dagmar Belakowitsch,Sebastian Kurz,FPÖ,ÖVP,78,2497
...,...,...,...,...,...,...,...,...
3597,c5686,s2136,Dr. Johannes Margreiter,"Karl Nehammer, MSc",NEOS,ÖVP,1,1
3712,c6506,s6486,Mag. Romana Deckenbacher,MMag. Dr. Susanne Raab,ÖVP,ÖVP,1,1
3770,c65321,s78586,Sebastian Kurz,"Christian Hafenecker, MA",ÖVP,FPÖ,1,1
3808,c7106,s145,"Dr. Werner Saxinger, MSc",Doris Bures,ÖVP,SPÖ,1,1


In [70]:
# Extract unique caller and speaker information including party
unique_callers = df_callouts_XXVII[['caller_id', 'caller_name', 'caller_parties']].drop_duplicates().reset_index(drop=True)
unique_speakers = df_callouts_XXVII[['speaker_id', 'speaker_name', 'speaker_parties']].drop_duplicates().reset_index(drop=True)

# Format the IDs, names, and parties, and combine into a single list of nodes
nodes = [{"id": f"c{row['caller_id']}", "name": row['caller_name'], "party": row['caller_parties']} for index, row in unique_callers.iterrows()]
nodes += [{"id": f"s{row['speaker_id']}", "name": row['speaker_name'], "party": row['speaker_parties']} for index, row in unique_speakers.iterrows()]

# Extract and format the interruptions
interruptions = [{"caller": f"c{row['caller_id']}", "speaker": f"s{row['speaker_id']}", "session": row['session']} for index, row in df_callouts_XXVII.iterrows()]

import json

# Combine into a single structure
data_structure = {
    "nodes": nodes,
    "interruptions": interruptions
}

# Save to JSON file
with open("callouts_structure.json", "w", encoding="utf-8") as file:
    json.dump(data_structure, file, ensure_ascii=False, indent=4)
