In [27]:
### Imports
import pandas as pd
import numpy as np
import json

In [1]:
# Step 1: Load the JSON Data
with open("sessions.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Step 2: Extract Relevant Information
callouts_data = []

for session in data:
    for section in session["sections"]:
        for callout in section["callouts"]:
            if "caller" in callout:  # Ensure there's a caller identified
                callouts_data.append(
                    {
                        "date": session["date"],
                        "period": session["period"],
                        "session": session["sessionNumber"],
                        "speaker_id": section["speaker"],
                        "caller_id": callout.get("caller"),
                    }
                )

# Step 3: Create the DataFrame
df_callouts = pd.DataFrame(callouts_data)

# Display the first few rows of the DataFrame to verify
print(df_callouts.head())

                  date period  session speaker_id caller_id
0  2023-09-20T00:00:00  XXVII      230      14795     52687
1  2023-09-20T00:00:00  XXVII      230      14795     35514
2  2023-09-20T00:00:00  XXVII      230      14795     35514
3  2023-09-20T00:00:00  XXVII      230       6486     35520
4  2023-09-20T00:00:00  XXVII      230       6486     35520


In [2]:
# Load the persons data ('persons.json')
with open("persons.json", "r", encoding="utf-8") as file:
    persons_data = json.load(file)

# Create the lookup tables
persons_lookup = {
    person["id"]: {"name": person["name"], "parties": person["parties"]}
    for person in persons_data
}

# Update the DataFrame with caller_name, speaker_name, caller_parties, and speaker_parties
df_callouts["caller_name"] = df_callouts["caller_id"].map(
    lambda x: persons_lookup.get(x, {}).get("name", "Unknown")
)
df_callouts["speaker_name"] = df_callouts["speaker_id"].map(
    lambda x: persons_lookup.get(x, {}).get("name", "Unknown")
)
df_callouts["caller_parties"] = df_callouts["caller_id"].map(
    lambda x: ", ".join(persons_lookup.get(x, {}).get("parties", []))
)
df_callouts["speaker_parties"] = df_callouts["speaker_id"].map(
    lambda x: ", ".join(persons_lookup.get(x, {}).get("parties", []))
)

# Display the updated DataFrame
print(df_callouts.head())

                  date period  session speaker_id caller_id  \
0  2023-09-20T00:00:00  XXVII      230      14795     52687   
1  2023-09-20T00:00:00  XXVII      230      14795     35514   
2  2023-09-20T00:00:00  XXVII      230      14795     35514   
3  2023-09-20T00:00:00  XXVII      230       6486     35520   
4  2023-09-20T00:00:00  XXVII      230       6486     35520   

             caller_name            speaker_name caller_parties  \
0  Alois Stöger, diplômé         August Wöginger            SPÖ   
1        Wolfgang Zanger         August Wöginger            FPÖ   
2        Wolfgang Zanger         August Wöginger            FPÖ   
3          Herbert Kickl  MMag. Dr. Susanne Raab            FPÖ   
4          Herbert Kickl  MMag. Dr. Susanne Raab            FPÖ   

  speaker_parties  
0             ÖVP  
1             ÖVP  
2             ÖVP  
3             ÖVP  
4             ÖVP  


In [4]:
df_callouts.to_csv("df_callouts_total.tsv", sep="\t", index=False) #save to TSV because initial parsing takes long

In [32]:
df_callouts_XXVII = df_callouts[df_callouts["period"] == "XXVII"] # filter current parliamentary period

# find people with multiple party asociations and manually correct for most current afiliation
multiple_parties = (df_callouts["caller_parties"].str.contains(",", na=False)) | (
    df_callouts["speaker_parties"].str.contains(",", na=False)
)

party_corrections = {"35520": "GRÜNE", "2345": "GRÜNE", "51577": "FPÖ", "2867": "FPÖ"}

for id_str, correct_party in party_corrections.items():
    # Check for matches in 'caller_id' and update 'caller_parties'
    match_caller = df_callouts_XXVII["caller_id"] == id_str
    df_callouts_XXVII.loc[match_caller, "caller_parties"] = correct_party

    # Check for matches in 'speaker_id' and update 'speaker_parties'
    match_speaker = df_callouts_XXVII["speaker_id"] == id_str
    df_callouts_XXVII.loc[match_speaker, "speaker_parties"] = correct_party

df_callouts_XXVII.replace("", np.nan, inplace=True)
df_callouts_XXVII

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_callouts_XXVII.replace('', np.nan, inplace=True)


Unnamed: 0,date,period,session,speaker_id,caller_id,caller_name,speaker_name,caller_parties,speaker_parties
0,2023-09-20T00:00:00,XXVII,230,14795,52687,"Alois Stöger, diplômé",August Wöginger,SPÖ,ÖVP
1,2023-09-20T00:00:00,XXVII,230,14795,35514,Wolfgang Zanger,August Wöginger,FPÖ,ÖVP
2,2023-09-20T00:00:00,XXVII,230,14795,35514,Wolfgang Zanger,August Wöginger,FPÖ,ÖVP
3,2023-09-20T00:00:00,XXVII,230,6486,35520,Herbert Kickl,MMag. Dr. Susanne Raab,GRÜNE,ÖVP
4,2023-09-20T00:00:00,XXVII,230,6486,35520,Herbert Kickl,MMag. Dr. Susanne Raab,GRÜNE,ÖVP
...,...,...,...,...,...,...,...,...,...
23109,2019-10-23T00:00:00,XXVII,1,35468,83122,"Mag. Beate Meinl-Reisinger, MES",Dr. Dagmar Belakowitsch,NEOS,FPÖ
23110,2019-10-23T00:00:00,XXVII,1,35468,83122,"Mag. Beate Meinl-Reisinger, MES",Dr. Dagmar Belakowitsch,NEOS,FPÖ
23111,2019-10-23T00:00:00,XXVII,1,87146,83122,"Mag. Beate Meinl-Reisinger, MES",Dr. Ewa Ernst-Dziedzic,NEOS,GRÜNE
23112,2019-10-23T00:00:00,XXVII,1,87146,35520,Herbert Kickl,Dr. Ewa Ernst-Dziedzic,GRÜNE,GRÜNE


In [33]:
df_callouts_XXVII.to_csv("df_callouts_XXVII.tsv", sep="\t", index=False) #save data as final .tsv