In [1]:
import pandas as pd
import glob
import os
import json

In [2]:
def make_bill_json_df(source: str, year, save: bool = False) -> pd.DataFrame:
    json_files = glob.glob(os.path.join(source, "*.json"))
    columns = [
        "bill_id",
        "status",
        "state",
        "state_id",
        "bill_number",
        "bill_type",
        "bill_type_id",
        "body",
        "body_id",
        "current_body",
        "current_body_id",
        "title",
        "description"
    ]
    data = []
    for file in json_files:
        with open(file, "r") as f:
            json_data = json.load(f)
            row = json_data.get("bill")
            row = {col: row.get(col, None) for col in columns}
            data.append(row)
    response = pd.DataFrame(data)
    if save:
        response.to_csv(f'{year}.csv')
    return response

In [3]:
def load_json_folder(folder_path):
    files = glob.glob(os.path.join(folder_path, "*.json"))
    return [json.load(open(f)) for f in files]

In [4]:
path = "../../data/bills/"#2023/MA/"

years = [
    "2009",
    "2011",
    "2013",
    "2015",
    "2017",
    "2019",
    "2020",
    "2021",
    "2023",
    "2025"
]

In [5]:
courts = ["2009-2010_186th_General_Court",
         "2011-2012_187th_General_Court",
         "2013-2014_188th_General_Court",
         "2015-2016_189th_General_Court",
         "2017-2018_190th_General_Court",
         "2019-2020_191st_General_Court",
         "2021-2022_192nd_General_Court",
         "2023-2024_193rd_General_Court",
         "2025-2026_194th_General_Court"]

In [6]:
#/bill

for year, court in zip(years,courts):
    full_path = path +year+"/MA/"+ court +"/bill"
    print(full_path)
    make_bill_json_df(full_path, year=year, save=True)


../../data/bills/2009/MA/2009-2010_186th_General_Court/bill
../../data/bills/2011/MA/2011-2012_187th_General_Court/bill
../../data/bills/2013/MA/2013-2014_188th_General_Court/bill
../../data/bills/2015/MA/2015-2016_189th_General_Court/bill
../../data/bills/2017/MA/2017-2018_190th_General_Court/bill
../../data/bills/2019/MA/2019-2020_191st_General_Court/bill
../../data/bills/2020/MA/2021-2022_192nd_General_Court/bill
../../data/bills/2021/MA/2023-2024_193rd_General_Court/bill
../../data/bills/2023/MA/2025-2026_194th_General_Court/bill


In [7]:
!ls

2009.csv	    2019_vote.csv	graph.ipynb
2009_roll_call.csv  2020.csv		how_many_rows.ipynb
2009_vote.csv	    2020_roll_call.csv	ingest_bills.py
2011.csv	    2020_vote.csv	ingest_performance.py
2011_roll_call.csv  2021.csv		inget_spending.py
2011_vote.csv	    2021_roll_call.csv	nodes.csv
2013.csv	    2023.csv		people.csv
2013_roll_call.csv  2023_roll_call.csv	politician_bill_links.csv
2013_vote.csv	    2025.csv		politicians_edges.csv
2015.csv	    2025_roll_call.csv	politicians_nodes.csv
2015_roll_call.csv  2025_vote.csv	__pycache__
2015_vote.csv	    bill.csv		quick.py
2017.csv	    covote_edges.csv	test_mapping.ipynb
2017_roll_call.csv  covote_nodes.csv	uniques.csv
2017_vote.csv	    edges.csv		votes.csv
2019.csv	    exploratory.ipynb
2019_roll_call.csv  graph.html


In [8]:
all_dfs = []

for year in years:
    filename = f"{year}.csv"
    if os.path.exists(filename):
        try:
            df = pd.read_csv(filename)
            all_dfs.append(df)
            print(f"Successfully read: {filename}")
        except Exception as e:
            print(f"Error reading {filename}: {e}")
    else:
        print(f"File not found: {filename}")

if all_dfs:
    combined_df = pd.concat(all_dfs, ignore_index=True)
    print("\nConcatenation successful. The combined DataFrame has:")
    print(f"- Number of rows: {len(combined_df)}")
    print(f"- Number of columns: {combined_df.shape[1]}")
else:
    print("\nNo CSV files were successfully read, so no DataFrame was created.")

Successfully read: 2009.csv
Successfully read: 2011.csv
Successfully read: 2013.csv
Successfully read: 2015.csv
Successfully read: 2017.csv
Successfully read: 2019.csv
Successfully read: 2020.csv
Successfully read: 2021.csv
Successfully read: 2023.csv
Successfully read: 2025.csv

Concatenation successful. The combined DataFrame has:
- Number of rows: 50164
- Number of columns: 14


In [9]:
combined_df["bill_id"].nunique()

50164

In [10]:
def make_people_json_df(source: str, year, save: bool = False) -> pd.DataFrame:
    json_files = glob.glob(os.path.join(source, "*.json"))
    columns = ["people_id", "name", "party", "state_id", "role", "district"]
    data = []
    for file in json_files:
        with open(file, "r") as f:
            row = json.load(f)
            row = row.get("person")
            row = {col: row.get(col, None) for col in columns}
            data.append(row)
    df = pd.DataFrame(data)
    if save:
        df.to_csv(f"{year}_vote.csv", index=False)
    return df

In [11]:
for year, court in zip(years,courts):
    full_path = path +year+"/MA/"+ court +"/people"
    print(full_path)
    make_people_json_df(full_path, year=year, save=True)

../../data/bills/2009/MA/2009-2010_186th_General_Court/people
../../data/bills/2011/MA/2011-2012_187th_General_Court/people
../../data/bills/2013/MA/2013-2014_188th_General_Court/people
../../data/bills/2015/MA/2015-2016_189th_General_Court/people
../../data/bills/2017/MA/2017-2018_190th_General_Court/people
../../data/bills/2019/MA/2019-2020_191st_General_Court/people
../../data/bills/2020/MA/2021-2022_192nd_General_Court/people
../../data/bills/2021/MA/2023-2024_193rd_General_Court/people
../../data/bills/2023/MA/2025-2026_194th_General_Court/people


In [31]:
all_politicians = []

for year in years:
    filename = f"{year}_vote.csv"
    if os.path.exists(filename):
        try:
            df = pd.read_csv(filename)
            all_politicians.append(df)
            print(f"Successfully read: {filename}")
        except Exception as e:
            print(f"Error reading {filename}: {e}")
    else:
        print(f"File not found: {filename}")

if all_politicians:
    combined_politicians_df = pd.concat(all_politicians, ignore_index=True)
    print("\nConcatenation successful. The combined DataFrame has:")
    print(f"- Number of rows: {len(combined_df)}")
    print(f"- Number of columns: {combined_df.shape[1]}")
else:
    print("\nNo CSV files were successfully read, so no DataFrame was created.")

Successfully read: 2009_vote.csv
Successfully read: 2011_vote.csv
Successfully read: 2013_vote.csv
Successfully read: 2015_vote.csv
Successfully read: 2017_vote.csv
Successfully read: 2019_vote.csv
Error reading 2020_vote.csv: No columns to parse from file
Error reading 2021_vote.csv: No columns to parse from file
Error reading 2023_vote.csv: No columns to parse from file
Successfully read: 2025_vote.csv

Concatenation successful. The combined DataFrame has:
- Number of rows: 50164
- Number of columns: 14


In [34]:
combined_politicians_df

Unnamed: 0,people_id,name,party,state_id,role,district
0,2388,Geraldo Alicea,D,21,Rep,HD-06-WOR
1,2484,Kevin Murphy,D,21,Rep,HD-18-MID
2,2522,Christopher Speranzo,D,21,Rep,HD-03-BER
3,2559,Robert Hedlund,R,21,Sen,SD-PLYMOU
4,2493,Alice Peisch,D,21,Rep,HD-14-NOR
...,...,...,...,...,...,...
1566,25796,Steven Ouellette,D,21,Rep,HD-08-BRI
1572,22730,Rob Consalvo,D,21,Rep,HD-14-SUF
1574,25790,Dennis Gallagher,D,21,Rep,HD-08-PLY
1575,24775,Priscila Sousa,D,21,Rep,HD-06-MID


In [33]:
combined_politicians_df = combined_politicians_df.drop_duplicates(subset="people_id")
combined_politicians_df.to_csv("politicians_unique.csv")

In [54]:
def make_votes_json_df(source: str, year, save: bool = False) -> pd.DataFrame:
    json_files = glob.glob(os.path.join(source, "*.json"))
    records = []
    for file in json_files:
        with open(file, "r") as f:
            json_data = json.load(f)
            json_data = json_data.get("roll_call")
            base_info = {
                "roll_call_id": json_data["roll_call_id"],
                "bill_id": json_data["bill_id"],
                "date": json_data["date"],
                "desc": json_data["desc"],
                "passed": json_data["passed"],
                "chamber": json_data["chamber"]
            }
            for v in json_data.get("votes"):
                record = {
                    **base_info,
                    "people_id": v["people_id"],
                    "vote_text": v["vote_text"]
                }
                records.append(record)
    df = pd.DataFrame(records)
    if save:
        df.to_csv(f"{year}_roll_call.csv", index=False)
    return df

In [73]:
for year, court in zip(years, courts):
    full_path = path +year+"/MA/"+ court +"/vote"
    print(full_path)
    make_votes_json_df(full_path, year=year, save=True)


../../data/bills/2009/MA/2009-2010_186th_General_Court/vote
../../data/bills/2011/MA/2011-2012_187th_General_Court/vote
../../data/bills/2013/MA/2013-2014_188th_General_Court/vote
../../data/bills/2015/MA/2015-2016_189th_General_Court/vote
../../data/bills/2017/MA/2017-2018_190th_General_Court/vote
../../data/bills/2019/MA/2019-2020_191st_General_Court/vote
../../data/bills/2020/MA/2021-2022_192nd_General_Court/vote
../../data/bills/2021/MA/2023-2024_193rd_General_Court/vote
../../data/bills/2023/MA/2025-2026_194th_General_Court/vote


In [76]:
all_dfs = []

for year in years:
    filename = f"{year}_roll_call.csv"
    if os.path.exists(filename):
        try:
            df = pd.read_csv(filename)
            all_dfs.append(df)
            print(f"Successfully read: {filename}")
        except Exception as e:
            print(f"Error reading {filename}: {e}")
    else:
        print(f"File not found: {filename}")

# Concatenate all DataFrames in the list
if all_dfs:
    combined_df = pd.concat(all_dfs, ignore_index=True)
    print("\nConcatenation successful. The combined DataFrame has:")
    print(f"- Number of rows: {len(combined_df)}")
    print(f"- Number of columns: {combined_df.shape[1]}")
else:
    print("\nNo CSV files were successfully read, so no DataFrame was created.")

Error reading 2009_roll_call.csv: No columns to parse from file
Error reading 2011_roll_call.csv: No columns to parse from file
Error reading 2013_roll_call.csv: No columns to parse from file
Error reading 2015_roll_call.csv: No columns to parse from file
Successfully read: 2017_roll_call.csv
Successfully read: 2019_roll_call.csv
Error reading 2020_roll_call.csv: No columns to parse from file
Error reading 2021_roll_call.csv: No columns to parse from file
Error reading 2023_roll_call.csv: No columns to parse from file
Error reading 2025_roll_call.csv: No columns to parse from file

Concatenation successful. The combined DataFrame has:
- Number of rows: 160222
- Number of columns: 8
