In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import json

# Notebook display configuration:
#   - raise pandas' display.max_rows option to 1000
#   - format displayed floats with thousands separators and two decimals
#   - IPython magic so matplotlib charts are shown inside the notebook
pd.set_option('display.max_rows', 1000)
pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline

# Load the data
# Both paths are relative to the notebook's working directory - please change them as needed.
# Note - for now, it is okay to ignore the warning about mixed types that read_csv may emit.
population = pd.read_csv('Population by Age and Sex - US, States, Counties.csv')

# JSON text is UTF-8; pass the encoding explicitly because open() otherwise
# falls back to a locale-dependent default, which can mis-decode or reject
# non-ASCII characters on some platforms.
with open('legislators-historical.json', 'r', encoding='utf-8') as file:
    file_data = json.load(file)

# Flatten the nested records into one row per (person, term): the person-level
# fields are copied onto every one of that person's terms.
rows = []
for person in file_data:
    # Optional identifiers / bio fields use .get() so a missing key becomes None;
    # first and last name are required and raise KeyError if absent.
    base_info = {
        "bioguide_id": person["id"].get("bioguide"),
        "govtrack_id": person["id"].get("govtrack"),
        "icpsr_id": person["id"].get("icpsr"),
        "wikipedia": person["id"].get("wikipedia"),
        "wikidata": person["id"].get("wikidata"),
        "first_name": person["name"]["first"],
        "last_name": person["name"]["last"],
        "gender": person["bio"].get("gender"),
        "birthday": person["bio"].get("birthday"),
    }
    for term in person["terms"]:
        # Copy so each term row gets its own dict rather than sharing base_info.
        row = base_info.copy()
        row.update({
            "type": term["type"],
            "start": term["start"],
            "end": term["end"],
            "state": term["state"],
            "district": term.get("district"),
            "party": term.get("party"),
            "class": term.get("class"),
        })
        rows.append(row)

# Convert to DataFrame (built once from the list of row dicts)
legislators = pd.DataFrame(rows)

In [13]:
# Parse the term dates, filter by start year, then look for blank last names

# Convert term start & end to datetimes
legislators['start'] = pd.to_datetime(legislators['start'])
legislators['end'] = pd.to_datetime(legislators['end'])

# Keep only terms whose start year is 1900 or later.
# NOTE(review): this comment previously said "2000" while the code uses 1900 - confirm the intended cutoff.
leg_filtered_df = legislators[legislators['start'].dt.year >= 1900] 

# Show any filtered rows whose last_name is an empty string
leg_filtered_df[leg_filtered_df["last_name"] == ""]


Unnamed: 0,bioguide_id,govtrack_id,icpsr_id,wikipedia,wikidata,first_name,last_name,gender,birthday,type,start,end,state,district,party,class


In [35]:
## Gabby test cell
# Keep only the rows whose Description is 'U.S.', then drop the
# IBRC_Geo_ID, Statefips and Countyfips columns in the same chain.
filtered_pop = (
    population
    .loc[population['Description'] == 'U.S.']
    .drop(columns=['IBRC_Geo_ID', 'Statefips', 'Countyfips'])
)

# Function to get percentages of populations
def percent(row, name):
    """Return ``row[name]`` as a fraction of ``row['Total Population']``.

    Args:
        row: Any mapping-like object indexable by column name that contains
            a 'Total Population' entry.
        name: Key of the value to express as a share of the total.

    Returns:
        ``row[name] / row['Total Population']`` - a ratio, not multiplied by 100.
    """
    total = row['Total Population']
    share = row[name] / total
    return share

# Add a '% <column>' share-of-total column for every column whose name
# contains 'population' (case-insensitive; this includes 'Total Population').
# The matching names are collected before the loop so the '% ...' columns
# added inside it are never revisited, and each share is computed by dividing
# whole columns at once rather than calling a Python function per row.
for c in [col for col in filtered_pop.columns if 'population' in col.lower()]:
    filtered_pop['% ' + c] = filtered_pop[c] / filtered_pop['Total Population']
filtered_pop

Unnamed: 0,Description,Year,Total Population,Population 0-4,Population 5-17,Population 18-24,Population 25-44,Population 45-64,Population 65+,Population Under 18,...,% Population 5-17,% Population 18-24,% Population 25-44,% Population 45-64,% Population 65+,% Population Under 18,% Population 18-54,% Population 55+,% Male Population,% Female Population
0,U.S.,2000,282162411.0,19178293.0,53197896.0,27315274.0,84973340.0,62428040.0,35069568.0,72376189.0,...,0.19,0.1,0.3,0.22,0.12,0.26,0.53,0.21,0.49,0.51
1,U.S.,2001,284968955.0,19298217.0,53372958.0,27992652.0,84523274.0,64491563.0,35290291.0,72671175.0,...,0.19,0.1,0.3,0.23,0.12,0.26,0.53,0.21,0.49,0.51
2,U.S.,2002,287625193.0,19429192.0,53507265.0,28480708.0,83990295.0,66695526.0,35522207.0,72936457.0,...,0.19,0.1,0.29,0.23,0.12,0.25,0.53,0.22,0.49,0.51
3,U.S.,2003,290107933.0,19592446.0,53508312.0,28916746.0,83398001.0,68828899.0,35863529.0,73100758.0,...,0.18,0.1,0.29,0.24,0.12,0.25,0.53,0.22,0.49,0.51
4,U.S.,2004,292805298.0,19785885.0,53511850.0,29302179.0,83066831.0,70935234.0,36203319.0,73297735.0,...,0.18,0.1,0.28,0.24,0.12,0.25,0.53,0.22,0.49,0.51
5,U.S.,2005,295516599.0,19917400.0,53606269.0,29441546.0,82764185.0,73137401.0,36649798.0,73523669.0,...,0.18,0.1,0.28,0.25,0.12,0.25,0.52,0.23,0.49,0.51
6,U.S.,2006,298379912.0,19938883.0,53818831.0,29602839.0,82638980.0,75216272.0,37164107.0,73757714.0,...,0.18,0.1,0.28,0.25,0.12,0.25,0.52,0.23,0.49,0.51
7,U.S.,2007,301231207.0,20125962.0,53893443.0,29808025.0,82509693.0,77068373.0,37825711.0,74019405.0,...,0.18,0.1,0.27,0.26,0.13,0.25,0.52,0.24,0.49,0.51
8,U.S.,2008,304093966.0,20271127.0,53833475.0,30194274.0,82399959.0,78617510.0,38777621.0,74104602.0,...,0.18,0.1,0.27,0.26,0.13,0.24,0.52,0.24,0.49,0.51
9,U.S.,2009,306771529.0,20244518.0,53889649.0,30530346.0,82211153.0,80272688.0,39623175.0,74134167.0,...,0.18,0.1,0.27,0.26,0.13,0.24,0.51,0.24,0.49,0.51


In [36]:
##Ambro Test cell
def load_json(file_name: str) -> pd.DataFrame:
    """Read a legislators JSON file and flatten it to one row per term.

    The file must contain a list of person records, each with ``id``,
    ``name``, ``bio`` and ``terms`` entries. Person-level fields are repeated
    on every row generated from that person's terms.

    Args:
        file_name: Path to the JSON file to read.

    Returns:
        A DataFrame with the columns bioguide_id, govtrack_id, icpsr_id,
        wikipedia, wikidata, first_name, last_name, gender, birthday, type,
        start, end, state, district, party and class. Optional fields that
        are absent in the file come back as missing values. A file holding an
        empty list yields an empty frame that still has these columns.

    Raises:
        KeyError: If a person lacks ``id``, ``name``, ``bio`` or ``terms``,
            a name lacks ``first``/``last``, or a term lacks ``type``,
            ``start``, ``end`` or ``state``.
    """
    columns = [
        "bioguide_id", "govtrack_id", "icpsr_id", "wikipedia", "wikidata",
        "first_name", "last_name", "gender", "birthday",
        "type", "start", "end", "state", "district", "party", "class",
    ]

    # JSON text is UTF-8; be explicit because open() otherwise uses a
    # locale-dependent default that can mis-decode non-ASCII characters.
    with open(file_name, 'r', encoding='utf-8') as file:
        file_data = json.load(file)

    rows = []
    for person in file_data:
        ids = person["id"]
        name = person["name"]
        bio = person["bio"]
        base_info = {
            "bioguide_id": ids.get("bioguide"),
            "govtrack_id": ids.get("govtrack"),
            "icpsr_id": ids.get("icpsr"),
            "wikipedia": ids.get("wikipedia"),
            "wikidata": ids.get("wikidata"),
            "first_name": name["first"],
            "last_name": name["last"],
            "gender": bio.get("gender"),
            "birthday": bio.get("birthday"),
        }
        for term in person["terms"]:
            # Merge into a fresh dict so every term row is independent.
            rows.append({
                **base_info,
                "type": term["type"],
                "start": term["start"],
                "end": term["end"],
                "state": term["state"],
                "district": term.get("district"),
                "party": term.get("party"),
                "class": term.get("class"),
            })

    # Passing the column list keeps the schema stable even when rows is empty;
    # for non-empty input it matches the insertion order of the row dicts.
    legislators = pd.DataFrame(rows, columns=columns)
    return legislators
# Flatten each legislators file into its own one-row-per-term DataFrame:
# `incumbent` is read from the "current" file, `non_incumbent` from the "historical" file.
incumbent = load_json('legislators-current.json')
non_incumbent = load_json('legislators-historical.json')

In [22]:
##Uma test cell

In [None]:
##Chad test cell