In [6]:

import pandas as pd 
import geopandas as gpd
import csv
from pathlib import Path

from collections import OrderedDict

In [8]:
# Input and output folders, both resolved against the current working directory.
data_dir = Path.cwd().joinpath("data")
out_dir = Path.cwd().joinpath("out-files")

In [9]:
# Util method to convert between state formats

# Two-letter postal abbreviation -> (full state name, numeric FIPS code).
# Built once at definition time instead of on every call.
_STATE_CODES = {
    'AL': ('Alabama', 1),
    'AK': ('Alaska', 2),
    'AZ': ('Arizona', 4),
    'AR': ('Arkansas', 5),
    'CA': ('California', 6),
    'CO': ('Colorado', 8),
    'CT': ('Connecticut', 9),
    'DE': ('Delaware', 10),
    'DC': ('District of Columbia', 11),
    'FL': ('Florida', 12),
    'GA': ('Georgia', 13),
    'HI': ('Hawaii', 15),
    'ID': ('Idaho', 16),
    'IL': ('Illinois', 17),
    'IN': ('Indiana', 18),
    'IA': ('Iowa', 19),
    'KS': ('Kansas', 20),
    'KY': ('Kentucky', 21),
    'LA': ('Louisiana', 22),
    'ME': ('Maine', 23),
    'MD': ('Maryland', 24),
    'MA': ('Massachusetts', 25),
    'MI': ('Michigan', 26),
    'MN': ('Minnesota', 27),
    'MS': ('Mississippi', 28),
    'MO': ('Missouri', 29),
    'MT': ('Montana', 30),
    'NE': ('Nebraska', 31),
    'NV': ('Nevada', 32),
    'NH': ('New Hampshire', 33),
    'NJ': ('New Jersey', 34),
    'NM': ('New Mexico', 35),
    'NY': ('New York', 36),
    'NC': ('North Carolina', 37),
    'ND': ('North Dakota', 38),
    'OH': ('Ohio', 39),
    'OK': ('Oklahoma', 40),
    'OR': ('Oregon', 41),
    'PA': ('Pennsylvania', 42),
    'RI': ('Rhode Island', 44),
    'SC': ('South Carolina', 45),
    'SD': ('South Dakota', 46),
    'TN': ('Tennessee', 47),
    'TX': ('Texas', 48),
    'UT': ('Utah', 49),
    'VT': ('Vermont', 50),
    'VA': ('Virginia', 51),
    'WA': ('Washington', 53),
    'WV': ('West Virginia', 54),
    'WI': ('Wisconsin', 55),
    'WY': ('Wyoming', 56),
    'PR': ('Puerto Rico', 72),
}


def get_state_code(input, FIPS=True, TwoDigit=False, Full=False):
    """Translate a two-letter state abbreviation into the requested format.

    Parameters
    ----------
    input : str
        Upper-case two-letter abbreviation (e.g. ``'CT'``); DC and PR are
        included in the lookup table.
    FIPS : bool, default True
        Return the numeric FIPS code as an ``int`` (e.g. ``9``).
    TwoDigit : bool, default False
        Return the two-letter abbreviation itself (e.g. ``'CT'``).
    Full : bool, default False
        Return the full state name (e.g. ``'Connecticut'``).

    Exactly one of the three flags may be true. Because ``FIPS`` defaults to
    True, asking for another format requires passing ``FIPS=False`` as well.

    Returns
    -------
    int or str
        The state in the selected format.

    Raises
    ------
    ValueError
        If zero or more than one format flag is set.
    KeyError
        If ``input`` is not a known abbreviation.
    """
    if (FIPS + TwoDigit + Full != 1):
        raise ValueError("Exactly one format argument must be True. Default = FIPS")

    full_name, fips = _STATE_CODES[input]

    if FIPS:
        return fips
    if TwoDigit:
        # the lookup key already is the two-letter form
        return input
    return full_name

In [10]:
# converts district code from the money ball csv to a GEOID
# ex. "CT-HD-59" --> '9059'  ('09059' with leading_zero=True)
# ex. "MN-HD-13A" --> '2713A'
def getGEOID(district_str, leading_zero = False):
    """Build a GEOID string from a ``STATE-CHAMBER-DISTRICT`` code.

    The result is the state's FIPS code followed by the district part
    left-padded with zeros to at least three characters.

    Parameters
    ----------
    district_str : str
        District code such as ``"CT-HD-59"``.
    leading_zero : bool, default False
        When true, pad the FIPS prefix to two characters (``'9'`` -> ``'09'``).

    Returns
    -------
    str
        The concatenated GEOID.

    Raises
    ------
    ValueError
        If ``district_str`` contains fewer than two hyphens.
    KeyError
        If the state abbreviation is unknown to ``get_state_code``.
    """
    # maxsplit=2: only the first two hyphens are separators, so a district
    # part that itself contains a hyphen no longer breaks the unpacking
    state, _chamber, dist_num = district_str.split('-', 2)

    GEOID = str(get_state_code(state))
    if leading_zero:
        GEOID = GEOID.rjust(2, '0')

    # rjust never truncates, so district parts longer than 3 chars pass through
    return GEOID + dist_num.rjust(3, '0')

In [22]:
# extracts the chamber type from the district code in the money ball csv
# ex. "CT-HD-59" --> 'HD'
def getChamber(district_str):
    """Return the chamber segment (second field) of a district code.

    Parameters
    ----------
    district_str : str
        Code shaped like ``STATE-CHAMBER-DISTRICT``.

    Returns
    -------
    str
        The text between the first and second hyphen, e.g. ``'HD'`` or ``'SD'``.

    Raises
    ------
    ValueError
        If ``district_str`` contains fewer than two hyphens.
    """
    # maxsplit=2 so a hyphen inside the district part does not break unpacking
    _state, chamber, _district = district_str.split('-', 2)
    return chamber

In [21]:
# extracts the district name.  Used for the non-numerical Massachusetts
# districts in place of GEOID matching
def getName(district_str):
    """Return the district part of a code when it looks like a name.

    District parts shorter than four characters (e.g. ``'59'``, ``'13A'``)
    are treated as numbered districts and yield an empty string.

    Parameters
    ----------
    district_str : str
        Code shaped like ``STATE-CHAMBER-DISTRICT``.

    Returns
    -------
    str
        The district part if it has four or more characters, otherwise ``''``.

    Raises
    ------
    ValueError
        If ``district_str`` contains fewer than two hyphens.
    """
    # maxsplit=2 keeps a hyphenated district name in one piece
    _state, _chamber, dist_str = district_str.split('-', 2)
    return dist_str if len(dist_str) >= 4 else ''

In [38]:
#################################################
#  PROCESS + ADD FIELDS TO MONEYBALL MODEL CSV  #
#################################################
def process_moneyball_data(inFile, outFile):
    """Add GEOID, chamber and district-name columns to a moneyball CSV.

    Reads ``data_dir / inFile``, derives three columns from the ``district``
    column and writes the result to ``data_dir / outFile``.

    Parameters
    ----------
    inFile : str or path-like
        Name of the input CSV, relative to ``data_dir``. Must contain a
        ``district`` column of ``STATE-CHAMBER-DISTRICT`` codes.
    outFile : str or path-like
        Name of the CSV to write, relative to ``data_dir``.

    Returns
    -------
    None
        The processed table is only written to disk.
    """
    df = pd.read_csv(data_dir / inFile)

    # Derive each column straight from `district`. Unlike a row-wise
    # DataFrame.apply returning a Series, this also works on an empty frame
    # and avoids building a Series object per row.
    districts = df['district']
    df['GEOID'] = districts.map(lambda code: getGEOID(code, leading_zero=True))
    df['chamber'] = districts.map(getChamber)
    df['dist_name'] = districts.map(getName)

    # no index column; floats written with 16 decimal places
    df.to_csv(data_dir / outFile, index=False, float_format='%.16f')

In [40]:
# Run the enrichment step: both file names are resolved inside data_dir.
process_moneyball_data(inFile='model-output-7-28.csv', outFile='processed_data.csv')

In [41]:
    # read in moneyball data; force GEOID to str so zero-padded ids such as
    # '09011' keep their leading zero regardless of what else is in the column
    # and compare equal to the string GEOIDs used for matching later on
    df = pd.read_csv(data_dir / 'processed_data.csv', dtype={'GEOID': str})

    # segment to upper and lower chamber
    upper_df = df[df['chamber'] == 'SD']
    lower_df = df[df['chamber'] == 'HD']

In [42]:
# preview the first five upper-chamber rows
upper_df.head(5)

Unnamed: 0,state,district,favored,confidence,rep_nominee,dem_nominee,incumbent,redistricting_voter_power,anti_gerrymandering_party,GEOID,chamber,dist_name
11,MN,MN-SD-14,FALSE,Toss-Up,Jerry Relph,Aric Putnam,R,81.187213,R,27014,SD,
12,MN,MN-SD-58,FALSE,Toss-Up,Zach Duckworth,Matt Little,D,80.552036,R,27058,SD,
15,MN,MN-SD-56,FALSE,Toss-Up,Dan Hall,TBA,R,76.009501,R,27056,SD,
23,MN,MN-SD-20,R,Lean,Rich Draheim,Jon Olson,R,63.66322,R,27020,SD,
24,MN,MN-SD-54,D,Lean,Leilani Holmstadt,Karla Bigham,D,59.532556,R,27054,SD,


In [43]:
# preview the first five lower-chamber rows
lower_df.head(5)

Unnamed: 0,state,district,favored,confidence,rep_nominee,dem_nominee,incumbent,redistricting_voter_power,anti_gerrymandering_party,GEOID,chamber,dist_name
0,TX,TX-HD-112,FALSE,Toss-Up,Angie Chen Button,Brandy Chambers,R,100.0,D,48112,HD,
1,TX,TX-HD-26,R,Tilt,Jacey Jetton,L. Sarah DeMerchant,Open,93.309625,D,48026,HD,
2,KS,KS-HD-98,FALSE,Toss-Up,Ron Howard,TBA,R,91.500504,D,20098,HD,
3,KS,KS-HD-48,FALSE,Toss-Up,Terry Frederick,Jennifer Day,D,90.745331,D,20048,HD,
4,TX,TX-HD-66,FALSE,Toss-Up,Matt Shaheen,Sharon Hirsch,R,90.409286,D,48066,HD,


In [44]:
# load the upper-chamber district shapefile from the data folder
upper_shp = gpd.read_file(
    data_dir.joinpath('UPPER_cb_2019_us_sldu_500k/cb_2019_us_sldu_500k.shp')
)

In [45]:
# preview the first five district shapes
upper_shp.head(5)

Unnamed: 0,STATEFP,SLDUST,AFFGEOID,GEOID,NAME,LSAD,LSY,ALAND,AWATER,geometry
0,34,16,610U600US34016,34016,16,LU,2018,763668910,7272222,"POLYGON ((-75.04435 40.41259, -75.02472 40.431..."
1,23,26,610U600US23026,23026,26,LU,2018,535557926,115675982,"POLYGON ((-70.79991 43.85805, -70.79743 43.858..."
2,31,23,610U600US31023,31023,23,LU,2018,4288368209,54838073,"POLYGON ((-97.36819 41.33874, -97.36822 41.358..."
3,31,34,610U600US31034,31034,34,LU,2018,4230994361,55826999,"POLYGON ((-98.49394 40.85621, -98.49370 40.856..."
4,30,5,610U600US30005,30005,5,LU,2018,835660316,95496113,"POLYGON ((-114.33053 48.22589, -114.32399 48.2..."


In [52]:
def pandas_lambda_geolocate(row, df, df_columns, default_values):
    """Look up the moneyball record for one shapefile row.

    Rows whose ``STATEFP`` is ``'25'`` are matched on ``df['dist_name'] ==
    row['NAME']``; all other rows are matched on ``GEOID``.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``STATEFP`` and ``GEOID`` (and ``NAME`` when ``STATEFP``
        is ``'25'``).
    df : pandas.DataFrame
        Table to search; needs ``GEOID``, ``dist_name`` and every column
        listed in ``df_columns``.
    df_columns : sequence of str
        Columns to copy from the first matching record, in order.
    default_values : sequence
        Values returned when nothing matches.

    Returns
    -------
    pandas.Series
        The selected values of the first match, or ``default_values``.
        A notice is printed when more than one record matches.
    """
    # match by district name if MA
    use_name = row['STATEFP'] == '25'
    mask = (df['dist_name'] == row['NAME']) if use_name else (df['GEOID'] == row['GEOID'])
    candidates = df[mask]

    match_count = len(candidates.index)
    if match_count < 1:
        return pd.Series(default_values)
    if match_count > 1:
        print(f"More than one match found for GEOID: {row['GEOID']}")

    # only the first match is used, even when several were found
    first = candidates.iloc[0]
    return pd.Series([first[column] for column in df_columns])

In [53]:
def get_lean(row, df):
    """Return the race rating label for one shapefile row.

    Matching follows the same rule as the record lookup: ``STATEFP == '25'``
    rows are matched on district name, everything else on ``GEOID``.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``STATEFP`` and ``GEOID`` (and ``NAME`` when ``STATEFP``
        is ``'25'``).
    df : pandas.DataFrame
        Table with ``GEOID``, ``dist_name``, ``confidence`` and ``favored``.

    Returns
    -------
    str
        ``'no data'`` when nothing matches, ``'Toss-Up'`` for toss-ups,
        otherwise ``"<confidence> <favored>"`` taken from the first match.
        A notice is printed when more than one record matches.
    """
    # match by district name if MA
    if row['STATEFP'] == '25':
        candidates = df.loc[df['dist_name'] == row['NAME']]
    else:
        candidates = df.loc[df['GEOID'] == row['GEOID']]

    match_count = len(candidates.index)
    if match_count < 1:
        return 'no data'
    if match_count > 1:
        print(f"More than one match found for GEOID: {row['GEOID']}")

    first = candidates.iloc[0]
    rating, party = first['confidence'], first['favored']
    # toss-ups have no favored party to append
    return rating if rating == 'Toss-Up' else rating + " " + party

In [54]:
# moneyball columns copied onto each district shape, paired position by
# position with the value used when a shape has no moneyball match
df_columns = [
    'district',
    'rep_nominee',
    'dem_nominee',
    'incumbent',
    'anti_gerrymandering_party',
    'redistricting_voter_power',
]
default_values = ['', '', '', '', '', 0]

upper_shp[['DISTRICT', 'NOM_R', 'NOM_D', 'INCUMBENT','ANTI_GERRY_PARTY', 'VOTER_POWER']] = upper_shp.apply(
    pandas_lambda_geolocate,
    axis=1,
    args=(upper_df, df_columns, default_values),
)
upper_shp['LEAN'] = upper_shp.apply(get_lean, axis=1, args=(upper_df,))


In [58]:
# districts whose VOTER_POWER differs from the no-match default of 0
upper_shp.loc[upper_shp['VOTER_POWER'] != 0]

Unnamed: 0,STATEFP,SLDUST,AFFGEOID,GEOID,NAME,LSAD,LSY,ALAND,AWATER,geometry,DISTRICT,NOM_R,NOM_D,INCUMBENT,ANTI_GERRY_PARTY,VOTER_POWER,LEAN
5,45,027,610U600US45027,45027,27,LU,2018,4099645368,50328510,"POLYGON ((-80.89719 34.49168, -80.89565 34.493...",SC-SD-27,Penry Gustafson,Vincent Sheheen,D,D,0.052766,Likely D
7,45,026,610U600US45026,45026,26,LU,2018,1985374241,19510156,"POLYGON ((-81.76577 33.62673, -81.76369 33.628...",SC-SD-26,TBA,Nikki Setzler,D,D,0.019814,Safe D
11,13,043,610U600US13043,13043,43,LU,2018,581735370,11760621,"POLYGON ((-84.18805 33.65409, -84.18391 33.655...",GA-SD-43,Melanie Williams,Tonya Anderson,D,D,0.001368,Safe D
12,13,017,610U600US13017,13017,17,LU,2018,1144431704,25560639,"POLYGON ((-84.35419 33.35336, -84.35418 33.377...",GA-SD-17,Brian Strickland,Kelly Rose,R,D,0.244580,Likely R
17,27,028,610U600US27028,27028,28,LU,2018,4738895114,72931748,"POLYGON ((-92.44957 43.67444, -92.44953 43.682...",MN-SD-28,Jeremy Miller,Sarah Kruger,R,R,4.197868,Safe R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1947,09,011,610U600US09011,09011,11,LU,2018,81400167,3543169,"POLYGON ((-72.96324 41.43941, -72.95792 41.445...",CT-SD-11,FALSE,FALSE,D,R,0.153386,Safe D
1948,09,022,610U600US09022,09022,22,LU,2018,96689045,1876068,"POLYGON ((-73.27603 41.30066, -73.27617 41.301...",CT-SD-22,FALSE,FALSE,D,R,0.261162,Safe D
1949,09,015,610U600US09015,09015,15,LU,2018,84403620,1300223,"POLYGON ((-73.14504 41.55782, -73.13088 41.560...",CT-SD-15,FALSE,FALSE,D,R,0.149968,Safe D
1952,55,018,610U600US55018,55018,18,LU,2018,1568817945,338125636,"POLYGON ((-88.88634 44.04529, -88.88604 44.066...",WI-SD-18,Dan Feyen,Aaron Wojciechowski,R,D,0.487346,Likely R


In [59]:
# STATEFP '25' rows are the ones matched by district name instead of GEOID
upper_shp.loc[upper_shp['STATEFP'] == '25']

Unnamed: 0,STATEFP,SLDUST,AFFGEOID,GEOID,NAME,LSAD,LSY,ALAND,AWATER,geometry,DISTRICT,NOM_R,NOM_D,INCUMBENT,ANTI_GERRY_PARTY,VOTER_POWER,LEAN
610,25,32,610U600US25032,25032,"Norfolk, Bristol & Plymouth",7,2018,310027353,11502659,"POLYGON ((-71.19348 42.09751, -71.18880 42.100...","MA-SD-Norfolk, Bristol & Plymouth",False,False,D,R,0.003757,Safe D
616,25,39,610U600US25039,25039,Plymouth & Barnstable,7,2018,684946160,622397074,"MULTIPOLYGON (((-70.63965 42.01080, -70.63639 ...",MA-SD-Plymouth & Barnstable,False,False,D,R,0.0001,Tilt D
629,25,30,610U600US25030,25030,Norfolk & Suffolk,7,2018,160873306,4172289,"POLYGON ((-71.34166 42.22192, -71.34101 42.224...",MA-SD-Norfolk & Suffolk,False,False,D,R,0.003637,Safe D
913,25,6,610U600US25006,25006,"Hampshire, Franklin & Worcester",7,2018,1679268589,78349072,"POLYGON ((-72.77926 42.73575, -72.68617 42.733...","MA-SD-Hampshire, Franklin & Worcester",False,False,D,R,0.003456,Safe D
914,25,20,610U600US25020,25020,First Essex & Middlesex,7,2018,705022591,463616100,"MULTIPOLYGON (((-70.58029 42.63602, -70.57509 ...",MA-SD-First Essex & Middlesex,False,False,R,R,5e-06,Safe R
915,25,40,610U600US25040,25040,Cape & Islands,7,2018,1077398767,3889421769,"MULTIPOLYGON (((-70.23405 41.28565, -70.22361 ...",MA-SD-Cape & Islands,False,False,D,R,0.003204,Safe D
916,25,36,610U600US25036,25036,First Plymouth & Bristol,7,2018,749270390,84583201,"POLYGON ((-71.20346 41.88184, -71.18679 41.879...",MA-SD-First Plymouth & Bristol,False,False,D,R,0.00326,Safe D
917,25,12,610U600US25012,25012,Worcester & Norfolk,7,2018,671304442,27773825,"POLYGON ((-72.07443 42.05846, -72.06642 42.072...",MA-SD-Worcester & Norfolk,False,False,R,R,1.2e-05,Safe R
918,25,1,610U600US25001,25001,First Suffolk,7,2018,33403472,119112810,"MULTIPOLYGON (((-70.93091 42.32160, -70.93025 ...",MA-SD-First Suffolk,False,False,D,R,0.003109,Safe D
919,25,5,610U600US25005,25005,Second Hampden & Hampshire,7,2018,705719470,21875171,"POLYGON ((-73.07484 42.10615, -73.07305 42.106...",MA-SD-Second Hampden & Hampshire,False,False,D,R,0.000196,Lean D


In [72]:
# choose columns you want in output
upper_shp = upper_shp[['STATEFP', 'GEOID', 'DISTRICT', 'NOM_R', 'NOM_D', 'INCUMBENT','ANTI_GERRY_PARTY', 'LEAN', 'VOTER_POWER', 'geometry']]	

upper_shp['NOM_R'].replace({'FALSE': ''}, inplace =True)
upper_shp['NOM_D'].replace({'FALSE': ''}, inplace =True)

In [73]:
# same filter as before, now on the trimmed column set
upper_shp.loc[upper_shp['VOTER_POWER'] != 0]

Unnamed: 0,STATEFP,GEOID,DISTRICT,NOM_R,NOM_D,INCUMBENT,ANTI_GERRY_PARTY,LEAN,VOTER_POWER,geometry
5,45,45027,SC-SD-27,Penry Gustafson,Vincent Sheheen,D,D,Likely D,0.052766,"POLYGON ((-80.89719 34.49168, -80.89565 34.493..."
7,45,45026,SC-SD-26,TBA,Nikki Setzler,D,D,Safe D,0.019814,"POLYGON ((-81.76577 33.62673, -81.76369 33.628..."
11,13,13043,GA-SD-43,Melanie Williams,Tonya Anderson,D,D,Safe D,0.001368,"POLYGON ((-84.18805 33.65409, -84.18391 33.655..."
12,13,13017,GA-SD-17,Brian Strickland,Kelly Rose,R,D,Likely R,0.244580,"POLYGON ((-84.35419 33.35336, -84.35418 33.377..."
17,27,27028,MN-SD-28,Jeremy Miller,Sarah Kruger,R,R,Safe R,4.197868,"POLYGON ((-92.44957 43.67444, -92.44953 43.682..."
...,...,...,...,...,...,...,...,...,...,...
1947,09,09011,CT-SD-11,,,D,R,Safe D,0.153386,"POLYGON ((-72.96324 41.43941, -72.95792 41.445..."
1948,09,09022,CT-SD-22,,,D,R,Safe D,0.261162,"POLYGON ((-73.27603 41.30066, -73.27617 41.301..."
1949,09,09015,CT-SD-15,,,D,R,Safe D,0.149968,"POLYGON ((-73.14504 41.55782, -73.13088 41.560..."
1952,55,55018,WI-SD-18,Dan Feyen,Aaron Wojciechowski,R,D,Likely R,0.487346,"POLYGON ((-88.88634 44.04529, -88.88604 44.066..."


In [75]:
# number of districts with a non-zero VOTER_POWER
int((upper_shp['VOTER_POWER'] != 0).sum())

399

In [74]:
# create the output folder if needed so the write does not fail on a fresh checkout
out_dir.mkdir(parents=True, exist_ok=True)
upper_shp.to_file(out_dir / "upper_state_moneyball.geojson", driver="GeoJSON")

Previous lower geojson size: 46.8 MB
Previous upper geojson size: 32.7 MB