In [3]:

import shapefile
from collections import OrderedDict
import pandas as pd 
import csv

In [None]:
### Process moneyball data: derive GEOID and chamber columns from each district ID

In [29]:
# ******* update to include other input and output cases *********

def get_state_code(input, FIPS=True, TwoDigit=False, Full=False):
    if (FIPS + TwoDigit + Full != 1):
        raise ValueError("Exactly one format argument must be True. Default = FIPS")

    code_hash = {
        'AL': {'full': 'Alabama', 'two_digit': 'AL', 'FIPS': 1},
        'AK': {'full': 'Alaska', 'two_digit': 'AK', 'FIPS': 2},
        'AZ': {'full': 'Arizona', 'two_digit': 'AZ', 'FIPS': 4},
        'AR': {'full': 'Arkansas', 'two_digit': 'AR', 'FIPS': 5},
        'CA': {'full': 'California', 'two_digit': 'CA', 'FIPS': 6},
        'CO': {'full': 'Colorado', 'two_digit': 'CO', 'FIPS': 8},
        'CT': {'full': 'Connecticut', 'two_digit': 'CT', 'FIPS': 9},
        'DE': {'full': 'Delaware', 'two_digit': 'DE', 'FIPS': 10},
        'DC': {'full': 'District of Columbia', 'two_digit': 'DC', 'FIPS': 11},
        'FL': {'full': 'Florida', 'two_digit': 'FL', 'FIPS': 12},
        'GA': {'full': 'Georgia', 'two_digit': 'GA', 'FIPS': 13},
        'HI': {'full': 'Hawaii', 'two_digit': 'HI', 'FIPS': 15},
        'ID': {'full': 'Idaho', 'two_digit': 'ID', 'FIPS': 16},
        'IL': {'full': 'Illinois', 'two_digit': 'IL', 'FIPS': 17},
        'IN': {'full': 'Indiana', 'two_digit': 'IN', 'FIPS': 18},
        'IA': {'full': 'Iowa', 'two_digit': 'IA', 'FIPS': 19},
        'KS': {'full': 'Kansas', 'two_digit': 'KS', 'FIPS': 20},
        'KY': {'full': 'Kentucky', 'two_digit': 'KY', 'FIPS': 21},
        'LA': {'full': 'Louisiana', 'two_digit': 'LA', 'FIPS': 22},
        'ME': {'full': 'Maine', 'two_digit': 'ME', 'FIPS': 23},
        'MD': {'full': 'Maryland', 'two_digit': 'MD', 'FIPS': 24},
        'MA': {'full': 'Massachusetts', 'two_digit': 'MA', 'FIPS': 25},
        'MI': {'full': 'Michigan', 'two_digit': 'MI', 'FIPS': 26},
        'MN': {'full': 'Minnesota', 'two_digit': 'MN', 'FIPS': 27},
        'MS': {'full': 'Mississippi', 'two_digit': 'MS', 'FIPS': 28},
        'MO': {'full': 'Missouri', 'two_digit': 'MO', 'FIPS': 29},
        'MT': {'full': 'Montana', 'two_digit': 'MT', 'FIPS': 30},
        'NE': {'full': 'Nebraska', 'two_digit': 'NE', 'FIPS': 31},
        'NV': {'full': 'Nevada', 'two_digit': 'NV', 'FIPS': 32},
        'NH': {'full': 'New Hampshire', 'two_digit': 'NH', 'FIPS': 33},
        'NJ': {'full': 'New Jersey', 'two_digit': 'NJ', 'FIPS': 34},
        'NM': {'full': 'New Mexico', 'two_digit': 'NM', 'FIPS': 35},
        'NY': {'full': 'New York', 'two_digit': 'NY', 'FIPS': 36},
        'NC': {'full': 'North Carolina', 'two_digit': 'NC', 'FIPS': 37},
        'ND': {'full': 'North Dakota', 'two_digit': 'ND', 'FIPS': 38},
        'OH': {'full': 'Ohio', 'two_digit': 'OH', 'FIPS': 39},
        'OK': {'full': 'Oklahoma', 'two_digit': 'OK', 'FIPS': 40},
        'OR': {'full': 'Oregon', 'two_digit': 'OR', 'FIPS': 41},
        'PA': {'full': 'Pennsylvania', 'two_digit': 'PA', 'FIPS': 42},
        'RI': {'full': 'Rhode Island', 'two_digit': 'RI', 'FIPS': 44},
        'SC': {'full': 'South Carolina', 'two_digit': 'SC', 'FIPS': 45},
        'SD': {'full': 'South Dakota', 'two_digit': 'SD', 'FIPS': 46},
        'TN': {'full': 'Tennessee', 'two_digit': 'TN', 'FIPS': 47},
        'TX': {'full': 'Texas', 'two_digit': 'TX', 'FIPS': 48},
        'UT': {'full': 'Utah', 'two_digit': 'UT', 'FIPS': 49},
        'VT': {'full': 'Vermont', 'two_digit': 'VT', 'FIPS': 50},
        'VA': {'full': 'Virginia', 'two_digit': 'VA', 'FIPS': 51},
        'WA': {'full': 'Washington', 'two_digit': 'WA', 'FIPS': 53},
        'WV': {'full': 'West Virginia', 'two_digit': 'WV', 'FIPS': 54},
        'WI': {'full': 'Wisconsin', 'two_digit': 'WI', 'FIPS': 55},
        'WY': {'full': 'Wyoming', 'two_digit': 'WY', 'FIPS': 56},
        'PR': {'full': 'Puerto Rico', 'two_digit': 'PR', 'FIPS': 72}
    }

    if FIPS:
        return code_hash[input]['FIPS']
    
    return None



In [43]:
def getGEOID(district_str, leading_zero = False):
    state, chamber, dist_num = district_str.split('-')
    GEOID = str(get_state_code(state)) 
    if leading_zero and len(GEOID) < 2: 
        GEOID= '0' + GEOID
    while len(dist_num) <3: 
        dist_num = '0' + dist_num
    GEOID =  GEOID + dist_num
    return GEOID

In [55]:
def getChamber(district_str):
    state, chamber, dist_num = district_str.split('-')
    return chamber

In [56]:
def process_moneyball_data(inPath, outPath):
    raw = pd.read_csv(inPath)

In [57]:
# Load the input CSV; the path is relative, so the file must sit in the working directory.
df = pd.read_csv("dummy_data.csv")

In [58]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,state,district,incumbent,favored,confidence,nom_R,nom_D,nom_I,turnout_cvap,VOTER_POWER
0,CT,CT-HD-1,D,D,Safe,False,False,False,8256,-7.48e-07
1,CT,CT-HD-2,D,D,Lean,False,False,False,9538,-2.77e-06
2,CT,CT-HD-3,D,D,Safe,False,False,False,7240,-8.53e-07
3,CT,CT-HD-4,D,D,Safe,False,False,False,8519,-7.25e-07
4,CT,CT-HD-5,D,D,Safe,False,False,False,8851,-6.98e-07


In [59]:
lambdafunc = lambda x: pd.Series(
    [getGEOID(x['district']),
    getChamber(x['district'])]
)

In [60]:
# Spot check: with the default leading_zero=False, a single-digit FIPS prefix is not zero-padded.
getGEOID('CT-HD-5')

'9005'

In [61]:
df [['GEOID', 'chamber']] = df.apply(lambdafunc, axis = 1)

In [62]:
# Confirm the GEOID and chamber columns are now present.
df.head()

Unnamed: 0,state,district,incumbent,favored,confidence,nom_R,nom_D,nom_I,turnout_cvap,VOTER_POWER,GEOID,chamber
0,CT,CT-HD-1,D,D,Safe,False,False,False,8256,-7.48e-07,9001,HD
1,CT,CT-HD-2,D,D,Lean,False,False,False,9538,-2.77e-06,9002,HD
2,CT,CT-HD-3,D,D,Safe,False,False,False,7240,-8.53e-07,9003,HD
3,CT,CT-HD-4,D,D,Safe,False,False,False,8519,-7.25e-07,9004,HD
4,CT,CT-HD-5,D,D,Safe,False,False,False,8851,-6.98e-07,9005,HD


In [63]:
# Display the whole frame; pandas elides the middle rows in the rendered output.
df

Unnamed: 0,state,district,incumbent,favored,confidence,nom_R,nom_D,nom_I,turnout_cvap,VOTER_POWER,GEOID,chamber
0,CT,CT-HD-1,D,D,Safe,FALSE,FALSE,FALSE,8256,-7.480000e-07,9001,HD
1,CT,CT-HD-2,D,D,Lean,FALSE,FALSE,FALSE,9538,-2.770000e-06,9002,HD
2,CT,CT-HD-3,D,D,Safe,FALSE,FALSE,FALSE,7240,-8.530000e-07,9003,HD
3,CT,CT-HD-4,D,D,Safe,FALSE,FALSE,FALSE,8519,-7.250000e-07,9004,HD
4,CT,CT-HD-5,D,D,Safe,FALSE,FALSE,FALSE,8851,-6.980000e-07,9005,HD
...,...,...,...,...,...,...,...,...,...,...,...,...
950,NV,NV-SD-6,D,D,Lean,April Becker,Nicole Cannizzaro,FALSE,48662,-1.660000e-06,32006,SD
951,NV,NV-SD-11,D,D,Safe,Joshua Dowden,Dallas Harris,FALSE,45728,-1.420000e-06,32011,SD
952,NV,NV-SD-15,R,R,Lean,Heidi S. Gansert,Wendy Jauregui-Jackins,Catana Barnes,48583,-2.150000e-07,32015,SD
953,NV,NV-SD-18,R,R,Likely,Scott Hammond,TBA,FALSE,52294,-8.620000e-08,32018,SD


In [66]:
# Persist the processed frame. index=False omits the row index, and
# float_format='%.16f' writes floats in fixed-point with 16 decimal places.
df.to_csv('./processed_data.csv', index=False, float_format='%.16f')

In [70]:
    # read in moneyball data
    df = pd.read_csv('./processed_data.csv')

    # segment to upper and lower chamber
    upper_df = df[df['chamber'] == 'SD']
    lower_df = df[df['chamber'] == 'HD']

Unnamed: 0,state,district,incumbent,favored,confidence,nom_R,nom_D,nom_I,turnout_cvap,VOTER_POWER,GEOID,chamber
0,CT,CT-HD-1,D,D,Safe,FALSE,FALSE,FALSE,8256,-7.480000e-07,9001,HD
1,CT,CT-HD-2,D,D,Lean,FALSE,FALSE,FALSE,9538,-2.770000e-06,9002,HD
2,CT,CT-HD-3,D,D,Safe,FALSE,FALSE,FALSE,7240,-8.530000e-07,9003,HD
3,CT,CT-HD-4,D,D,Safe,FALSE,FALSE,FALSE,8519,-7.250000e-07,9004,HD
4,CT,CT-HD-5,D,D,Safe,FALSE,FALSE,FALSE,8851,-6.980000e-07,9005,HD
...,...,...,...,...,...,...,...,...,...,...,...,...
943,NV,NV-HD-37,D,D,Tilt,TBA,Shea Backus,FALSE,26156,-2.710000e-07,32037,HD
944,NV,NV-HD-39,R,R,Safe,Jim Wheeler,Deborah Chang,Dave Jones,25240,-4.960000e-09,32039,HD
945,NV,NV-HD-40,FALSE,R,Safe,TBA,TBA,FALSE,23507,-5.330000e-09,32040,HD
946,NV,NV-HD-41,D,D,Likely,Erika Smith,Sandra Jauregui,Victoria DaCosta,27643,-4.330000e-07,32041,HD


In [73]:
# Open the shapefile under ./raw/UPPER_* (path is relative to the working directory).
# NOTE(review): presumably the upper-chamber district boundaries that upper_df is matched against — confirm.
r = shapefile.Reader('./raw/UPPER_cb_2019_us_sldu_500k/cb_2019_us_sldu_500k.shp')

# List the attribute-table field descriptors; the output shows GEOID as a 5-character 'C' field.
r.fields


[('DeletionFlag', 'C', 1, 0),
 ['STATEFP', 'C', 2, 0],
 ['SLDUST', 'C', 3, 0],
 ['AFFGEOID', 'C', 14, 0],
 ['GEOID', 'C', 5, 0],
 ['NAME', 'C', 100, 0],
 ['LSAD', 'C', 2, 0],
 ['LSY', 'C', 4, 0],
 ['ALAND', 'N', 14, 0],
 ['AWATER', 'N', 14, 0]]