In [52]:
import numpy as np
import pickle
import cartopy.io.shapereader as shpreader
from cartopy.feature import ShapelyFeature
from shapely.prepared import prep
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import pandas as pd

In [53]:
# Projection is lat/lon (unprojected/cylindrical equidistant)
# The proj.4 string:
# +proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs

In [54]:
# some useful arrays to translate between naming conventions

election_year_list = np.array([1992, 1994, 1996, 1998, 2000, 2002, 2004, 2006, 2008, 2010, 2012, 
                                   2014, 2016, 2018])
congress_ID_list = np.array([103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116])

state_names = np.array(['ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA', 
               'COLORADO', 'CONNECTICUT', 'DELAWARE', 'FLORIDA', 'GEORGIA', 
               'HAWAII', 'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS', 
               'KENTUCKY', 'LOUISIANA', 'MAINE', 'MARYLAND', 'MASSACHUSETTS', 
               'MICHIGAN', 'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'MONTANA', 
               'NEBRASKA', 'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY', 'NEW MEXICO', 
               'NEW YORK', 'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO', 'OKLAHOMA', 
               'OREGON', 'PENNSYLVANIA', 'RHODE ISLAND', 'SOUTH CAROLINA', 
               'SOUTH DAKOTA', 'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT', 
               'VIRGINIA', 'WASHINGTON', 'WEST VIRGINIA', 'WISCONSIN', 'WYOMING'])

state_abbrs = np.array(['AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL',
              'IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT',
              'NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI',
              'SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY'])

state_fips = np.array([1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 
                       24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 
                       41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56])

In [55]:
def read_shapefiles(election_years,verbose=True):
    """
    Reads in shapefiles from UCLA database (pre-2016) US Census TigerLine files (2016 on).
    data sources: http://cdmaps.polisci.ucla.edu/
                  https://www.census.gov/geo/maps-data/data/cbf/cbf_cds.html
    Note that file names must be in the same folder as the code with the following format:
        -pre-2016: districtShapes-NNN/districtsNNN.shp
        -2016 on: must be named t1_YYYY_us_cdNNN/tl_YYYY_us_cdNNN.shp
    ...where NNN is congress ID and YYYY is election year
    
    input:
        election_years -- (list) Elections years of the shape files you want to read. Even years only. 
        verbose -- (bool) set to True if you want it to print every file as it reads it in
    output:
        district_df -- pandas data frame with index in format ST_00_YYYY (e.g. AL_01_2018) with 
                       shapefiles stored in a column named 'shape'.
    """
    # read in the standard dictionary 
    district_df = pickle.load(open('../Datasets/master_index.p','rb'))
    district_df['shape'] = [np.nan]*district_df.shape[0] # make a blank column
    district_df['shape'] = district_df['shape'].astype(object) # reassign to object so it can hold shapely stuff

    for election_year in election_years:
        # convert election year to "Nth Congress" 
        congress_ID = congress_ID_list[election_year_list==election_year][0]

        # pre-2016 files come from http://cdmaps.polisci.ucla.edu/
        # they are a bit cleaner
        if election_year < 2016:
            # read in the shapefile (must be named 'districtsN.shp' in a folder titled 'districtShapesN')
            shpfilename = 'districtShapes-{0}/districts{0}.shp'.format(congress_ID)
            reader = shpreader.Reader(shpfilename) 
            districts = reader.records() # get full records
            geometries = reader.geometries() # get just the shape
            
            # put the shapefiles into the standard dictionary
            for record in reader.records(): # loop over districts
                attr = record.attributes # dictionary of information about the district
                poly = record.geometry # coordinates of the district as a shapely polygon
                
                # 1) get the state abbr of the district
                if any(state_names==attr['STATENAME'].upper()): # filter out districts that aren't in states
                    ST = state_abbrs[state_names==attr['STATENAME'].upper()][0]    
                else: # pretty much just Washington, DC
                    print('{} is not a state.'.format(attr['STATENAME'].upper()))
                    continue
                # 2) get the id of the district
                id_int = int(attr['DISTRICT'])
                if id_int == 0: # change at-large district ID from 0 to 1 to play nice with our indexing convention.
                    id_int = 1 # todo: do we want to change this back? Looks like 0 for at-large districts is the convention. 
                ID = '{0:02d}'.format(id_int)
                # 3) reformat the polygon if it intersects with itself
                if not poly.is_valid: 
                    # if it does, use buffer to correct this
                    if verbose:
                        print('The following polygons intersected with themselves. Attempting to buffer shape...')
                        print(ind)
                    poly = poly.buffer(0)
                # 4) put it all together into an index
                ind = '{}_{}_{}'.format(ST, ID, election_year)
                if verbose:
                    print('{} was read in.'.format(ind))
                # put the polygon in the dictionary
                district_df.at[ind,'shape'] = poly
            
        # 2016 and later are US Census Bureau Tiger Line Files 
        # (and they're a bit messier)
        if election_year >= 2016:
            # read in the shapefile (must be named 'tl_YYYY_us_cdN.shp' in a folder titled 't1_YYYY_us_cdN')
            shpfilename = 'tl_{1}_us_cd{0}/tl_{1}_us_cd{0}.shp'.format(congress_ID,election_year)
            reader = shpreader.Reader(shpfilename) 
            districts = reader.records() # get full records
            geometries = reader.geometries() # get just the shape

            # put the shapefiles into the standard dictionary
            for record in reader.records(): # loop over districts
                attr = record.attributes # dictionary of information about the district
                poly = record.geometry # coordinates of the district as a shapely polygon

                # check if polygon intersects with itself
                if not poly.is_valid: 
                    # if it does, use buffer to correct this
                    if verbose: 
                        print('The following polygons intersected with themselves. Attempting to buffer shape...')
                        print(ind)
                    poly = poly.buffer(0)
                
                # 1) get the state abbr of the district
                if any(state_fips==int(attr['STATEFP'])): # filter out districts that aren't in states
                    ST = state_abbrs[state_fips==int(attr['STATEFP'])][0]
                else: # pretty much just Washington, DC
                    print('{} is not a state.'.format(int(attr['STATEFP'])))
                    continue
                # 2) get the id of the district
                if (attr['CD{}FP'.format(congress_ID)]=='ZZ'):
                    print('District ID is ZZ, which indicates this is not a congressional district.')
                    continue
                else:
                    id_int = int(attr['CD{}FP'.format(congress_ID)])
                if id_int == 0: # change at-large district ID from 0 to 1 to play nice with our indexing convention.
                    # todo: do we want to change this back? Looks like 0 for at-large districts is the convention. 
                    id_int = 1
                ID = '{0:02d}'.format(id_int)
                # 3) reformat the polygon if it intersects with itself
                if not poly.is_valid: 
                    # if it does, use buffer to correct this
                    if verbose:
                        print('The following polygons intersected with themselves. Attempting to buffer shape...')
                        print(ind)
                    poly = poly.buffer(0)
                # 4) put it all together into an index
                ind = '{}_{}_{}'.format(ST, ID, election_year)
                if verbose:
                    print('{} was read in.'.format(ind))
                # put the polygon in the dictionary
                district_df.at[ind,'shape'] = poly
    
    return district_df

In [56]:
# compute overlap percent between this district and last year's districts
def district_overlap(this_year, district_df, threshold_for_change=0.1):
    """
    Finds the fractional overlap between this year's district and the previous year's districts. 
    i.e., if a Florida's 1st district has changed its borders from 2014 to 2016, it may be made up of:
        20% of its area may come from district 2 in 2014
        40% of its area may come from district 3 in 2014
        10% of its area may come from district 4 in 2014
    
    input:
        this_year -- (int) Elections years of the districts you want to check. Even years only. 
        district_df -- (pd dataframe) dataframe with default indices and shapefiles stored in 
                       a column named 'shape'
    output:
        district_df -- pandas data frame with index in format ST_00_YYYY (e.g. AL_01_2018) 
                       
                       fractional overlap stored in a column called 'fractional_overlap'.
                       Each district has a dictionary where the keys are the indicies of the previous
                       districts which overlap with our district, and the values are the fractional overlap.
                       
                       change from last year stored in a column called 'border_change'.
                       Statuses:
                           0 - this district has not changed at all since the previous year
                           1 - this district has either changed area > threshold since 
                               the previous year, or it is new.
    """
    # add a column
    if 'overlap_frac' not in district_df.columns:
        district_df['overlap_frac'] = [np.nan]*district_df.shape[0] # make a blank column
        district_df['overlap_frac'] = district_df['overlap_frac'].astype(object) # reassign to object so it can hold shapely stuff

    if 'border_change' not in district_df.columns:
        district_df['border_change'] = [np.nan]*district_df.shape[0] # make a blank column    
    
    # loop over states so you only have to compare districts in-state
    # otherwise, comparing each district to 434 other districts would be super slow
    for ST in state_abbrs: 
        prev_year = this_year-2
        # get the relevant districts
        districts = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==this_year)]
        districts_prev = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==prev_year)]

        for ind,district in districts.iterrows(): # loop over districts in your current year
            overlap_dict = {}
            shape = district['shape']
            area = shape.area # area of this district
            
            # check if shapes intersect with themselves
            if not shape.is_valid: 
                # if they do, use buffer to correct this
                print('The following polygons intersected with themselves. Attempting to buffer shape...')
                print(ind)
                shape = shape.buffer(0)
                    
            for ind_prev,district_prev in districts_prev.iterrows(): # loop over districts in previous year
                shape_prev = district_prev['shape']
                
                # check if shapes intersect with themselves
                if not shape_prev.is_valid:
                    # if they do, use buffer to correct this
                    print('The following polygons intersected with themselves. Attempting to buffer shape...')
                    print(ind_prev)
                    shape_prev = shape_prev.buffer(0)

                # calculate frac overlap
                area_prev = shape_prev.area
                overlap_area = shape.intersection(shape_prev).area # area of overlap between shape and shape_prev
                frac_overlap = np.around(overlap_area/area,decimals=3) # fractional overlap between new and old district
                                                                       # round to the thousandth

                if frac_overlap > 10**-3: # use threshold of 0.1% to avoid trivial changes
                    overlap_dict[ind_prev] = frac_overlap

            # make sure areas add up to 1.
            dict_sum = np.sum(list(overlap_dict.values()))
            if not np.isclose(dict_sum,1.,rtol=1e-03):
                # rescale so they add up to 1.
                print(overlap_dict)
                print('rescaled')
                print(dict_sum)
                overlap_dict = {key : np.around(val/dict_sum, decimals=3) 
                                for key,val in overlap_dict.items() if val/dict_sum > 10**-3}
                    
            print(ind)
            print(overlap_dict)
            district_df.at[ind, 'overlap_frac'] = overlap_dict

            # calculate the status of the new district
            max_frac = np.max(list(overlap_dict.values()))
            ind_prev = '{}{}'.format(ind[:6],prev_year) # same district, different year
            if ind_prev not in districts_prev.index: # the district is new this year
                print('new')
                district_df.at[ind, 'border_change'] = 1
            elif ind_prev not in overlap_dict.keys(): # the district existed last year, but is in a totally new location
                print('totally moved')
                district_df.at[ind, 'border_change'] = 1
            elif 1.-overlap_dict[ind_prev] > threshold_for_change: # the district existed last year, but the borders have moved
                print(1.-overlap_dict[ind_prev])
                print('changed')
                district_df.at[ind, 'border_change'] = 1
            else: # the district is essentially the same as last year
                print('same')
                district_df.at[ind, 'border_change'] = 0
            
    return district_df

In [57]:
def get_centroid(district_df):
    """
    Finds the centroid of a district in lon,lat. 
    
    input: 
        district_df -- (pd dataframe) dataframe with default indices and shapefiles stored in 
                       a column named 'shape'
    output:
        district_df -- pandas data frame with index in format ST_00_YYYY (e.g. AL_01_2018) 
                       with centroid stored in a column called 'centroid'.
                       The centroid is calculated as a lon,lat on a Cartesian plane.
                       It ignores spherical geometry. 
                       The centroid is stored as a tuple in the form (lon,lat)
    """
    # add column
    district_df['centroid'] = [np.nan]*district_df.shape[0] # make a blank column
    district_df['centroid'] = district_df['centroid'].astype(object) # reassign to object so it can hold shapely stuff
    
    for ind, district in district_df.iterrows():
        shape = district['shape']
        if pd.isnull(district['shape']): # if there's no shape, fill it with a nan
            district_df.at[ind, 'centroid'] = np.nan
        else:
            centroid = shape.centroid.coords
            district_df.at[ind, 'centroid'] = centroid # in units of lat/lon
            
    return district_df

In [58]:
# compute overlap percent between this district and last year's districts
def population_overlap(this_year, district_df, threshold_for_change=0.1):
    """
    Estimates percent of population coming from previous district boundaries using 
    the equation:
    
    population_overlap = (overlap_area / area) * (1 / area_prev)
    
    Where overlap_area is the area of overlap between the new and old districts, 
    area is the area of the new district, and area_prev is the area of the previous 
    district. 
    
    This equation assumes that the voting population in each district is constant, 
    but area changes. Although each district has roughly 711,000 people, variations in 
    voter turnout between districts means that this assumption is false. 
    
    density_weighted_overlap is calculated for each previous district which intersects with 
    the new district. density_weigted_overlap is then scaled so that the sum of all 
    previous districts which intersect with the new district is 1. 
    
    input:
        this_year -- (int) Elections years of the districts you want to check. Even years only. 
        district_df -- (pd dataframe) dataframe with default indices and shapefiles stored in 
                       a column named 'shape'
    output:
        district_df -- pandas data frame with index in format ST_00_YYYY (e.g. AL_01_2018) 
                       
                       population_overlap is stored in a column called 'population_overlap'.
                       Each district has a dictionary where the keys are the indicies of the previous
                       districts which overlap with our district, and the values are the population overlap.
                       
    """
    
    # add a column
    if 'population_overlap' not in district_df.columns:
        district_df['population_overlap'] = [np.nan]*district_df.shape[0] # make a blank column
        district_df['population_overlap'] = district_df['population_overlap'].astype(object) # reassign to object so it can hold shapely stuff

    # loop over states so you only have to compare districts in-state
    # otherwise, comparing each district to 434 other districts would be super slow
    for ST in state_abbrs: 
        prev_year = this_year-2
        # get the relevant districts
        districts = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==this_year)]
        districts_prev = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==prev_year)]

        for ind,district in districts.iterrows(): # loop over districts in your current year
            overlap_dict = {}
            shape = district['shape']
            area = shape.area # area of this district
            
            # check if shapes intersect with themselves
            if not shape.is_valid: 
                # if they do, use buffer to correct this
                print('The following polygons intersected with themselves. Attempting to buffer shape...')
                print(ind)
                shape = shape.buffer(0)
                    
            for ind_prev,district_prev in districts_prev.iterrows(): # loop over districts in previous year
                shape_prev = district_prev['shape']
                
                # check if shapes intersect with themselves
                if not shape_prev.is_valid:
                    # if they do, use buffer to correct this
                    print('The following polygons intersected with themselves. Attempting to buffer shape...')
                    print(ind_prev)
                    shape_prev = shape_prev.buffer(0)

                # calculate frac overlap
                area_prev = shape_prev.area
                density = 1./area_prev # assume population roughly the same, but area changes
                overlap_area = shape.intersection(shape_prev).area # area of overlap between shape and shape_prev
                frac_overlap = overlap_area/area # fractional overlap between new and old district
                                                                       # round to the thousandth
                
                if frac_overlap > 10**-3: # use threshold of 0.1% to avoid trivial changes
                    population_overlap = frac_overlap*density
                    overlap_dict[ind_prev] = population_overlap

            # make sure areas add up to 1.
            dict_sum = np.sum(list(overlap_dict.values()))
            # rescale so they add up to 1.
            overlap_dict = {key : np.around(val/dict_sum, decimals=3) 
                            for key,val in overlap_dict.items() if val/dict_sum > 10**-3}

            print(ind)
            print(overlap_dict)
            district_df.at[ind, 'population_overlap'] = overlap_dict
            
    return district_df

In [59]:
def inverse_population_overlap(this_year, district_df):
    """
    Exactly the same as population_overlap, except it gives each district_prev a dictionary
    where the keys are current year's districts, and the values are the estimated 
    population overlap of district_prev and district. 
    
    This allows you to impute values from (year) into (year-2) in your dataset. 
    
    """
    
    # add a column
    if 'inverse_population_overlap' not in district_df.columns:
        district_df['inverse_population_overlap'] = [np.nan]*district_df.shape[0] # make a blank column
        district_df['inverse_population_overlap'] = district_df['inverse_population_overlap'].astype(object) # reassign to object so it can hold shapely stuff
        
    # loop over states so you only have to compare districts in-state
    # otherwise, comparing each district to 434 other districts would be super slow
    for ST in state_abbrs: 
        prev_year = this_year-2
        # get the relevant districts
        districts = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==this_year)]
        districts_prev = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==prev_year)]

        for ind_prev,district_prev in districts_prev.iterrows(): # loop over districts in previous year
            overlap_dict = {}
            shape_prev = district_prev['shape']
            area_prev = shape_prev.area # area of this district
            
            # check if shapes intersect with themselves
            if not shape_prev.is_valid: 
                # if they do, use buffer to correct this
                print('The following polygons intersected with themselves. Attempting to buffer shape...')
                print(ind_prev)
                shape_prev = shape_prev.buffer(0)
                    
            for ind,district in districts.iterrows(): # loop over districts in your current year
                shape = district['shape']
                
                # check if shapes intersect with themselves
                if not shape.is_valid:
                    # if they do, use buffer to correct this
                    print('The following polygons intersected with themselves. Attempting to buffer shape...')
                    print(ind)
                    shape = shape.buffer(0)

                # calculate frac overlap
                area = shape.area # area of current district
                density = 1./area # assume population roughly the same, but area changes
                overlap_area = shape.intersection(shape_prev).area # area of overlap between shape and shape_prev
                frac_overlap = overlap_area/area_prev # fractional overlap between new and old district
                                                                       # round to the thousandth
                
                if frac_overlap > 10**-3: # use threshold of 0.1% to avoid trivial changes
                    population_overlap = frac_overlap*density
                    overlap_dict[ind] = population_overlap

            # make sure areas add up to 1.
            dict_sum = np.sum(list(overlap_dict.values()))
            # rescale so they add up to 1.
            overlap_dict = {key : np.around(val/dict_sum, decimals=3) 
                            for key,val in overlap_dict.items() if val/dict_sum > 10**-3}

            print(ind_prev)
            print(overlap_dict)
            district_df.at[ind_prev, 'inverse_population_overlap'] = overlap_dict
            
    return district_df

In [63]:
years_to_read = [2006, 2008, 2010]
years_to_check = [2008, 2010]

inv_df = read_shapefiles(years_to_read, verbose=False)
for year in years_to_check:
    inv_df = inverse_population_overlap(year, inv_df)
    pickle.dump(inv_df.drop('shape', axis=1), open('inv_pop_frac_{}.p'.format(year), 'wb'))
#pickle.dump(inv_df.drop('shape',axis=1), open('inv_pop_frac.p', 'wb'))

DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
AL_01_2008
{'AL_01_2010': 1.0}
AL_02_2008
{'AL_02_2010': 1.0}
AL_03_2008
{'AL_03_2010': 1.0}
AL_04_2008
{'AL_04_2010': 1.0}
AL_05_2008
{'AL_05_2010': 1.0}
AL_06_2008
{'AL_06_2010': 1.0}
AL_07_2008
{'AL_07_2010': 1.0}
AK_01_2008
{'AK_01_2010': 1.0}
AZ_01_2008
{'AZ_01_2010': 1.0}
AZ_02_2008
{'AZ_02_2010': 1.0}
AZ_03_2008
{'AZ_03_2010': 1.0}
AZ_04_2008
{'AZ_04_2010': 1.0}
AZ_05_2008
{'AZ_05_2010': 1.0}
AZ_06_2008
{'AZ_06_2010': 1.0}
AZ_07_2008
{'AZ_07_2010': 1.0}
AZ_08_2008
{'AZ_08_2010': 1.0}
AR_01_2008
{'AR_01_2010': 1.0}
AR_02_2008
{'AR_02_2010': 1.0}
AR_03_2008
{'AR_03_2010': 1.0}
AR_04_2008
{'AR_04_2010': 1.0}
CA_01_2008
{'CA_01_2010': 1.0}
CA_02_2008
{'CA_02_2010': 1.0}
CA_03_2008
{'CA_03_2010': 1.0}
CA_04_2008
{'CA_04_2010': 1.0}
CA_05_2008
{'CA_05_2010': 1.0}
CA_06_2008
{'CA_06_2010': 1.0}
CA_07_2008
{'CA_07_2010': 1.0}
CA_08_2008
{'CA_08_2010': 1.0}
CA_09_2008
{'CA_09_2010': 1.0}
CA_10_2008
{'CA_10_2010': 1

NY_04_2008
{'NY_04_2010': 1.0}
NY_05_2008
{'NY_05_2010': 1.0}
NY_06_2008
{'NY_06_2010': 1.0}
NY_07_2008
{'NY_07_2010': 1.0}
NY_08_2008
{'NY_08_2010': 1.0}
NY_09_2008
{'NY_09_2010': 1.0}
NY_10_2008
{'NY_10_2010': 1.0}
NY_11_2008
{'NY_11_2010': 1.0}
NY_12_2008
{'NY_12_2010': 1.0}
NY_13_2008
{'NY_13_2010': 1.0}
NY_14_2008
{'NY_14_2010': 1.0}
NY_15_2008
{'NY_15_2010': 1.0}
NY_16_2008
{'NY_16_2010': 1.0}
NY_17_2008
{'NY_17_2010': 1.0}
NY_18_2008
{'NY_18_2010': 1.0}
NY_19_2008
{'NY_19_2010': 1.0}
NY_20_2008
{'NY_20_2010': 1.0}
NY_21_2008
{'NY_21_2010': 1.0}
NY_22_2008
{'NY_22_2010': 1.0}
NY_23_2008
{'NY_23_2010': 1.0}
NY_24_2008
{'NY_24_2010': 1.0}
NY_25_2008
{'NY_25_2010': 1.0}
NY_26_2008
{'NY_26_2010': 1.0}
NY_27_2008
{'NY_27_2010': 1.0}
NY_28_2008
{'NY_28_2010': 1.0}
NY_29_2008
{'NY_29_2010': 1.0}
NC_01_2008
{'NC_01_2010': 1.0}
NC_02_2008
{'NC_02_2010': 1.0}
NC_03_2008
{'NC_03_2010': 1.0}
NC_04_2008
{'NC_04_2010': 1.0}
NC_05_2008
{'NC_05_2010': 1.0}
NC_06_2008
{'NC_06_2010': 1.0}
NC_07_20

In [14]:
# get centroid coords for everything 
years = [2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]

centroid_df = read_shapefiles(years, verbose=False)
for year in years:
     centroid_df = get_centroid(centroid_df)
pickle.dump(centroid_df.drop('shape',axis=1), open('centroid.p', 'wb'))

DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
11 is not a state.
60 is not a state.
66 is not a state.
69 is not a state.
72 is not a state.
78 is not a state.
District ID is ZZ, which indicates this is not a congressional district.
District ID is ZZ, which indicates this is not a congressional district.
District ID is ZZ, which indicates this is not a congressional district.
District ID is ZZ, which indicates this is not a congressional district.
69 is not a state.
District ID is ZZ, which indicates this is not a congressional district.
11 is not a state.
66 is not a state.
78 is not a state.
72 is not a state.
60 is not a state.
District ID is ZZ, which indicates this is not a congressional district.


In [59]:
# find out the area overlap and border change between this year's and last year's districts
# takes ~2hrs to run for all 8 years.

# you can only check a district if you have last year's data, so you read one more district than the data you make
years_to_read = [2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]
years_to_check = [2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]

# read in the data (make a fresh df)
#overlap_df = read_shapefiles(years_to_read, verbose=False)
for year in years_to_check:
    overlap_df = district_overlap(year, overlap_df)
    pickle.dump(overlap_df.drop('shape', axis=1), open('overlap_frac_{}.p'.format(year), 'wb'))
pickle.dump(overlap_df.drop('shape', axis=1), open('overlap_frac.p', 'wb'))

AL_01_2018
{'AL_01_2016': 1.0}
same
AL_02_2018
{'AL_02_2016': 1.0}
same
AL_03_2018
{'AL_03_2016': 1.0}
same
AL_04_2018
{'AL_04_2016': 1.0}
same
AL_05_2018
{'AL_05_2016': 1.0}
same
AL_06_2018
{'AL_06_2016': 1.0}
same
AL_07_2018
{'AL_07_2016': 1.0}
same
AK_01_2018
{'AK_01_2016': 0.999}
same
AZ_01_2018
{'AZ_01_2016': 1.0}
same
AZ_02_2018
{'AZ_02_2016': 1.0}
same
AZ_03_2018
{'AZ_03_2016': 1.0}
same
AZ_04_2018
{'AZ_04_2016': 1.0}
same
AZ_05_2018
{'AZ_05_2016': 1.0}
same
AZ_06_2018
{'AZ_06_2016': 1.0}
same
AZ_07_2018
{'AZ_07_2016': 1.0}
same
AZ_08_2018
{'AZ_08_2016': 1.0}
same
AZ_09_2018
{'AZ_09_2016': 0.999}
same
AR_01_2018
{'AR_01_2016': 1.0}
same
AR_02_2018
{'AR_02_2016': 1.0}
same
AR_03_2018
{'AR_03_2016': 1.0}
same
AR_04_2018
{'AR_04_2016': 1.0}
same
CA_01_2018
{'CA_01_2016': 1.0}
same
CA_10_2018
{'CA_10_2016': 1.0}
same
CA_11_2018
{'CA_11_2016': 1.0}
same
CA_12_2018
{'CA_12_2016': 1.0}
same
CA_13_2018
{'CA_13_2016': 1.0}
same
CA_14_2018
{'CA_14_2016': 1.0}
same
CA_15_2018
{'CA_15_2016'

MO_04_2018
{'MO_04_2016': 1.0}
same
MO_05_2018
{'MO_05_2016': 1.0}
same
MO_06_2018
{'MO_06_2016': 1.0}
same
MO_07_2018
{'MO_07_2016': 1.0}
same
MO_08_2018
{'MO_08_2016': 1.0}
same
MT_01_2018
{'MT_01_2016': 1.0}
same
NE_01_2018
{'NE_01_2016': 1.0}
same
NE_02_2018
{'NE_02_2016': 1.0}
same
NE_03_2018
{'NE_03_2016': 1.0}
same
NV_01_2018
{'NV_01_2016': 1.0}
same
NV_02_2018
{'NV_02_2016': 1.0}
same
NV_03_2018
{'NV_03_2016': 1.0}
same
NV_04_2018
{'NV_04_2016': 1.0}
same
NH_01_2018
{'NH_01_2016': 1.0}
same
NH_02_2018
{'NH_02_2016': 1.0}
same
NJ_01_2018
{'NJ_01_2016': 1.0}
same
NJ_10_2018
{'NJ_10_2016': 1.0}
same
NJ_11_2018
{'NJ_11_2016': 1.0}
same
NJ_12_2018
{'NJ_12_2016': 1.0}
same
NJ_02_2018
{'NJ_02_2016': 1.0}
same
NJ_03_2018
{'NJ_03_2016': 1.0}
same
NJ_04_2018
{'NJ_04_2016': 1.0}
same
NJ_05_2018
{'NJ_05_2016': 1.0}
same
NJ_06_2018
{'NJ_06_2016': 1.0}
same
NJ_07_2018
{'NJ_07_2016': 1.0}
same
NJ_09_2018
{'NJ_09_2016': 1.0}
same
NJ_08_2018
{'NJ_08_2016': 1.0}
same
NM_01_2018
{'NM_01_2016': 1.

WA_08_2018
{'WA_08_2016': 1.0}
same
WV_01_2018
{'WV_01_2016': 1.0}
same
WV_02_2018
{'WV_02_2016': 1.0}
same
WV_03_2018
{'WV_03_2016': 1.0}
same
WI_01_2018
{'WI_01_2016': 1.0}
same
WI_02_2018
{'WI_02_2016': 1.0}
same
WI_03_2018
{'WI_03_2016': 1.0}
same
WI_04_2018
{'WI_04_2016': 1.0}
same
WI_05_2018
{'WI_05_2016': 1.0}
same
WI_06_2018
{'WI_06_2016': 1.0}
same
WI_07_2018
{'WI_07_2016': 1.0}
same
WI_08_2018
{'WI_08_2016': 1.0}
same
WY_01_2018
{'WY_01_2016': 1.0}
same


In [19]:
# estimate % population which derives from each previous district

# you can only check a district if you have last year's data, so you read one more district than the data you make
years_to_read = [2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]
years_to_check = [2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]

# read in the data (make a fresh df)
pop_df = read_shapefiles(years_to_read, verbose=False)
for year in years_to_check:
    pop_df = population_overlap(year, pop_df)
    pickle.dump(pop_df.drop('shape', axis=1), open('pop_frac_{}.p'.format(year), 'wb'))
pickle.dump(pop_df.drop('shape', axis=1), open('pop_frac.p', 'wb'))

DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
11 is not a state.
60 is not a state.
66 is not a state.
69 is not a state.
72 is not a state.
78 is not a state.
District ID is ZZ, which indicates this is not a congressional district.
District ID is ZZ, which indicates this is not a congressional district.
District ID is ZZ, which indicates this is not a congressional district.
District ID is ZZ, which indicates this is not a congressional district.
69 is not a state.
District ID is ZZ, which indicates this is not a congressional district.
11 is not a state.
66 is not a state.
78 is not a state.
72 is not a state.
60 is not a state.
District ID is ZZ, which indicates this is not a congressional district.
AL_01_2004
{'AL_01_2002': 1.0}
AL_02_2004
{'AL_02_2002': 1.0}
AL_03_2004
{'

MO_09_2004
{'MO_09_2002': 1.0}
MT_01_2004
{'MT_01_2002': 1.0}
NE_01_2004
{'NE_01_2002': 1.0}
NE_02_2004
{'NE_02_2002': 1.0}
NE_03_2004
{'NE_03_2002': 1.0}
NV_01_2004
{'NV_01_2002': 1.0}
NV_02_2004
{'NV_02_2002': 1.0}
NV_03_2004
{'NV_03_2002': 1.0}
NH_01_2004
{'NH_01_2002': 1.0}
NH_02_2004
{'NH_02_2002': 1.0}
NJ_01_2004
{'NJ_01_2002': 1.0}
NJ_02_2004
{'NJ_02_2002': 1.0}
NJ_03_2004
{'NJ_03_2002': 1.0}
NJ_04_2004
{'NJ_04_2002': 1.0}
NJ_05_2004
{'NJ_05_2002': 1.0}
NJ_06_2004
{'NJ_06_2002': 1.0}
NJ_07_2004
{'NJ_07_2002': 1.0}
NJ_08_2004
{'NJ_08_2002': 1.0}
NJ_09_2004
{'NJ_09_2002': 1.0}
NJ_10_2004
{'NJ_10_2002': 1.0}
NJ_11_2004
{'NJ_11_2002': 1.0}
NJ_12_2004
{'NJ_12_2002': 1.0}
NJ_13_2004
{'NJ_13_2002': 1.0}
NM_01_2004
{'NM_01_2002': 1.0}
NM_02_2004
{'NM_02_2002': 1.0}
NM_03_2004
{'NM_03_2002': 1.0}
NY_01_2004
{'NY_01_2002': 1.0}
NY_02_2004
{'NY_02_2002': 1.0}
NY_03_2004
{'NY_03_2002': 1.0}
NY_04_2004
{'NY_04_2002': 1.0}
NY_05_2004
{'NY_05_2002': 1.0}
NY_06_2004
{'NY_06_2002': 1.0}
NY_07_20

WA_04_2004
{'WA_04_2002': 1.0}
WA_05_2004
{'WA_05_2002': 1.0}
WA_06_2004
{'WA_06_2002': 1.0}
WA_07_2004
{'WA_07_2002': 1.0}
WA_08_2004
{'WA_08_2002': 1.0}
WA_09_2004
{'WA_09_2002': 1.0}
WV_01_2004
{'WV_01_2002': 1.0}
WV_02_2004
{'WV_02_2002': 1.0}
WV_03_2004
{'WV_03_2002': 1.0}
WI_01_2004
{'WI_01_2002': 1.0}
WI_02_2004
{'WI_02_2002': 1.0}
WI_03_2004
{'WI_03_2002': 1.0}
WI_04_2004
{'WI_04_2002': 1.0}
WI_05_2004
{'WI_05_2002': 1.0}
WI_06_2004
{'WI_06_2002': 1.0}
WI_07_2004
{'WI_07_2002': 1.0}
WI_08_2004
{'WI_08_2002': 1.0}
WY_01_2004
{'WY_01_2002': 1.0}
AL_01_2006
{'AL_01_2004': 1.0}
AL_02_2006
{'AL_02_2004': 1.0}
AL_03_2006
{'AL_03_2004': 1.0}
AL_04_2006
{'AL_04_2004': 1.0}
AL_05_2006
{'AL_05_2004': 1.0}
AL_06_2006
{'AL_06_2004': 1.0}
AL_07_2006
{'AL_07_2004': 1.0}
AK_01_2006
{'AK_01_2004': 1.0}
AZ_01_2006
{'AZ_01_2004': 1.0}
AZ_02_2006
{'AZ_02_2004': 1.0}
AZ_03_2006
{'AZ_03_2004': 1.0}
AZ_04_2006
{'AZ_04_2004': 1.0}
AZ_05_2006
{'AZ_05_2004': 1.0}
AZ_06_2006
{'AZ_06_2004': 1.0}
AZ_07_20

MN_07_2006
{'MN_07_2004': 1.0}
MN_08_2006
{'MN_08_2004': 1.0}
MS_01_2006
{'MS_01_2004': 1.0}
MS_02_2006
{'MS_02_2004': 1.0}
MS_03_2006
{'MS_03_2004': 1.0}
MS_04_2006
{'MS_04_2004': 1.0}
MO_01_2006
{'MO_01_2004': 1.0}
MO_02_2006
{'MO_02_2004': 1.0}
MO_03_2006
{'MO_03_2004': 1.0}
MO_04_2006
{'MO_04_2004': 1.0}
MO_05_2006
{'MO_05_2004': 1.0}
MO_06_2006
{'MO_06_2004': 1.0}
MO_07_2006
{'MO_07_2004': 1.0}
MO_08_2006
{'MO_08_2004': 1.0}
MO_09_2006
{'MO_09_2004': 1.0}
MT_01_2006
{'MT_01_2004': 1.0}
NE_01_2006
{'NE_01_2004': 1.0}
NE_02_2006
{'NE_02_2004': 1.0}
NE_03_2006
{'NE_03_2004': 1.0}
NV_01_2006
{'NV_01_2004': 1.0}
NV_02_2006
{'NV_02_2004': 1.0}
NV_03_2006
{'NV_03_2004': 1.0}
NH_01_2006
{'NH_01_2004': 1.0}
NH_02_2006
{'NH_02_2004': 1.0}
NJ_01_2006
{'NJ_01_2004': 1.0}
NJ_02_2006
{'NJ_02_2004': 1.0}
NJ_03_2006
{'NJ_03_2004': 1.0}
NJ_04_2006
{'NJ_04_2004': 1.0}
NJ_05_2006
{'NJ_05_2004': 1.0}
NJ_06_2006
{'NJ_06_2004': 1.0}
NJ_07_2006
{'NJ_07_2004': 1.0}
NJ_08_2006
{'NJ_08_2004': 1.0}
NJ_09_20

CA_22_2008
{'CA_22_2006': 1.0}
CA_23_2008
{'CA_23_2006': 1.0}
CA_24_2008
{'CA_24_2006': 1.0}
CA_25_2008
{'CA_25_2006': 1.0}
CA_26_2008
{'CA_26_2006': 1.0}
CA_27_2008
{'CA_27_2006': 1.0}
CA_28_2008
{'CA_28_2006': 1.0}
CA_29_2008
{'CA_29_2006': 1.0}
CA_30_2008
{'CA_30_2006': 1.0}
CA_31_2008
{'CA_31_2006': 1.0}
CA_32_2008
{'CA_32_2006': 1.0}
CA_33_2008
{'CA_33_2006': 1.0}
CA_34_2008
{'CA_34_2006': 1.0}
CA_35_2008
{'CA_35_2006': 1.0}
CA_36_2008
{'CA_36_2006': 1.0}
CA_37_2008
{'CA_37_2006': 1.0}
CA_38_2008
{'CA_38_2006': 1.0}
CA_39_2008
{'CA_39_2006': 1.0}
CA_40_2008
{'CA_40_2006': 1.0}
CA_41_2008
{'CA_41_2006': 1.0}
CA_42_2008
{'CA_42_2006': 1.0}
CA_43_2008
{'CA_43_2006': 1.0}
CA_44_2008
{'CA_44_2006': 1.0}
CA_45_2008
{'CA_45_2006': 1.0}
CA_46_2008
{'CA_46_2006': 1.0}
CA_47_2008
{'CA_47_2006': 1.0}
CA_48_2008
{'CA_48_2006': 1.0}
CA_49_2008
{'CA_49_2006': 1.0}
CA_50_2008
{'CA_50_2006': 1.0}
CA_51_2008
{'CA_51_2006': 1.0}
CA_52_2008
{'CA_52_2006': 1.0}
CA_53_2008
{'CA_53_2006': 1.0}
CO_01_20

OH_06_2008
{'OH_06_2006': 1.0}
OH_07_2008
{'OH_07_2006': 1.0}
OH_08_2008
{'OH_08_2006': 1.0}
OH_09_2008
{'OH_09_2006': 1.0}
OH_10_2008
{'OH_10_2006': 1.0}
OH_11_2008
{'OH_11_2006': 1.0}
OH_12_2008
{'OH_12_2006': 1.0}
OH_13_2008
{'OH_13_2006': 1.0}
OH_14_2008
{'OH_14_2006': 1.0}
OH_15_2008
{'OH_15_2006': 1.0}
OH_16_2008
{'OH_16_2006': 1.0}
OH_17_2008
{'OH_17_2006': 1.0}
OH_18_2008
{'OH_18_2006': 1.0}
OK_01_2008
{'OK_01_2006': 1.0}
OK_02_2008
{'OK_02_2006': 1.0}
OK_03_2008
{'OK_03_2006': 1.0}
OK_04_2008
{'OK_04_2006': 1.0}
OK_05_2008
{'OK_05_2006': 1.0}
OR_01_2008
{'OR_01_2006': 1.0}
OR_02_2008
{'OR_02_2006': 1.0}
OR_03_2008
{'OR_03_2006': 1.0}
OR_04_2008
{'OR_04_2006': 1.0}
OR_05_2008
{'OR_05_2006': 1.0}
PA_01_2008
{'PA_01_2006': 1.0}
PA_02_2008
{'PA_02_2006': 1.0}
PA_03_2008
{'PA_03_2006': 1.0}
PA_04_2008
{'PA_04_2006': 1.0}
PA_05_2008
{'PA_05_2006': 1.0}
PA_06_2008
{'PA_06_2006': 1.0}
PA_07_2008
{'PA_07_2006': 1.0}
PA_08_2008
{'PA_08_2006': 1.0}
PA_09_2008
{'PA_09_2006': 1.0}
PA_10_20

IL_10_2010
{'IL_10_2008': 1.0}
IL_11_2010
{'IL_11_2008': 1.0}
IL_12_2010
{'IL_12_2008': 1.0}
IL_13_2010
{'IL_13_2008': 1.0}
IL_14_2010
{'IL_14_2008': 1.0}
IL_15_2010
{'IL_15_2008': 1.0}
IL_16_2010
{'IL_16_2008': 1.0}
IL_17_2010
{'IL_17_2008': 1.0}
IL_18_2010
{'IL_18_2008': 1.0}
IL_19_2010
{'IL_19_2008': 1.0}
IN_01_2010
{'IN_01_2008': 1.0}
IN_02_2010
{'IN_02_2008': 1.0}
IN_03_2010
{'IN_03_2008': 1.0}
IN_04_2010
{'IN_04_2008': 1.0}
IN_05_2010
{'IN_05_2008': 1.0}
IN_06_2010
{'IN_06_2008': 1.0}
IN_07_2010
{'IN_07_2008': 1.0}
IN_08_2010
{'IN_08_2008': 1.0}
IN_09_2010
{'IN_09_2008': 1.0}
IA_01_2010
{'IA_01_2008': 1.0}
IA_02_2010
{'IA_02_2008': 1.0}
IA_03_2010
{'IA_03_2008': 1.0}
IA_04_2010
{'IA_04_2008': 1.0}
IA_05_2010
{'IA_05_2008': 1.0}
KS_01_2010
{'KS_01_2008': 1.0}
KS_02_2010
{'KS_02_2008': 1.0}
KS_03_2010
{'KS_03_2008': 1.0}
KS_04_2010
{'KS_04_2008': 1.0}
KY_01_2010
{'KY_01_2008': 1.0}
KY_02_2010
{'KY_02_2008': 1.0}
KY_03_2010
{'KY_03_2008': 1.0}
KY_04_2010
{'KY_04_2008': 1.0}
KY_05_20

VT_01_2010
{'VT_01_2008': 1.0}
VA_01_2010
{'VA_01_2008': 1.0}
VA_02_2010
{'VA_02_2008': 1.0}
VA_03_2010
{'VA_03_2008': 1.0}
VA_04_2010
{'VA_04_2008': 1.0}
VA_05_2010
{'VA_05_2008': 1.0}
VA_06_2010
{'VA_06_2008': 1.0}
VA_07_2010
{'VA_07_2008': 1.0}
VA_08_2010
{'VA_08_2008': 1.0}
VA_09_2010
{'VA_09_2008': 1.0}
VA_10_2010
{'VA_10_2008': 1.0}
VA_11_2010
{'VA_11_2008': 1.0}
WA_01_2010
{'WA_01_2008': 1.0}
WA_02_2010
{'WA_02_2008': 1.0}
WA_03_2010
{'WA_03_2008': 1.0}
WA_04_2010
{'WA_04_2008': 1.0}
WA_05_2010
{'WA_05_2008': 1.0}
WA_06_2010
{'WA_06_2008': 1.0}
WA_07_2010
{'WA_07_2008': 1.0}
WA_08_2010
{'WA_08_2008': 1.0}
WA_09_2010
{'WA_09_2008': 1.0}
WV_01_2010
{'WV_01_2008': 1.0}
WV_02_2010
{'WV_02_2008': 1.0}
WV_03_2010
{'WV_03_2008': 1.0}
WI_01_2010
{'WI_01_2008': 1.0}
WI_02_2010
{'WI_02_2008': 1.0}
WI_03_2010
{'WI_03_2008': 1.0}
WI_04_2010
{'WI_04_2008': 1.0}
WI_05_2010
{'WI_05_2008': 1.0}
WI_06_2010
{'WI_06_2008': 1.0}
WI_07_2010
{'WI_07_2008': 1.0}
WI_08_2010
{'WI_08_2008': 1.0}
WY_01_20

CT_04_2012
{'CT_03_2010': 0.01, 'CT_04_2010': 0.99}
CT_05_2012
{'CT_05_2010': 1.0}
DE_01_2012
{'DE_01_2010': 1.0}
FL_01_2012
{'FL_01_2010': 0.988, 'FL_02_2010': 0.012}
FL_02_2012
{'FL_01_2010': 0.142, 'FL_02_2010': 0.735, 'FL_04_2010': 0.123}
FL_03_2012
{'FL_02_2010': 0.117, 'FL_03_2010': 0.016, 'FL_04_2010': 0.267, 'FL_05_2010': 0.162, 'FL_06_2010': 0.438}
FL_04_2012
{'FL_03_2010': 0.019, 'FL_04_2010': 0.821, 'FL_06_2010': 0.16}
FL_05_2012
{'FL_03_2010': 0.841, 'FL_04_2010': 0.006, 'FL_06_2010': 0.027, 'FL_07_2010': 0.017, 'FL_08_2010': 0.063, 'FL_24_2010': 0.047}
FL_06_2012
{'FL_03_2010': 0.183, 'FL_07_2010': 0.617, 'FL_24_2010': 0.201}
FL_07_2012
{'FL_03_2010': 0.044, 'FL_07_2010': 0.295, 'FL_08_2010': 0.041, 'FL_24_2010': 0.62}
FL_08_2012
{'FL_15_2010': 0.522, 'FL_24_2010': 0.478}
FL_09_2012
{'FL_08_2010': 0.132, 'FL_12_2010': 0.101, 'FL_15_2010': 0.668, 'FL_24_2010': 0.099}
FL_10_2012
{'FL_03_2010': 0.016, 'FL_05_2010': 0.198, 'FL_06_2010': 0.025, 'FL_08_2010': 0.595, 'FL_12_2010'

MD_01_2012
{'MD_01_2010': 0.826, 'MD_02_2010': 0.039, 'MD_06_2010': 0.135}
MD_02_2012
{'MD_01_2010': 0.001, 'MD_02_2010': 0.801, 'MD_03_2010': 0.155, 'MD_07_2010': 0.043}
MD_03_2012
{'MD_01_2010': 0.007, 'MD_02_2010': 0.077, 'MD_03_2010': 0.582, 'MD_04_2010': 0.248, 'MD_07_2010': 0.086}
MD_04_2012
{'MD_01_2010': 0.02, 'MD_02_2010': 0.015, 'MD_03_2010': 0.305, 'MD_04_2010': 0.602, 'MD_05_2010': 0.03, 'MD_08_2010': 0.028}
MD_05_2012
{'MD_03_2010': 0.042, 'MD_04_2010': 0.024, 'MD_05_2010': 0.934}
MD_06_2012
{'MD_04_2010': 0.148, 'MD_06_2010': 0.395, 'MD_08_2010': 0.457}
MD_07_2012
{'MD_01_2010': 0.013, 'MD_02_2010': 0.066, 'MD_03_2010': 0.037, 'MD_06_2010': 0.046, 'MD_07_2010': 0.838}
MD_08_2012
{'MD_04_2010': 0.149, 'MD_06_2010': 0.353, 'MD_08_2010': 0.497}
MA_01_2012
{'MA_01_2010': 0.524, 'MA_02_2010': 0.476}
MA_02_2012
{'MA_01_2010': 0.27, 'MA_02_2010': 0.353, 'MA_03_2010': 0.377}
MA_03_2012
{'MA_01_2010': 0.086, 'MA_03_2010': 0.05, 'MA_05_2010': 0.863}
MA_04_2012
{'MA_02_2010': 0.035,

NC_01_2012
{'NC_01_2010': 0.716, 'NC_02_2010': 0.119, 'NC_03_2010': 0.01, 'NC_04_2010': 0.078, 'NC_13_2010': 0.077}
NC_02_2012
{'NC_02_2010': 0.271, 'NC_04_2010': 0.166, 'NC_06_2010': 0.436, 'NC_07_2010': 0.034, 'NC_08_2010': 0.092, 'NC_13_2010': 0.001}
NC_03_2012
{'NC_01_2010': 0.25, 'NC_03_2010': 0.648, 'NC_07_2010': 0.102}
NC_04_2012
{'NC_02_2010': 0.188, 'NC_04_2010': 0.653, 'NC_07_2010': 0.024, 'NC_08_2010': 0.015, 'NC_13_2010': 0.119}
NC_05_2012
{'NC_05_2010': 0.616, 'NC_06_2010': 0.026, 'NC_10_2010': 0.004, 'NC_12_2010': 0.354}
NC_06_2012
{'NC_04_2010': 0.15, 'NC_05_2010': 0.174, 'NC_06_2010': 0.187, 'NC_12_2010': 0.018, 'NC_13_2010': 0.471}
NC_07_2012
{'NC_02_2010': 0.306, 'NC_03_2010': 0.066, 'NC_07_2010': 0.599, 'NC_08_2010': 0.029}
NC_08_2012
{'NC_06_2010': 0.156, 'NC_07_2010': 0.095, 'NC_08_2010': 0.585, 'NC_09_2010': 0.15, 'NC_12_2010': 0.014}
NC_09_2012
{'NC_05_2010': 0.062, 'NC_08_2010': 0.003, 'NC_09_2010': 0.776, 'NC_10_2010': 0.088, 'NC_12_2010': 0.071}
NC_10_2012
{'N

TX_25_2012
{'TX_10_2010': 0.003, 'TX_11_2010': 0.055, 'TX_17_2010': 0.424, 'TX_21_2010': 0.071, 'TX_25_2010': 0.087, 'TX_31_2010': 0.36}
TX_26_2012
{'TX_12_2010': 0.011, 'TX_24_2010': 0.063, 'TX_26_2010': 0.926}
TX_27_2012
{'TX_14_2010': 0.482, 'TX_15_2010': 0.074, 'TX_25_2010': 0.283, 'TX_27_2010': 0.161}
TX_28_2012
{'TX_15_2010': 0.002, 'TX_21_2010': 0.024, 'TX_23_2010': 0.002, 'TX_28_2010': 0.972}
TX_29_2012
{'TX_02_2010': 0.002, 'TX_18_2010': 0.022, 'TX_22_2010': 0.027, 'TX_29_2010': 0.949}
TX_30_2012
{'TX_24_2010': 0.212, 'TX_30_2010': 0.774, 'TX_32_2010': 0.013}
TX_31_2012
{'TX_31_2010': 1.0}
TX_32_2012
{'TX_03_2010': 0.383, 'TX_04_2010': 0.004, 'TX_05_2010': 0.009, 'TX_30_2010': 0.035, 'TX_32_2010': 0.568}
TX_33_2012
{'TX_06_2010': 0.005, 'TX_12_2010': 0.023, 'TX_24_2010': 0.16, 'TX_26_2010': 0.082, 'TX_30_2010': 0.122, 'TX_32_2010': 0.607}
TX_34_2012
{'TX_15_2010': 0.301, 'TX_25_2010': 0.067, 'TX_27_2010': 0.632}
TX_35_2012
{'TX_10_2010': 0.018, 'TX_20_2010': 0.696, 'TX_21_2010

IL_12_2014
{'IL_12_2012': 1.0}
IL_13_2014
{'IL_13_2012': 1.0}
IL_14_2014
{'IL_14_2012': 1.0}
IL_15_2014
{'IL_15_2012': 1.0}
IL_16_2014
{'IL_16_2012': 1.0}
IL_17_2014
{'IL_17_2012': 1.0}
IL_18_2014
{'IL_18_2012': 1.0}
IN_01_2014
{'IN_01_2012': 1.0}
IN_02_2014
{'IN_02_2012': 1.0}
IN_03_2014
{'IN_03_2012': 1.0}
IN_04_2014
{'IN_04_2012': 1.0}
IN_05_2014
{'IN_05_2012': 1.0}
IN_06_2014
{'IN_06_2012': 1.0}
IN_07_2014
{'IN_07_2012': 1.0}
IN_08_2014
{'IN_08_2012': 1.0}
IN_09_2014
{'IN_09_2012': 1.0}
IA_01_2014
{'IA_01_2012': 1.0}
IA_02_2014
{'IA_02_2012': 1.0}
IA_03_2014
{'IA_03_2012': 1.0}
IA_04_2014
{'IA_04_2012': 1.0}
KS_01_2014
{'KS_01_2012': 1.0}
KS_02_2014
{'KS_02_2012': 1.0}
KS_03_2014
{'KS_03_2012': 1.0}
KS_04_2014
{'KS_04_2012': 1.0}
KY_01_2014
{'KY_01_2012': 1.0}
KY_02_2014
{'KY_02_2012': 1.0}
KY_03_2014
{'KY_03_2012': 1.0}
KY_04_2014
{'KY_04_2012': 1.0}
KY_05_2014
{'KY_05_2012': 1.0}
KY_06_2014
{'KY_06_2012': 1.0}
LA_01_2014
{'LA_01_2012': 1.0}
LA_02_2014
{'LA_02_2012': 1.0}
LA_03_20

VA_07_2014
{'VA_07_2012': 1.0}
VA_08_2014
{'VA_08_2012': 1.0}
VA_09_2014
{'VA_09_2012': 1.0}
VA_10_2014
{'VA_10_2012': 1.0}
VA_11_2014
{'VA_11_2012': 1.0}
WA_01_2014
{'WA_01_2012': 1.0}
WA_02_2014
{'WA_02_2012': 1.0}
WA_03_2014
{'WA_03_2012': 1.0}
WA_04_2014
{'WA_04_2012': 1.0}
WA_05_2014
{'WA_05_2012': 1.0}
WA_06_2014
{'WA_06_2012': 1.0}
WA_07_2014
{'WA_07_2012': 1.0}
WA_08_2014
{'WA_08_2012': 1.0}
WA_09_2014
{'WA_09_2012': 1.0}
WA_10_2014
{'WA_10_2012': 1.0}
WV_01_2014
{'WV_01_2012': 1.0}
WV_02_2014
{'WV_02_2012': 1.0}
WV_03_2014
{'WV_03_2012': 1.0}
WI_01_2014
{'WI_01_2012': 1.0}
WI_02_2014
{'WI_02_2012': 1.0}
WI_03_2014
{'WI_03_2012': 1.0}
WI_04_2014
{'WI_04_2012': 1.0}
WI_05_2014
{'WI_05_2012': 1.0}
WI_06_2014
{'WI_06_2012': 1.0}
WI_07_2014
{'WI_07_2012': 1.0}
WI_08_2014
{'WI_08_2012': 1.0}
WY_01_2014
{'WY_01_2012': 1.0}
AL_01_2016
{'AL_01_2014': 1.0}
AL_02_2016
{'AL_02_2014': 1.0}
AL_03_2016
{'AL_03_2014': 1.0}
AL_04_2016
{'AL_04_2014': 1.0}
AL_05_2016
{'AL_05_2014': 1.0}
AL_06_20

MI_02_2016
{'MI_02_2014': 1.0}
MI_03_2016
{'MI_03_2014': 1.0}
MI_04_2016
{'MI_04_2014': 1.0}
MI_05_2016
{'MI_05_2014': 1.0}
MI_06_2016
{'MI_06_2014': 1.0}
MI_07_2016
{'MI_07_2014': 1.0}
MI_08_2016
{'MI_08_2014': 1.0}
MI_09_2016
{'MI_09_2014': 1.0}
MI_10_2016
{'MI_10_2014': 1.0}
MI_11_2016
{'MI_11_2014': 1.0}
MI_12_2016
{'MI_12_2014': 1.0}
MI_13_2016
{'MI_13_2014': 1.0}
MI_14_2016
{'MI_14_2014': 1.0}
MN_01_2016
{'MN_01_2014': 1.0}
MN_02_2016
{'MN_02_2014': 1.0}
MN_03_2016
{'MN_03_2014': 1.0}
MN_04_2016
{'MN_04_2014': 1.0}
MN_05_2016
{'MN_05_2014': 1.0}
MN_06_2016
{'MN_06_2014': 1.0}
MN_07_2016
{'MN_07_2014': 1.0}
MN_08_2016
{'MN_08_2014': 1.0}
MS_01_2016
{'MS_01_2014': 1.0}
MS_02_2016
{'MS_02_2014': 1.0}
MS_03_2016
{'MS_03_2014': 1.0}
MS_04_2016
{'MS_04_2014': 1.0}
MO_01_2016
{'MO_01_2014': 1.0}
MO_02_2016
{'MO_02_2014': 1.0}
MO_03_2016
{'MO_03_2014': 1.0}
MO_04_2016
{'MO_04_2014': 1.0}
MO_05_2016
{'MO_05_2014': 1.0}
MO_06_2016
{'MO_06_2014': 1.0}
MO_07_2016
{'MO_07_2014': 1.0}
MO_08_20

WI_01_2016
{'WI_01_2014': 1.0}
WI_02_2016
{'WI_02_2014': 1.0}
WI_03_2016
{'WI_03_2014': 1.0}
WI_04_2016
{'WI_04_2014': 1.0}
WI_05_2016
{'WI_05_2014': 1.0}
WI_06_2016
{'WI_06_2014': 1.0}
WI_07_2016
{'WI_07_2014': 1.0}
WI_08_2016
{'WI_08_2014': 1.0}
WY_01_2016
{'WY_01_2014': 1.0}
AL_01_2018
{'AL_01_2016': 1.0}
AL_02_2018
{'AL_02_2016': 1.0}
AL_03_2018
{'AL_03_2016': 1.0}
AL_04_2018
{'AL_04_2016': 1.0}
AL_05_2018
{'AL_05_2016': 1.0}
AL_06_2018
{'AL_06_2016': 1.0}
AL_07_2018
{'AL_07_2016': 1.0}
AK_01_2018
{'AK_01_2016': 1.0}
AZ_01_2018
{'AZ_01_2016': 1.0}
AZ_02_2018
{'AZ_02_2016': 1.0}
AZ_03_2018
{'AZ_03_2016': 1.0}
AZ_04_2018
{'AZ_04_2016': 1.0}
AZ_05_2018
{'AZ_05_2016': 1.0}
AZ_06_2018
{'AZ_06_2016': 1.0}
AZ_07_2018
{'AZ_07_2016': 1.0}
AZ_08_2018
{'AZ_08_2016': 1.0}
AZ_09_2018
{'AZ_07_2016': 0.001, 'AZ_09_2016': 0.999}
AR_01_2018
{'AR_01_2016': 1.0}
AR_02_2018
{'AR_02_2016': 1.0}
AR_03_2018
{'AR_03_2016': 1.0}
AR_04_2018
{'AR_04_2016': 1.0}
CA_01_2018
{'CA_01_2016': 1.0}
CA_10_2018
{'CA_

NM_02_2018
{'NM_02_2016': 1.0}
NM_03_2018
{'NM_03_2016': 1.0}
NY_01_2018
{'NY_01_2016': 1.0}
NY_10_2018
{'NY_10_2016': 1.0}
NY_11_2018
{'NY_11_2016': 1.0}
NY_12_2018
{'NY_12_2016': 1.0}
NY_13_2018
{'NY_13_2016': 1.0}
NY_14_2018
{'NY_14_2016': 1.0}
NY_15_2018
{'NY_15_2016': 1.0}
NY_16_2018
{'NY_16_2016': 1.0}
NY_17_2018
{'NY_17_2016': 1.0}
NY_18_2018
{'NY_18_2016': 1.0}
NY_19_2018
{'NY_19_2016': 1.0}
NY_02_2018
{'NY_02_2016': 1.0}
NY_20_2018
{'NY_20_2016': 1.0}
NY_21_2018
{'NY_21_2016': 1.0}
NY_22_2018
{'NY_22_2016': 1.0}
NY_23_2018
{'NY_23_2016': 1.0}
NY_24_2018
{'NY_24_2016': 1.0}
NY_25_2018
{'NY_25_2016': 1.0}
NY_26_2018
{'NY_26_2016': 1.0}
NY_27_2018
{'NY_27_2016': 1.0}
NY_03_2018
{'NY_03_2016': 1.0}
NY_04_2018
{'NY_04_2016': 1.0}
NY_05_2018
{'NY_05_2016': 1.0}
NY_06_2018
{'NY_06_2016': 1.0}
NY_07_2018
{'NY_07_2016': 1.0}
NY_08_2018
{'NY_08_2016': 1.0}
NY_09_2018
{'NY_09_2016': 1.0}
NC_01_2018
{'NC_01_2016': 1.0}
NC_10_2018
{'NC_10_2016': 1.0}
NC_11_2018
{'NC_11_2016': 1.0}
NC_12_20

In [46]:
# Read in the files written here and add them to a master overlap set
all_overlap_data_df = pickle.load(open('overlap_frac.p','rb'))
all_overlap_data_df['centroid'] = pickle.load(open('centroid.p','rb'))['centroid']
all_overlap_data_df['population_overlap'] = \
    pickle.load(open('pop_frac.p','rb'))['population_overlap']
all_overlap_data_df['inverse_population_overlap'] = \
    pickle.load(open('inv_pop_frac.p','rb'))['inverse_population_overlap']
pickle.dump(all_overlap_data_df, open('all_overlap_data.p', 'wb'))

In [None]:
# find if a district has changed between last year and this year
def check_if_districts_changed(this_year, district_df, threshold_for_change=0.1):
    """
    *** DEPRECIATED *** 
    might have some bugs. 
    use district_overlap to perform this function instead.
    
    Checks if each district changed more than the set threshold since the last election year.
    Change is a fraction of the district which overlaps with the previous year of that district (ranges 0-1).
    If the total area of the district increases, the overlap area is divided by the largest of the two years.
        (this prevents counting a district as unchanged if its area has increased)
    
    input:
        this_year -- (int) Elections years of the districts you want to check. Even years only. 
        district_df -- (pd dataframe) dataframe with default indices and shapefiles stored in 
                       a column named 'shape'
    output:
        district_df -- pandas data frame with index in format ST_00_YYYY (e.g. AL_01_2018) 
                       with change from last year stored in a column called 'border_change'.
                       Statuses:
                           'same' - this district has not changed at all since the previous year
                           'new' - indicates a district with this ID was not in this state last year
                           'changed' - indicates the borders have changed from last year, but that 
                                       this district was present in its state last year
    """
    # loop over states so you only have to compare districts in-state
    # otherwise, comparing each district to 434 other districts would be super slow
    for ST in state_abbrs: 
        prev_year = this_year-2
        # get the relevant districts
        districts = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==this_year)]
        districts_prev = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==prev_year)]
        # loop over districts in your current year
        for ind,district in districts.iterrows():
            # find previous year's district
            district_prev = districts_prev.loc[districts_prev['district']==district['district']]
            ind_prev = '{}{}'.format(ind[:6],prev_year)

            # determine whether district is new or borders have changed
            if ind_prev in districts_prev.index: # if the district didn't exist last year
                # then this district is new this year
                district_df.loc[ind,'border_change'] = 'new'
            else: 
                # check if the borders have changed
                shape_prev = district_prev['shape'].values[0]
                shape = district['shape']

                # check if shapes intersect with themselves
                if not (shape.is_valid and shape_prev.is_valid): 
                    # if they do, use buffer to correct this
                    print('The following polygons intersected with themselves. Attempting to buffer shape...')
                    if not shape.is_valid:
                        print(ind)
                        shape = shape.buffer(0)
                    if not shape_prev.is_valid:
                        print(ind_prev)
                        shape_prev = shape_prev.buffer(0)

                # calculate overlap percent
                area = shape.area # area of this district
                area_prev = shape_prev.area
                overlap_area = shape.intersection(shape_prev).area # area of overlap between shape and shape_prev
                
                if year==2016:
                    # we switch data sources in 2016
                    # Tiger Line districts include area over water, while UCLA districts do not.
                    # to avoid counting all of these districts as changed (because of the added 
                    # water area), we always divide by area_prev
                    frac_overlap = overlap_area/area_prev
                else:   
                    # fractional overlap between new and old district
                    # divide by the larger of the old or new district so that increasing district area 
                    # counts as a change
                    frac_overlap = overlap_area/np.max([area,area_prev]) 
                
                if (1.-frac_overlap) < threshold_for_change: 
                    # then district has not changed
                    district_df.loc[ind,'border_change'] = 'same'
                else:
                    # district has changed
                    district_df.loc[ind,'border_change'] = 'changed'
                    print(ind)
                    print(frac_overlap)
                    
    return district_df

In [None]:
# depreciated
# # find out if each district changed

# # you can only check a district if you have last year's data, so you read one more district than the data you make
# years_to_read = [2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]
# years_to_check = [2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018] 

# # read in the data (make a fresh df)
# change_df = read_shapefiles(years_to_read, verbose=False)
# for year in years_to_check:
#     change_df = check_if_districts_changed(year, change_df)
# pickle.dump(change_df, open('change.p', 'wb'))