In [1]:
import numpy as np
import pickle
import cartopy.io.shapereader as shpreader
from cartopy.feature import ShapelyFeature
from shapely.prepared import prep
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import pandas as pd

In [2]:
# Projection is lat/lon (unprojected/cylindrical equidistant)
# The proj.4 string:
# +proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs

In [3]:
# some useful arrays to translate between naming conventions

election_year_list = np.array([1992, 1994, 1996, 1998, 2000, 2002, 2004, 2006, 2008, 2010, 2012, 
                                   2014, 2016, 2018])
congress_ID_list = np.array([103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116])

state_names = np.array(['ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA', 
               'COLORADO', 'CONNECTICUT', 'DELAWARE', 'FLORIDA', 'GEORGIA', 
               'HAWAII', 'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS', 
               'KENTUCKY', 'LOUISIANA', 'MAINE', 'MARYLAND', 'MASSACHUSETTS', 
               'MICHIGAN', 'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'MONTANA', 
               'NEBRASKA', 'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY', 'NEW MEXICO', 
               'NEW YORK', 'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO', 'OKLAHOMA', 
               'OREGON', 'PENNSYLVANIA', 'RHODE ISLAND', 'SOUTH CAROLINA', 
               'SOUTH DAKOTA', 'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT', 
               'VIRGINIA', 'WASHINGTON', 'WEST VIRGINIA', 'WISCONSIN', 'WYOMING'])

state_abbrs = np.array(['AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL',
              'IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT',
              'NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI',
              'SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY'])

state_fips = np.array([1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 
                       24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 
                       41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56])

In [4]:
def read_shapefiles(election_years,verbose=True):
    """
    Reads in shapefiles from UCLA database (pre-2016) US Census TigerLine files (2016 on).
    data sources: http://cdmaps.polisci.ucla.edu/
                  https://www.census.gov/geo/maps-data/data/cbf/cbf_cds.html
    Note that file names must be in the same folder as the code with the following format:
        -pre-2016: districtShapes-NNN/districtsNNN.shp
        -2016 on: must be named t1_YYYY_us_cdNNN/tl_YYYY_us_cdNNN.shp
    ...where NNN is congress ID and YYYY is election year
    
    input:
        election_years -- (list) Elections years of the shape files you want to read. Even years only. 
        verbose -- (bool) set to True if you want it to print every file as it reads it in
    output:
        district_df -- pandas data frame with index in format ST_00_YYYY (e.g. AL_01_2018) with 
                       shapefiles stored in a column named 'shape'.
    """
    # read in the standard dictionary 
    district_df = pickle.load(open('../Datasets/master_index.p','rb'))
    district_df['shape'] = [np.nan]*district_df.shape[0] # make a blank column
    district_df['shape'] = district_df['shape'].astype(object) # reassign to object so it can hold shapely stuff

    for election_year in election_years:
        # convert election year to "Nth Congress" 
        congress_ID = congress_ID_list[election_year_list==election_year][0]

        # pre-2016 files come from http://cdmaps.polisci.ucla.edu/
        # they are a bit cleaner
        if election_year < 2016:
            # read in the shapefile (must be named 'districtsN.shp' in a folder titled 'districtShapesN')
            shpfilename = 'districtShapes-{0}/districts{0}.shp'.format(congress_ID)
            reader = shpreader.Reader(shpfilename) 
            districts = reader.records() # get full records
            geometries = reader.geometries() # get just the shape
            
            # put the shapefiles into the standard dictionary
            for record in reader.records(): # loop over districts
                attr = record.attributes # dictionary of information about the district
                poly = record.geometry # coordinates of the district as a shapely polygon

                # 1) get the state abbr of the district
                if any(state_names==attr['STATENAME'].upper()): # filter out districts that aren't in states
                    ST = state_abbrs[state_names==attr['STATENAME'].upper()][0]
                else: # pretty much just Washington, DC
                    print('{} is not a state.'.format(attr['STATENAME'].upper()))
                    continue
                # 2) get the id of the district
                id_int = int(attr['DISTRICT'])
                if id_int == 0: # change at-large district ID from 0 to 1 to play nice with our indexing convention.
                    id_int = 1 # todo: do we want to change this back? Looks like 0 for at-large districts is the convention. 
                ID = '{0:02d}'.format(id_int)
                # 3) put it all together into an index
                ind = '{}_{}_{}'.format(ST, ID, election_year)
                if verbose:
                    print('{} was read in.'.format(ind))
                # put the polygon in the dictionary
                district_df.at[ind,'shape'] = poly
            
        # 2016 and later are US Census Bureau Tiger Line Files 
        # (and they're a bit messier)
        if election_year >= 2016:
            # read in the shapefile (must be named 'tl_YYYY_us_cdN.shp' in a folder titled 't1_YYYY_us_cdN')
            shpfilename = 'tl_{1}_us_cd{0}/tl_{1}_us_cd{0}.shp'.format(congress_ID,election_year)
            reader = shpreader.Reader(shpfilename) 
            districts = reader.records() # get full records
            geometries = reader.geometries() # get just the shape

            # put the shapefiles into the standard dictionary
            for record in reader.records(): # loop over districts
                attr = record.attributes # dictionary of information about the district
                poly = record.geometry # coordinates of the district as a shapely polygon

                # 1) get the state abbr of the district
                if any(state_fips==int(attr['STATEFP'])): # filter out districts that aren't in states
                    ST = state_abbrs[state_fips==int(attr['STATEFP'])][0]
                else: # pretty much just Washington, DC
                    print('{} is not a state.'.format(int(attr['STATEFP'])))
                    continue
                # 2) get the id of the district
                if (attr['CD{}FP'.format(congress_ID)]=='ZZ'):
                    print('District ID is ZZ, which indicates this is not a congressional district.')
                    continue
                else:
                    id_int = int(attr['CD{}FP'.format(congress_ID)])
                if id_int == 0: # change at-large district ID from 0 to 1 to play nice with our indexing convention.
                    # todo: do we want to change this back? Looks like 0 for at-large districts is the convention. 
                    id_int = 1
                ID = '{0:02d}'.format(id_int)
                # 3) put it all together into an index
                ind = '{}_{}_{}'.format(ST, ID, election_year)
                if verbose:
                    print('{} was read in.'.format(ind))
                # put the polygon in the dictionary
                district_df.at[ind,'shape'] = poly
    
    return district_df

In [5]:
# compute overlap percent between this district and last year's districts
def district_overlap(this_year, district_df, threshold_for_change=0.1):
    """
    Finds the fractional overlap between this year's district and the previous year's districts. 
    i.e., if a Florida's 1st district has changed its borders from 2014 to 2016, it may be made up of:
        20% of its area may come from district 2 in 2014
        40% of its area may come from district 3 in 2014
        10% of its area may come from district 4 in 2014
    
    input:
        this_year -- (int) Elections years of the districts you want to check. Even years only. 
        district_df -- (pd dataframe) dataframe with default indices and shapefiles stored in 
                       a column named 'shape'
    output:
        district_df -- pandas data frame with index in format ST_00_YYYY (e.g. AL_01_2018) 
                       
                       fractional overlap stored in a column called 'fractional_overlap'.
                       Each district has a dictionary where the keys are the indicies of the previous
                       districts which overlap with our district, and the values are the fractional overlap.
                       
                       change from last year stored in a column called 'border_change'.
                       Statuses:
                           0 - this district has not changed at all since the previous year
                           1 - this district has either changed area > threshold since 
                               the previous year, or it is new.
    """
    # add a column
    if 'overlap_frac' not in district_df.columns:
        district_df['overlap_frac'] = [np.nan]*district_df.shape[0] # make a blank column
        district_df['overlap_frac'] = district_df['overlap_frac'].astype(object) # reassign to object so it can hold shapely stuff

    if 'border_change' not in district_df.columns:
        district_df['border_change'] = [np.nan]*district_df.shape[0] # make a blank column    
    
    # loop over states so you only have to compare districts in-state
    # otherwise, comparing each district to 434 other districts would be super slow
    for ST in state_abbrs: 
        prev_year = this_year-2
        # get the relevant districts
        districts = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==this_year)]
        districts_prev = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==prev_year)]

        for ind,district in districts.iterrows(): # loop over districts in your current year
            overlap_dict = {}
            shape = district['shape']
            area = shape.area # area of this district
            
            # check if shapes intersect with themselves
            if not shape.is_valid: 
                # if they do, use buffer to correct this
                print('The following polygons intersected with themselves. Attempting to buffer shape...')
                print(ind)
                shape = shape.buffer(0)
                    
            for ind_prev,district_prev in districts_prev.iterrows(): # loop over districts in previous year
                shape_prev = district_prev['shape']
                
                # check if shapes intersect with themselves
                if not shape_prev.is_valid:
                    # if they do, use buffer to correct this
                    print('The following polygons intersected with themselves. Attempting to buffer shape...')
                    print(ind_prev)
                    shape_prev = shape_prev.buffer(0)

                # calculate frac overlap
                area_prev = shape_prev.area
                overlap_area = shape.intersection(shape_prev).area # area of overlap between shape and shape_prev
                frac_overlap = np.around(overlap_area/area,decimals=3) # fractional overlap between new and old district
                                                                       # round to the thousandth

                if frac_overlap > 10**-3: # use threshold of 0.1% to avoid trivial changes
                    overlap_dict[ind_prev] = frac_overlap

            # make sure areas add up to 1.
            dict_sum = np.sum(list(overlap_dict.values()))
            if not np.isclose(dict_sum,1.,rtol=1e-03):
                # rescale so they add up to 1.
                print(overlap_dict)
                print('rescaled')
                print(dict_sum)
                overlap_dict = {key : np.around(val/dict_sum, decimals=3) 
                                for key,val in overlap_dict.items() if val/dict_sum > 10**-3}
                    
            print(ind)
            print(overlap_dict)
            district_df.at[ind, 'overlap_frac'] = overlap_dict

            # calculate the status of the new district
            max_frac = np.max(list(overlap_dict.values()))
            ind_prev = '{}{}'.format(ind[:6],prev_year) # same district, different year
            if ind_prev not in districts_prev.index: # the district is new this year
                print('new')
                district_df.at[ind, 'border_change'] = 1
            elif ind_prev not in overlap_dict.keys(): # the district existed last year, but is in a totally new location
                print('totally moved')
                district_df.at[ind, 'border_change'] = 1
            elif 1.-overlap_dict[ind_prev] > threshold_for_change: # the district existed last year, but the borders have moved
                print(1.-overlap_dict[ind_prev])
                print('changed')
                district_df.at[ind, 'border_change'] = 1
            else: # the district is essentially the same as last year
                print('same')
                district_df.at[ind, 'border_change'] = 0
            
    return district_df

In [6]:
def get_centroid(district_df):
    """
    Finds the centroid of a district in lon,lat. 
    
    input: 
        district_df -- (pd dataframe) dataframe with default indices and shapefiles stored in 
                       a column named 'shape'
    output:
        district_df -- pandas data frame with index in format ST_00_YYYY (e.g. AL_01_2018) 
                       with centroid stored in a column called 'centroid'.
                       The centroid is calculated as a lon,lat on a Cartesian plane.
                       It ignores spherical geometry. 
                       The centroid is stored as a tuple in the form (lon,lat)
    """
    # add column
    district_df['centroid'] = [np.nan]*district_df.shape[0] # make a blank column
    district_df['centroid'] = district_df['centroid'].astype(object) # reassign to object so it can hold shapely stuff
    
    for ind, district in district_df.iterrows():
        shape = district['shape']
        if pd.isnull(district['shape']): # if there's no shape, fill it with a nan
            district_df.at[ind, 'centroid'] = np.nan
        else:
            centroid = shape.centroid.coords
            district_df.at[ind, 'centroid'] = centroid # in units of lat/lon
            
    return district_df

In [7]:
# def get_distance(district_df):
#     # add column
#     #todo

In [18]:
# get centroid coords for everything 
years = [2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]

centroid_df = read_shapefiles(years, verbose=False)
for year in years:
     centroid_df = get_centroid(centroid_df)
pickle.dump(centroid_df.drop('shape',axis=1), open('centroid.p', 'wb'))

DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
DISTRICT OF COLUMBIA is not a state.
11 is not a state.
60 is not a state.
66 is not a state.
69 is not a state.
72 is not a state.
78 is not a state.
District ID is ZZ, which indicates this is not a congressional district.
District ID is ZZ, which indicates this is not a congressional district.
District ID is ZZ, which indicates this is not a congressional district.
District ID is ZZ, which indicates this is not a congressional district.
69 is not a state.
District ID is ZZ, which indicates this is not a congressional district.
11 is not a state.
66 is not a state.
78 is not a state.
72 is not a state.
60 is not a state.
District ID is ZZ, which indicates this is not a congressional district.


In [59]:
# find out the area overlap and border change between this year's and last year's districts
# takes ~2hrs to run for all 8 years.

# you can only check a district if you have last year's data, so you read one more district than the data you make
#years_to_read = [2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]
#years_to_check = [2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]
years_to_check = [2018]

# read in the data (make a fresh df)
#overlap_df = read_shapefiles(years_to_read, verbose=False)
for year in years_to_check:
    overlap_df = district_overlap(year, overlap_df)
    pickle.dump(overlap_df.drop('shape', axis=1), open('overlap_frac_{}.p'.format(year), 'wb'))
pickle.dump(overlap_df.drop('shape', axis=1), open('overlap_frac.p', 'wb'))

AL_01_2018
{'AL_01_2016': 1.0}
same
AL_02_2018
{'AL_02_2016': 1.0}
same
AL_03_2018
{'AL_03_2016': 1.0}
same
AL_04_2018
{'AL_04_2016': 1.0}
same
AL_05_2018
{'AL_05_2016': 1.0}
same
AL_06_2018
{'AL_06_2016': 1.0}
same
AL_07_2018
{'AL_07_2016': 1.0}
same
AK_01_2018
{'AK_01_2016': 0.999}
same
AZ_01_2018
{'AZ_01_2016': 1.0}
same
AZ_02_2018
{'AZ_02_2016': 1.0}
same
AZ_03_2018
{'AZ_03_2016': 1.0}
same
AZ_04_2018
{'AZ_04_2016': 1.0}
same
AZ_05_2018
{'AZ_05_2016': 1.0}
same
AZ_06_2018
{'AZ_06_2016': 1.0}
same
AZ_07_2018
{'AZ_07_2016': 1.0}
same
AZ_08_2018
{'AZ_08_2016': 1.0}
same
AZ_09_2018
{'AZ_09_2016': 0.999}
same
AR_01_2018
{'AR_01_2016': 1.0}
same
AR_02_2018
{'AR_02_2016': 1.0}
same
AR_03_2018
{'AR_03_2016': 1.0}
same
AR_04_2018
{'AR_04_2016': 1.0}
same
CA_01_2018
{'CA_01_2016': 1.0}
same
CA_10_2018
{'CA_10_2016': 1.0}
same
CA_11_2018
{'CA_11_2016': 1.0}
same
CA_12_2018
{'CA_12_2016': 1.0}
same
CA_13_2018
{'CA_13_2016': 1.0}
same
CA_14_2018
{'CA_14_2016': 1.0}
same
CA_15_2018
{'CA_15_2016'

MO_04_2018
{'MO_04_2016': 1.0}
same
MO_05_2018
{'MO_05_2016': 1.0}
same
MO_06_2018
{'MO_06_2016': 1.0}
same
MO_07_2018
{'MO_07_2016': 1.0}
same
MO_08_2018
{'MO_08_2016': 1.0}
same
MT_01_2018
{'MT_01_2016': 1.0}
same
NE_01_2018
{'NE_01_2016': 1.0}
same
NE_02_2018
{'NE_02_2016': 1.0}
same
NE_03_2018
{'NE_03_2016': 1.0}
same
NV_01_2018
{'NV_01_2016': 1.0}
same
NV_02_2018
{'NV_02_2016': 1.0}
same
NV_03_2018
{'NV_03_2016': 1.0}
same
NV_04_2018
{'NV_04_2016': 1.0}
same
NH_01_2018
{'NH_01_2016': 1.0}
same
NH_02_2018
{'NH_02_2016': 1.0}
same
NJ_01_2018
{'NJ_01_2016': 1.0}
same
NJ_10_2018
{'NJ_10_2016': 1.0}
same
NJ_11_2018
{'NJ_11_2016': 1.0}
same
NJ_12_2018
{'NJ_12_2016': 1.0}
same
NJ_02_2018
{'NJ_02_2016': 1.0}
same
NJ_03_2018
{'NJ_03_2016': 1.0}
same
NJ_04_2018
{'NJ_04_2016': 1.0}
same
NJ_05_2018
{'NJ_05_2016': 1.0}
same
NJ_06_2018
{'NJ_06_2016': 1.0}
same
NJ_07_2018
{'NJ_07_2016': 1.0}
same
NJ_09_2018
{'NJ_09_2016': 1.0}
same
NJ_08_2018
{'NJ_08_2016': 1.0}
same
NM_01_2018
{'NM_01_2016': 1.

WA_08_2018
{'WA_08_2016': 1.0}
same
WV_01_2018
{'WV_01_2016': 1.0}
same
WV_02_2018
{'WV_02_2016': 1.0}
same
WV_03_2018
{'WV_03_2016': 1.0}
same
WI_01_2018
{'WI_01_2016': 1.0}
same
WI_02_2018
{'WI_02_2016': 1.0}
same
WI_03_2018
{'WI_03_2016': 1.0}
same
WI_04_2018
{'WI_04_2016': 1.0}
same
WI_05_2018
{'WI_05_2016': 1.0}
same
WI_06_2018
{'WI_06_2016': 1.0}
same
WI_07_2018
{'WI_07_2016': 1.0}
same
WI_08_2018
{'WI_08_2016': 1.0}
same
WY_01_2018
{'WY_01_2016': 1.0}
same


In [60]:
overlap_df

Unnamed: 0,district,state,year,shape,overlap_frac,border_change
AK_01_2002,1,AK,2002,(POLYGON ((-135.9073108033481 58.3808390301714...,,
AL_01_2002,1,AL,2002,"(POLYGON ((-87.765152 31.29734599999996, -87.7...",,
AL_02_2002,2,AL,2002,(POLYGON ((-85.06820143377806 31.9895855427267...,,
AL_03_2002,3,AL,2002,"(POLYGON ((-86.32226900000001 32.367738, -86.3...",,
AL_04_2002,4,AL,2002,(POLYGON ((-86.39608200000001 33.8446100000000...,,
AL_05_2002,5,AL,2002,"(POLYGON ((-86.905654 34.453554, -86.905653 34...",,
AL_06_2002,6,AL,2002,"(POLYGON ((-86.962193 33.45120800000001, -86.9...",,
AL_07_2002,7,AL,2002,(POLYGON ((-87.31453600000002 33.2181709999999...,,
AR_01_2002,1,AR,2002,"(POLYGON ((-91.539429 35.09085000000002, -91.5...",,
AR_02_2002,2,AR,2002,"(POLYGON ((-92.072417 34.92463199999997, -92.0...",,


In [65]:
this_df = pickle.load(open('overlap_frac.p','rb'))

In [66]:
this_df

Unnamed: 0,district,state,year,overlap_frac,border_change
AK_01_2002,1,AK,2002,,
AL_01_2002,1,AL,2002,,
AL_02_2002,2,AL,2002,,
AL_03_2002,3,AL,2002,,
AL_04_2002,4,AL,2002,,
AL_05_2002,5,AL,2002,,
AL_06_2002,6,AL,2002,,
AL_07_2002,7,AL,2002,,
AR_01_2002,1,AR,2002,,
AR_02_2002,2,AR,2002,,


In [68]:
# if all of these miraculously run without errors, pickle them in a combined df
all_overlap_data_df = overlap_df.copy()
all_overlap_data_df['centroid'] = centroid_df['centroid']
pickle.dump(all_overlap_data_df.drop('shape', axis=1), open('all_overlap_data.p', 'wb'))

In [69]:
pickle.load(open('all_overlap_data.p', 'rb'))

Unnamed: 0,district,state,year,overlap_frac,border_change,centroid
AK_01_2002,1,AK,2002,,,"((-152.32365505132063, 64.29581383800975))"
AL_01_2002,1,AL,2002,,,"((-87.76820913055884, 31.124893211117662))"
AL_02_2002,2,AL,2002,,,"((-86.10683515960984, 31.724324604044607))"
AL_03_2002,3,AL,2002,,,"((-85.70565800289148, 33.05185901590957))"
AL_04_2002,4,AL,2002,,,"((-87.0956815605327, 34.073723262348935))"
AL_05_2002,5,AL,2002,,,"((-86.9342393256208, 34.74555774066489))"
AL_06_2002,6,AL,2002,,,"((-86.87955908053934, 33.27625932111756))"
AL_07_2002,7,AL,2002,,,"((-87.69017892743545, 32.438300087798105))"
AR_01_2002,1,AR,2002,,,"((-91.22679331388447, 35.5051402842957))"
AR_02_2002,2,AR,2002,,,"((-92.55099516370609, 35.07748095836055))"


In [30]:
master_df = pickle.load(open('../Datasets/master_index.p','rb'))
master_df.loc['TX_23_2018',:]

KeyError: 'the label [TX_23_2018] is not in the [index]'

In [None]:
# find if a district has changed between last year and this year
def check_if_districts_changed(this_year, district_df, threshold_for_change=0.1):
    """
    *** DEPRECIATED *** 
    might have some bugs. 
    use district_overlap to perform this function instead.
    
    Checks if each district changed more than the set threshold since the last election year.
    Change is a fraction of the district which overlaps with the previous year of that district (ranges 0-1).
    If the total area of the district increases, the overlap area is divided by the largest of the two years.
        (this prevents counting a district as unchanged if its area has increased)
    
    input:
        this_year -- (int) Elections years of the districts you want to check. Even years only. 
        district_df -- (pd dataframe) dataframe with default indices and shapefiles stored in 
                       a column named 'shape'
    output:
        district_df -- pandas data frame with index in format ST_00_YYYY (e.g. AL_01_2018) 
                       with change from last year stored in a column called 'border_change'.
                       Statuses:
                           'same' - this district has not changed at all since the previous year
                           'new' - indicates a district with this ID was not in this state last year
                           'changed' - indicates the borders have changed from last year, but that 
                                       this district was present in its state last year
    """
    # loop over states so you only have to compare districts in-state
    # otherwise, comparing each district to 434 other districts would be super slow
    for ST in state_abbrs: 
        prev_year = this_year-2
        # get the relevant districts
        districts = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==this_year)]
        districts_prev = district_df.loc[np.logical_and(district_df['state']==ST,
                                                 district_df['year']==prev_year)]
        # loop over districts in your current year
        for ind,district in districts.iterrows():
            # find previous year's district
            district_prev = districts_prev.loc[districts_prev['district']==district['district']]
            ind_prev = '{}{}'.format(ind[:6],prev_year)

            # determine whether district is new or borders have changed
            if ind_prev in districts_prev.index: # if the district didn't exist last year
                # then this district is new this year
                district_df.loc[ind,'border_change'] = 'new'
            else: 
                # check if the borders have changed
                shape_prev = district_prev['shape'].values[0]
                shape = district['shape']

                # check if shapes intersect with themselves
                if not (shape.is_valid and shape_prev.is_valid): 
                    # if they do, use buffer to correct this
                    print('The following polygons intersected with themselves. Attempting to buffer shape...')
                    if not shape.is_valid:
                        print(ind)
                        shape = shape.buffer(0)
                    if not shape_prev.is_valid:
                        print(ind_prev)
                        shape_prev = shape_prev.buffer(0)

                # calculate overlap percent
                area = shape.area # area of this district
                area_prev = shape_prev.area
                overlap_area = shape.intersection(shape_prev).area # area of overlap between shape and shape_prev
                
                if year==2016:
                    # we switch data sources in 2016
                    # Tiger Line districts include area over water, while UCLA districts do not.
                    # to avoid counting all of these districts as changed (because of the added 
                    # water area), we always divide by area_prev
                    frac_overlap = overlap_area/area_prev
                else:   
                    # fractional overlap between new and old district
                    # divide by the larger of the old or new district so that increasing district area 
                    # counts as a change
                    frac_overlap = overlap_area/np.max([area,area_prev]) 
                
                if (1.-frac_overlap) < threshold_for_change: 
                    # then district has not changed
                    district_df.loc[ind,'border_change'] = 'same'
                else:
                    # district has changed
                    district_df.loc[ind,'border_change'] = 'changed'
                    print(ind)
                    print(frac_overlap)
                    
    return district_df

In [None]:
# depreciated
# # find out if each district changed

# # you can only check a district if you have last year's data, so you read one more district than the data you make
# years_to_read = [2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018]
# years_to_check = [2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018] 

# # read in the data (make a fresh df)
# change_df = read_shapefiles(years_to_read, verbose=False)
# for year in years_to_check:
#     change_df = check_if_districts_changed(year, change_df)
# pickle.dump(change_df, open('change.p', 'wb'))