In [2]:
import pandas as pd
import json
import requests

## Migration data from IRS

In [2]:
# Load the IRS county-to-county migration table (path is relative to the
# notebook's working directory) and preview the first rows.
# NOTE(review): a later cell writes migration_df back to this same path, so the
# file read here may already carry columns added by a previous run — confirm.
migration_df = pd.read_csv('Data/IRS_county_to_county_migration_2017-2021.csv')
migration_df.head()

Unnamed: 0,year,dest_state_id,dest_county_id,orig_state_id,orig_county_id,dest_state,dest_county,returns,individuals,adj_gross_income,inflow_type,orig_county,orig_state
0,2017,1,1,96,0,AL,Autauga County Total Migration-US and Foreign,2400,5702,125069000,us_and_foreign,,
1,2017,1,1,97,0,AL,Autauga County Total Migration-US,2366,5573,122696000,us,,
2,2017,1,1,97,1,AL,Autauga County Total Migration-Same State,1516,3352,66187000,same_state,,
3,2017,1,1,97,3,AL,Autauga County Total Migration-Different State,850,2221,56510000,different_state,,
4,2017,1,1,98,0,AL,Autauga County Total Migration-Foreign,34,129,2373000,foreign,,


### Restructure the data to include new single-column county IDs

In [3]:
# Give each destination county one integer id ('new_county_id'): build a
# "<dest_state_id>_<dest_county_id>" key per row and factorize it, so ids are
# handed out in order of first appearance.
dest_key = migration_df['dest_state_id'].astype(str) + '_' + migration_df['dest_county_id'].astype(str)
county_codes = pd.factorize(dest_key)[0]
migration_df['new_county_id'] = county_codes

In [4]:
# create a new sorting column 'inflow_type'

# (orig_state_id, orig_county_id) pairs that mark aggregate "total" rows.
# These are checked before the non-migrant test.
_TOTAL_INFLOW_TYPES = {
    (96, 0): 'us_and_foreign',
    (97, 0): 'us',
    (97, 1): 'same_state',
    (97, 3): 'different_state',
    (98, 0): 'foreign',
}

# (orig_state_id, orig_county_id) pairs that mark grouped "other flows" rows.
# These are checked after the non-migrant test.
_OTHER_INFLOW_TYPES = {
    (57, 9): 'other_foreign',
    (58, 0): 'other_same_state',
    (59, 0): 'other_diff_state',
}


def get_inflow_type(row):
    """Classify one migration record by the kind of origin it describes.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or dict)
        Must provide 'orig_state_id', 'orig_county_id', 'dest_state_id'
        and 'dest_county_id'.

    Returns
    -------
    str
        One of 'us_and_foreign', 'us', 'same_state', 'different_state',
        'foreign', 'non-migrants', 'other_foreign', 'other_same_state',
        'other_diff_state', or 'us_counties' when no special code matches.
    """
    origin = (row['orig_state_id'], row['orig_county_id'])

    if origin in _TOTAL_INFLOW_TYPES:
        return _TOTAL_INFLOW_TYPES[origin]

    # County codes repeat across states, so a record is a non-migrant row only
    # when BOTH the state and the county of origin equal the destination.
    same_state = row['dest_state_id'] == row['orig_state_id']
    same_county = row['dest_county_id'] == row['orig_county_id']
    if same_state and same_county:
        return 'non-migrants'

    # Anything left is either a grouped "other" row or a real county-to-county flow.
    return _OTHER_INFLOW_TYPES.get(origin, 'us_counties')

# Label every record: run the classifier across the frame one row at a time
# and store the result in 'inflow_type'.
migration_df['inflow_type'] = migration_df.apply(get_inflow_type, axis='columns')

## Create new Counties table / df

In [None]:
import re

# Suffixes that follow the bare county name in 'dest_county'; everything from
# the first match onward is discarded (case-insensitive).
county_suffix = re.compile(' County| Parish| City| Borough| Municipality', flags=re.IGNORECASE)

# Keep the first record seen for each new_county_id. This replaces filtering the
# whole frame once per county and growing the result with concat in a loop;
# it yields the same rows in the same order (order of first appearance).
first_rows = migration_df.drop_duplicates(subset='new_county_id', keep='first')

# Build the counties table in one step, columns in their final order.
counties_df = pd.DataFrame({
    'county_id': first_rows['new_county_id'].to_numpy(),
    'county': [county_suffix.split(name)[0] for name in first_rows['dest_county']],
    'state': first_rows['dest_state'].to_numpy(),
    'fips_state': first_rows['dest_state_id'].to_numpy(),
    'fips_county': first_rows['dest_county_id'].to_numpy(),
})

In [None]:
# Two successive filters on the county name:
#   1. discard names containing 'census' (any case; missing names count as no match)
#   2. discard names starting with 'Total' (rows with no county identified,
#      from the 2017 and 2018 data)
counties_df = (
    counties_df
    .loc[lambda frame: ~frame['county'].str.contains('census', case=False, na=False)]
    .loc[lambda frame: ~frame['county'].str.startswith('Total')]
)

In [20]:
# Reload the saved county lookup; this replaces any counties_df built in memory.
# 'fips' is read as text: parsed as a number, codes for states 01-09 lose their
# leading zero (e.g. '08001' becomes 8001).
counties_df = pd.read_csv('Data/us_counties', dtype={'fips': str})


In [23]:
# Check the dtype of each column in the county table
counties_df.dtypes

county         object
state          object
fips_state      int64
fips_county     int64
fips           object
dtype: object

In [13]:
# Preview the county table
counties_df

Unnamed: 0,county,state,fips_state,fips_county,fips
0,Abbeville,SC,45,1,45001
1,Acadia,LA,22,1,22001
2,Accomack,VA,51,1,51001
3,Ada,ID,16,1,16001
4,Adair,KY,21,1,21001
...,...,...,...,...,...
3126,Yuma,CO,8,125,8125
3127,Yuma,AZ,4,27,4027
3128,Zapata,TX,48,505,48505
3129,Zavala,TX,48,507,48507


In [None]:
# Cast both FIPS component columns to integers, one column at a time
for fips_col in ('fips_state', 'fips_county'):
    counties_df[fips_col] = counties_df[fips_col].astype(int)

In [16]:
# Render the FIPS parts as fixed-width text: 2 digits for the state, 3 for the
# county. (The state was previously padded to 5 digits, which produced an
# 8-character combined code.) int() is applied first so the cell gives the same
# result when re-run on values that are already zero-padded strings.
counties_df['fips_state'] = counties_df['fips_state'].apply(lambda code: f"{int(code):02d}")
counties_df['fips_county'] = counties_df['fips_county'].apply(lambda code: f"{int(code):03d}")

# Combine the two parts into the 5-character county code stored in 'fips'
counties_df['fips'] = counties_df['fips_state'] + counties_df['fips_county']

In [14]:
# Show the counties whose 'fips' value, read as an integer, is below 10000
counties_df[counties_df['fips'].astype(int) < 10000]

Unnamed: 0,county,state,fips_state,fips_county,fips
11,Adams,CO,8,1,8001
25,Alameda,CA,6,1,6001
26,Alamosa,CO,8,3,8003
32,Aleutians East,AK,2,13,2013
52,Alpine,CA,6,3,6003
...,...,...,...,...,...
3114,Yell,AR,5,149,5149
3118,Yolo,CA,6,113,6113
3125,Yuba,CA,6,115,6115
3126,Yuma,CO,8,125,8125


In [22]:
# Store 'fips' as text, left-padded with zeros to 5 characters. A plain cast of
# an integer column would leave 4-character codes such as '8001'.
counties_df['fips'] = counties_df['fips'].apply(lambda code: str(code).zfill(5))

In [None]:
# Turn both FIPS component columns back into integers in a single assignment
fips_parts = ['fips_state', 'fips_county']
counties_df[fips_parts] = counties_df[fips_parts].astype(int)

## Migration table restructuring

In [None]:
# Perform a merge operation to add 'orig_county' and 'orig_state' names 
migration_df = migration_df.merge(
    counties_df, 
    how='left', 
    left_on=['orig_state_id', 'orig_county_id'], 
    right_on=['fips_state', 'fips_county']
)

In [None]:
# Remove the helper columns used for the join. errors='ignore' because some of
# them may be absent depending on how counties_df was produced: 'county_id'
# only exists when it was built in memory, and the combined code is named
# 'fips' here ('geoid' is kept in the list for older data).
helper_columns = ['county_id', 'new_county_id', 'fips_state', 'fips_county', 'fips', 'geoid']
migration_df = migration_df.drop(columns=helper_columns, errors='ignore')

# Name the joined columns after the origin. If migration_df was loaded from a
# file saved by a previous run it already has orig_county/orig_state; drop those
# stale copies first so the rename cannot create duplicate column labels.
origin_names = {'county': 'orig_county', 'state': 'orig_state'}
stale_columns = [
    new for old, new in origin_names.items()
    if old in migration_df.columns and new in migration_df.columns
]
migration_df = migration_df.drop(columns=stale_columns).rename(columns=origin_names)

# Blank out missing values (e.g. origins with no matching county)
migration_df = migration_df.fillna('')
migration_df.head(1)

Unnamed: 0,year,dest_state_id,dest_county_id,orig_state_id,orig_county_id,dest_state,dest_county,returns,individuals,adj_gross_income,inflow_type,orig_county,orig_state
0,2017,1,1,96,0,AL,Autauga County Total Migration-US and Foreign,2400,5702,125069000,us_and_foreign,,


In [None]:
# Number of rows in the restructured migration table
len(migration_df)

385052

In [None]:
# save migration_df to csv (without the index)
# NOTE(review): this is the same path the notebook reads migration_df from, so
# the source file is overwritten with the restructured table — confirm intended.
migration_df.to_csv('Data/IRS_county_to_county_migration_2017-2021.csv', index=False)

In [None]:
# Save counties_df to csv, sorted by county name, without the internal
# 'county_id' column. errors='ignore' because the column is absent when
# counties_df was loaded from the saved file rather than built in memory.
counties_df = counties_df.drop(columns='county_id', errors='ignore')
counties_df = counties_df.sort_values(by='county')
counties_df.to_csv('Data/us_counties', index=False)