## Merging

## Tweaks:
- 2023 crosswalk
- 2013 crosswalk

In [1]:
# Crosswalk version to process. It is used as the top-level key into the
# merging config and is embedded in the name of the exported CSV.
crosswalk_version = 2023

In [2]:
import yaml
import pandas as pd

# Folder holding the interim CSVs that the config's file names refer to.
data_path = "../../data/interim/"

# Parse the merge configuration (safe_load: plain data only, no arbitrary objects).
with open("../../config/merging.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Narrow the config to the selected crosswalk version, then pull out the
# per-dataset file names and the per-dataset join-key column names.
version_config = config[crosswalk_version]
file_list = version_config['files']
join_keys = version_config['join_keys']

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [3]:
# Read the four input tables named in the config, in a fixed order, and bind
# each one to its own DataFrame variable.
criteria_df, hazard_df, migration_df, housing_df = (
    pd.read_csv(data_path + file_list[dataset_name])
    for dataset_name in ("criteria", "hazard", "migration", "housing")
)

In [4]:
# Show which column each dataset is joined on (as read from the config).
join_keys

{'criteria': 'Crosswalk2023_CBSA Code',
 'hazard': 'Crosswalk2023_CBSA Code',
 'migration': 'Migration16to20_Metro Code of Geography A',
 'housing': 'HousingPI25_GEOID'}

In [5]:
# Preview the first three rows of the criteria table (the base of the merge).
criteria_df.head(3)

Unnamed: 0,Crosswalk2023_CBSA Code,Crosswalk2023_CBSA Title,Crosswalk2023_State Name
0,10180,"Abilene, TX",Texas
1,10380,"Aguadilla, PR",Puerto Rico
2,10420,"Akron, OH",Ohio


In [6]:
# Preview the first three rows of the hazard table.
hazard_df.head(3)

Unnamed: 0,Crosswalk2023_CBSA Code,NRI_AVLN_EALPE,NRI_CFLD_EALPE,NRI_CWAV_EALPE,NRI_ERQK_EALPE,NRI_HAIL_EALPE,NRI_HWAV_EALPE,NRI_HRCN_EALPE,NRI_ISTM_EALPE,NRI_LNDS_EALPE,...,NRI_RFLD_EXPB,NRI_SWND_EXPB,NRI_TRND_EXPB,NRI_TSUN_EXPB,NRI_VLCN_EXPB,NRI_WFIR_EXPB,NRI_WNTW_EXPB,NRI_BUILDVALUE,NRI_POPULATION,NRI_AREA
0,10100.0,0.0,0.0,2164489.0,6357.991,187611.434808,434705.3,0.0,222528.920926,1065.074257,...,650521500.0,13970630000.0,13970630000.0,0.0,0.0,1514336000.0,13970630000.0,13970630000.0,42217.0,2911.111469
1,10140.0,0.0,23314130.0,0.0,12807930.0,397.881741,2055.71,0.0,1739.400614,122577.699905,...,3281721000.0,15664850000.0,15664850000.0,8037370000.0,0.0,2640196000.0,15664850000.0,15664850000.0,75462.0,2246.050928
2,10180.0,0.0,0.0,485441.0,19906.67,483001.071482,1375110.0,2751.963861,118521.393091,40474.255904,...,4136462000.0,31702040000.0,31702040000.0,0.0,0.0,4755646000.0,31702040000.0,31702040000.0,176438.0,2785.545644


In [7]:
# Preview the first three rows of the migration table.
migration_df.head(3)

Unnamed: 0,Migration16to20_Metro Code of Geography A,Migration16to20_A_Inflow_Estimate,Migration16to20_A_Outflow_Estimate,Migration16to20_A_NetMigration_Estimate,Migration16to20_A_GrossMigration_Estimate,Migration16to20_A_Inflow_MOE,Migration16to20_A_Outflow_MOE,Migration16to20_A_NetMigration_MOE,Migration16to20_A_GrossMigration_MOE
0,10180,14074.0,12714.0,1360.0,26788.0,1184.831634,1675.577811,2061.050218,2043.932973
1,10380,3447.0,11070.0,-7623.0,14517.0,639.049294,1348.850251,1486.460225,1498.274674
2,10420,29065.0,29891.0,-826.0,58956.0,1763.152007,1855.395645,2517.326161,2601.782274


In [8]:
# Preview the first three rows of the housing table.
housing_df.head(3)

Unnamed: 0,HousingPI25_GEOID,HousingPI25_Metro Area,HousingPI25_GEOID.1,HousingPI25_1980,HousingPI25_1981,HousingPI25_1982,HousingPI25_1983,HousingPI25_1984,HousingPI25_1985,HousingPI25_1986,...,HousingPI25_2015,HousingPI25_2016,HousingPI25_2017,HousingPI25_2018,HousingPI25_2019,HousingPI25_2020,HousingPI25_2021,HousingPI25_2022,HousingPI25_2023,HousingPI25_2024
0,10540,"Albany, OR",10540,3.2,2.9,2.8,2.8,2.5,2.4,2.3,...,4.0,4.1,4.4,4.6,4.5,4.5,5.1,5.7,5.7,5.7
1,10180,"Abilene, TX",10180,3.1,2.9,2.9,2.9,2.7,2.6,2.6,...,3.3,3.2,3.3,3.3,3.4,3.6,3.8,4.2,3.9,4.0
2,10500,"Albany, GA",10500,3.2,3.3,3.2,3.1,3.0,3.0,3.0,...,3.0,3.1,3.2,3.3,3.2,3.4,3.6,4.1,4.2,4.3


In [9]:
# The input tables are already loaded; this cell only joins them.

# Column in the criteria table that every other table is matched against.
cbsa_col = join_keys['criteria']

# (config name, table, suffix applied to that table's overlapping columns),
# listed in the order the joins are applied.
merge_plan = (
    ('hazard', hazard_df, '_hazard'),
    ('migration', migration_df, '_migration'),
    ('housing', housing_df, '_housing'),
)

# The criteria table defines the rows of the result. Each other table is
# left-joined onto it, so criteria rows without a match are kept (their
# joined columns are filled with NaN).
combined_df = criteria_df.copy()
for source_name, source_df, overlap_suffix in merge_plan:
    combined_df = combined_df.merge(
        source_df,
        how='left',
        left_on=cbsa_col,
        right_on=join_keys[source_name],
        suffixes=('', overlap_suffix),
    )

# Right-hand join keys that are named differently from the criteria key are
# redundant after the join; a key sharing the criteria key's name is kept.
right_hand_keys = [join_keys[source_name] for source_name in ('hazard', 'migration', 'housing')]
redundant_keys = [key for key in right_hand_keys if key != cbsa_col]

# errors='ignore' tolerates keys that are not present as columns.
combined_df = combined_df.drop(columns=redundant_keys, errors='ignore')

# Move the CBSA code to the front, keeping the other columns in their order.
cols = combined_df.columns.tolist()
trailing_cols = [col for col in cols if col != cbsa_col]
combined_df = combined_df[[cbsa_col] + trailing_cols]

# Preview the merged result.
combined_df.head()


Unnamed: 0,Crosswalk2023_CBSA Code,Crosswalk2023_CBSA Title,Crosswalk2023_State Name,NRI_AVLN_EALPE,NRI_CFLD_EALPE,NRI_CWAV_EALPE,NRI_ERQK_EALPE,NRI_HAIL_EALPE,NRI_HWAV_EALPE,NRI_HRCN_EALPE,...,HousingPI25_2015,HousingPI25_2016,HousingPI25_2017,HousingPI25_2018,HousingPI25_2019,HousingPI25_2020,HousingPI25_2021,HousingPI25_2022,HousingPI25_2023,HousingPI25_2024
0,10180,"Abilene, TX",Texas,0.0,0.0,485441.029265,19906.67,483001.071482,1375110.0,2751.964,...,3.3,3.2,3.3,3.3,3.4,3.6,3.8,4.2,3.9,4.0
1,10380,"Aguadilla, PR",Puerto Rico,0.0,1170.838964,0.0,26587660.0,3692.19471,0.0,5023007.0,...,,,,,,,,,,
2,10420,"Akron, OH",Ohio,0.0,0.0,283878.596332,167456.8,118503.30189,425044.0,9718.143,...,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.8,2.9
3,10500,"Albany, GA",Georgia,0.0,0.0,0.0,160196.1,88644.910173,404583.2,223496.2,...,3.0,3.1,3.2,3.3,3.2,3.4,3.6,4.1,4.2,4.3
4,10540,"Albany, OR",Oregon,36919.52184,0.0,1485.855905,13423930.0,28589.341968,20239.23,0.0,...,4.0,4.1,4.4,4.6,4.5,4.5,5.1,5.7,5.7,5.7


In [10]:
# Write the merged table to a version-specific CSV, omitting the row index.
final_csv_path = f'../../data/final/final_data_{crosswalk_version}.csv'
combined_df.to_csv(final_csv_path, index=False)

In [11]:
# Display the merged frame; the notebook rendering elides the middle rows and
# columns of a frame this size.
combined_df

Unnamed: 0,Crosswalk2023_CBSA Code,Crosswalk2023_CBSA Title,Crosswalk2023_State Name,NRI_AVLN_EALPE,NRI_CFLD_EALPE,NRI_CWAV_EALPE,NRI_ERQK_EALPE,NRI_HAIL_EALPE,NRI_HWAV_EALPE,NRI_HRCN_EALPE,...,HousingPI25_2015,HousingPI25_2016,HousingPI25_2017,HousingPI25_2018,HousingPI25_2019,HousingPI25_2020,HousingPI25_2021,HousingPI25_2022,HousingPI25_2023,HousingPI25_2024
0,10180,"Abilene, TX",Texas,0.000000,0.000000,4.854410e+05,1.990667e+04,483001.071482,1.375110e+06,2.751964e+03,...,3.3,3.2,3.3,3.3,3.4,3.6,3.8,4.2,3.9,4.0
1,10380,"Aguadilla, PR",Puerto Rico,0.000000,1170.838964,0.000000e+00,2.658766e+07,3692.194710,0.000000e+00,5.023007e+06,...,,,,,,,,,,
2,10420,"Akron, OH",Ohio,0.000000,0.000000,2.838786e+05,1.674568e+05,118503.301890,4.250440e+05,9.718143e+03,...,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.8,2.9
3,10500,"Albany, GA",Georgia,0.000000,0.000000,0.000000e+00,1.601961e+05,88644.910173,4.045832e+05,2.234962e+05,...,3.0,3.1,3.2,3.3,3.2,3.4,3.6,4.1,4.2,4.3
4,10540,"Albany, OR",Oregon,36919.521840,0.000000,1.485856e+03,1.342393e+07,28589.341968,2.023923e+04,0.000000e+00,...,4.0,4.1,4.4,4.6,4.5,4.5,5.1,5.7,5.7,5.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,49420,"Yakima, WA",Washington,168354.956650,0.000000,1.635637e+06,6.451273e+06,15375.668773,1.456925e+05,0.000000e+00,...,3.6,3.9,4.1,4.3,4.5,4.9,5.4,5.4,5.0,4.9
389,49620,"York-Hanover, PA",Pennsylvania,0.000000,0.000000,3.291121e+05,1.178405e+05,2175.367687,8.589427e+05,4.652846e+05,...,2.7,2.7,2.7,2.7,2.7,2.8,2.9,3.1,3.2,3.3
390,49660,"Youngstown-Warren, OH",Ohio,0.000000,0.000000,3.043149e+06,1.030401e+05,67028.229971,9.982039e+05,2.075168e+04,...,1.8,1.9,1.9,2.0,2.1,2.3,2.5,2.7,2.8,3.1
391,49700,"Yuba City, CA",California,61532.536375,0.000000,0.000000e+00,7.949410e+06,13702.433003,1.572410e+06,0.000000e+00,...,5.6,5.7,6.2,6.2,6.5,6.7,7.5,8.0,7.1,7.0
