# Data Source 2 - Crosswalk

# Crosswalk - 2023 & 2013 Versions

## Raw data source:
https://www.bls.gov/cew/classifications/areas/county-msa-csa-crosswalk.htm

- The page provides both the 2013 and the 2023 versions of the crosswalk.

## Select the year

In [1]:
# Crosswalk vintage to process. Used below as the lookup key under
# `crosswalk` in preprocessing.yaml and embedded in the output file names.
version = 2023

In [2]:
import sys
sys.path.append('../../scripts')  
import merging_utils
import yaml
import pandas as pd

# Load the preprocessing settings, then work from the section that belongs to
# the selected crosswalk version so each setting is a single lookup.
with open("../../config/preprocessing.yaml", "r") as config_file:
    preprocessing_config = yaml.safe_load(config_file)

crosswalk_cfg = preprocessing_config['crosswalk'][version]

# Column-name prefix, aggregation method and raw file for this version.
prefix = crosswalk_cfg['prefix']
agg_method = crosswalk_cfg['agg_method']
file_name = crosswalk_cfg['file_name']
path = '../../data/raw/'

# Names of the columns used when aggregating to the CBSA level.
cbsa_code = crosswalk_cfg['cbsa_code']
cbsa_title = crosswalk_cfg['cbsa_title']
cbsa_state = crosswalk_cfg['cbsa_state']

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [3]:
# Show the column prefix and aggregation method read from the config.
prefix, agg_method

('Crosswalk2023', 'concat')

In [4]:
# Show the configured CBSA column names (code, title, state).
cbsa_code, cbsa_title, cbsa_state

('Crosswalk2023_CBSA Code',
 'Crosswalk2023_CBSA Title',
 'Crosswalk2023_State Name')

In [5]:
# First pass: read the workbook with its top row as the header so the layout
# of the sheet can be inspected.
raw_workbook = path + file_name
df = pd.read_excel(raw_workbook, header=0)

## Set the header

In [6]:
# Preview the first rows to locate the row holding the real column names.
df.head(n=5)

Unnamed: 0,Table with row headers in column A and column headers in row 3,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,"List 1. CORE BASED STATISTICAL AREAS (CBSAs), ...",,,,,,,,,,,
1,CBSA Code,Metropolitan Division Code,CSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,Metropolitan Division Title,CSA Title,County/County Equivalent,State Name,FIPS State Code,FIPS County Code,Central/Outlying County
2,10100,,,"Aberdeen, SD",Micropolitan Statistical Area,,,Brown County,South Dakota,46,013,Central
3,10100,,,"Aberdeen, SD",Micropolitan Statistical Area,,,Edmunds County,South Dakota,46,045,Outlying
4,10140,,,"Aberdeen, WA",Micropolitan Statistical Area,,,Grays Harbor County,Washington,53,027,Central


In [7]:
# Print the first cell in full; the table preview truncates this text.
print(df.iat[0, 0])

List 1. CORE BASED STATISTICAL AREAS (CBSAs), METROPOLITAN DIVISIONS, AND COMBINED STATISTICAL AREAS (CSAs), JULY 2023


In [8]:
# Re-read the workbook using the third spreadsheet row (zero-based index 2) as
# the header. The two FIPS code columns are read as text because they are
# concatenated as strings further down.
fips_as_text = {'FIPS County Code': str, 'FIPS State Code': str}
df = pd.read_excel(path + file_name, header=2, dtype=fips_as_text)
df.head(n=3)

Unnamed: 0,CBSA Code,Metropolitan Division Code,CSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,Metropolitan Division Title,CSA Title,County/County Equivalent,State Name,FIPS State Code,FIPS County Code,Central/Outlying County
0,10100,,,"Aberdeen, SD",Micropolitan Statistical Area,,,Brown County,South Dakota,46,13,Central
1,10100,,,"Aberdeen, SD",Micropolitan Statistical Area,,,Edmunds County,South Dakota,46,45,Outlying
2,10140,,,"Aberdeen, WA",Micropolitan Statistical Area,,,Grays Harbor County,Washington,53,27,Central


## Delete the footnote

In [9]:
# Look at the last rows, where the sheet's footnotes end up.
df.tail(n=3)

Unnamed: 0,CBSA Code,Metropolitan Division Code,CSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,Metropolitan Division Title,CSA Title,County/County Equivalent,State Name,FIPS State Code,FIPS County Code,Central/Outlying County
1915,,,,,,,,,,,,
1916,Note: The Office of Management and Budget's (O...,,,,,,,,,,,
1917,"Source: File prepared by U.S. Census Bureau, P...",,,,,,,,,,,


In [10]:
# Remove the footer (a blank separator row plus the "Note:" and "Source:"
# lines). Those rows carry no FIPS codes, so filter on that instead of slicing
# the last three rows by position: a positional slice silently drops three
# real counties every time the cell is re-run, and it breaks if the footer
# length ever changes. `dropna` returns a new frame (not a slice of the
# original) and `reset_index` keeps the 0..n-1 index.
df = (
    df
    .dropna(subset=['FIPS State Code', 'FIPS County Code'])
    .reset_index(drop=True)
)

In [11]:
# Confirm the frame now ends with county rows rather than footnotes.
df.tail(n=3)

Unnamed: 0,CBSA Code,Metropolitan Division Code,CSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,Metropolitan Division Title,CSA Title,County/County Equivalent,State Name,FIPS State Code,FIPS County Code,Central/Outlying County
1912,49740,,,"Yuma, AZ",Metropolitan Statistical Area,,,Yuma County,Arizona,4,27,Central
1913,49780,,198.0,"Zanesville, OH",Micropolitan Statistical Area,,"Columbus-Marion-Zanesville, OH",Muskingum County,Ohio,39,119,Central
1914,49820,,,"Zapata, TX",Micropolitan Statistical Area,,,Zapata County,Texas,48,505,Central


In [12]:
# Summarise column dtypes and non-null counts after the footer rows are gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1915 entries, 0 to 1914
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   CBSA Code                                   1915 non-null   object 
 1   Metropolitan Division Code                  139 non-null    float64
 2   CSA Code                                    1341 non-null   float64
 3   CBSA Title                                  1915 non-null   object 
 4   Metropolitan/Micropolitan Statistical Area  1915 non-null   object 
 5   Metropolitan Division Title                 139 non-null    object 
 6   CSA Title                                   1341 non-null   object 
 7   County/County Equivalent                    1915 non-null   object 
 8   State Name                                  1915 non-null   object 
 9   FIPS State Code                             1915 non-null   object 
 10  FIPS County 

In [13]:
# Count missing values per column.
df.isna().sum()

CBSA Code                                        0
Metropolitan Division Code                    1776
CSA Code                                       574
CBSA Title                                       0
Metropolitan/Micropolitan Statistical Area       0
Metropolitan Division Title                   1776
CSA Title                                      574
County/County Equivalent                         0
State Name                                       0
FIPS State Code                                  0
FIPS County Code                                 0
Central/Outlying County                          0
dtype: int64

In [14]:
# Preview the cleaned county-level rows.
df.head(n=5)

Unnamed: 0,CBSA Code,Metropolitan Division Code,CSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,Metropolitan Division Title,CSA Title,County/County Equivalent,State Name,FIPS State Code,FIPS County Code,Central/Outlying County
0,10100,,,"Aberdeen, SD",Micropolitan Statistical Area,,,Brown County,South Dakota,46,13,Central
1,10100,,,"Aberdeen, SD",Micropolitan Statistical Area,,,Edmunds County,South Dakota,46,45,Outlying
2,10140,,,"Aberdeen, WA",Micropolitan Statistical Area,,,Grays Harbor County,Washington,53,27,Central
3,10180,,101.0,"Abilene, TX",Metropolitan Statistical Area,,"Abilene-Sweetwater, TX",Callahan County,Texas,48,59,Outlying
4,10180,,101.0,"Abilene, TX",Metropolitan Statistical Area,,"Abilene-Sweetwater, TX",Jones County,Texas,48,253,Outlying


## Creating a KEY
### The FIPS key is a 5-digit code: the 2-digit state FIPS code followed by the 3-digit county FIPS code

In [15]:
# Check dtypes before building the key: both FIPS code columns must be
# strings (object) for the concatenation below.
df.dtypes

CBSA Code                                      object
Metropolitan Division Code                    float64
CSA Code                                      float64
CBSA Title                                     object
Metropolitan/Micropolitan Statistical Area     object
Metropolitan Division Title                    object
CSA Title                                      object
County/County Equivalent                       object
State Name                                     object
FIPS State Code                                object
FIPS County Code                               object
Central/Outlying County                        object
dtype: object

In [16]:
# Build the 5-digit county key: 2-digit state code + 3-digit county code.
# The codes are zero-padded explicitly so the key stays 5 digits even if the
# workbook stores them as numbers and the leading zeros are lost on read
# (e.g. state "4" / county "27" must become "04027", not "427").
# `assign` returns a new frame, so this does not write into a slice of an
# earlier frame and the cell can be re-run safely.
df = df.assign(
    FIPS_Key=df['FIPS State Code'].str.zfill(2) + df['FIPS County Code'].str.zfill(3)
)

# Fail early if any key is malformed; downstream merges join on this column.
assert df['FIPS_Key'].str.fullmatch(r'\d{5}').all(), "FIPS_Key must be exactly 5 digits"

In [17]:
# Display the frame to confirm FIPS_Key was appended as the last column.
df

Unnamed: 0,CBSA Code,Metropolitan Division Code,CSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,Metropolitan Division Title,CSA Title,County/County Equivalent,State Name,FIPS State Code,FIPS County Code,Central/Outlying County,FIPS_Key
0,10100,,,"Aberdeen, SD",Micropolitan Statistical Area,,,Brown County,South Dakota,46,013,Central,46013
1,10100,,,"Aberdeen, SD",Micropolitan Statistical Area,,,Edmunds County,South Dakota,46,045,Outlying,46045
2,10140,,,"Aberdeen, WA",Micropolitan Statistical Area,,,Grays Harbor County,Washington,53,027,Central,53027
3,10180,,101.0,"Abilene, TX",Metropolitan Statistical Area,,"Abilene-Sweetwater, TX",Callahan County,Texas,48,059,Outlying,48059
4,10180,,101.0,"Abilene, TX",Metropolitan Statistical Area,,"Abilene-Sweetwater, TX",Jones County,Texas,48,253,Outlying,48253
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910,49700,,472.0,"Yuba City, CA",Metropolitan Statistical Area,,"Sacramento-Roseville, CA",Sutter County,California,06,101,Central,06101
1911,49700,,472.0,"Yuba City, CA",Metropolitan Statistical Area,,"Sacramento-Roseville, CA",Yuba County,California,06,115,Central,06115
1912,49740,,,"Yuma, AZ",Metropolitan Statistical Area,,,Yuma County,Arizona,04,027,Central,04027
1913,49780,,198.0,"Zanesville, OH",Micropolitan Statistical Area,,"Columbus-Marion-Zanesville, OH",Muskingum County,Ohio,39,119,Central,39119


## Adding a prefix 

In [18]:
# Print the column names before and after prefixing so the rename is visible.
print('Before adding prefixes: ' , df.columns)

# NOTE(review): `df` is overwritten with the helper's result, so re-running
# this cell presumably prefixes the columns a second time — confirm against
# merging_utils.add_prefix_all.
df = merging_utils.add_prefix_all(df, prefix=prefix)

print()
print('After adding prefixes: ' , df.columns)

Before adding prefixes:  Index(['CBSA Code', 'Metropolitan Division Code', 'CSA Code', 'CBSA Title',
       'Metropolitan/Micropolitan Statistical Area',
       'Metropolitan Division Title', 'CSA Title', 'County/County Equivalent',
       'State Name', 'FIPS State Code', 'FIPS County Code',
       'Central/Outlying County', 'FIPS_Key'],
      dtype='object')

After adding prefixes:  Index(['Crosswalk2023_CBSA Code', 'Crosswalk2023_Metropolitan Division Code',
       'Crosswalk2023_CSA Code', 'Crosswalk2023_CBSA Title',
       'Crosswalk2023_Metropolitan/Micropolitan Statistical Area',
       'Crosswalk2023_Metropolitan Division Title', 'Crosswalk2023_CSA Title',
       'Crosswalk2023_County/County Equivalent', 'Crosswalk2023_State Name',
       'Crosswalk2023_FIPS State Code', 'Crosswalk2023_FIPS County Code',
       'Crosswalk2023_Central/Outlying County', 'Crosswalk2023_FIPS_Key'],
      dtype='object')


In [20]:
# Write the prefixed county-level table to the interim data folder.
county_level_csv = f'../../data/interim/data2_county_level_{version}.csv'
df.to_csv(county_level_csv, index=False)

## Aggregate to the MSA level

## Merge type (`agg_method`): examples

In [21]:
# Example 1: summarise with the 'most_common' aggregation method.
example_common = merging_utils.get_msa_summary(
    size='metro',
    df=df,
    agg_method='most_common',
    prefix=prefix,
    cbsa_code=cbsa_code,
    cbsa_title=cbsa_title,
    cbsa_state=cbsa_state,
)

In [22]:
# Look up CBSA 17980 (Columbus, GA-AL) to see how its state name is
# represented under 'most_common'.
is_cbsa_17980 = example_common[cbsa_code] == '17980'
example_common.loc[is_cbsa_17980]

Unnamed: 0,Crosswalk2023_CBSA Code,Crosswalk2023_CBSA Title,Crosswalk2023_State Name
80,17980,"Columbus, GA-AL",Georgia


In [23]:
# Example 2: summarise with the 'concat' aggregation method.
example_concat = merging_utils.get_msa_summary(
    size='metro',
    df=df,
    agg_method='concat',
    prefix=prefix,
    cbsa_code=cbsa_code,
    cbsa_title=cbsa_title,
    cbsa_state=cbsa_state,
)

In [24]:
# Same CBSA as in the previous example, now under 'concat', for comparison.
is_cbsa_17980 = example_concat[cbsa_code] == '17980'
example_concat.loc[is_cbsa_17980]

Unnamed: 0,Crosswalk2023_CBSA Code,Crosswalk2023_CBSA Title,Crosswalk2023_State Name
80,17980,"Columbus, GA-AL","Alabama, Georgia"


## Select the Merge type

In [25]:
# Build the MSA-level summary with the aggregation method taken from the
# preprocessing config.
msa_summary = merging_utils.get_msa_summary(
    size='metro',
    df=df,
    agg_method=agg_method,
    prefix=prefix,
    cbsa_code=cbsa_code,
    cbsa_title=cbsa_title,
    cbsa_state=cbsa_state,
)

In [26]:
# Number of rows in the MSA-level summary.
len(msa_summary)

393

In [28]:
# Write the MSA-level summary to the interim data folder.
msa_level_csv = f'../../data/interim/data2_msa_level_{version}.csv'
msa_summary.to_csv(msa_level_csv, index=False)