## Data Source - Migration (Origin & Destination)

### Raw Data Source:
https://www.census.gov/data/tables/2020/demo/geographic-mobility/metro-to-metro-migration.html

## Settings

In [1]:
import pandas as pd
import numpy as np
# Disable scientific notation & add commas
pd.set_option('display.float_format', '{:,.2f}'.format)
# Show all columns without truncation
pd.set_option('display.max_columns', None)
# # Show full column width (helpful for long strings)
# pd.set_option('display.max_colwidth', None)
# # Limit the number of rows displayed in output
# pd.set_option('display.max_rows', 20)

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
import pandas as pd
import yaml
migration_raw_data = pd.read_csv('../../data/interim/data4_migration_before_msa_level.csv')
migration_raw_data.head()

  migration_raw_data = pd.read_csv('../../data/interim/data4_migration_before_msa_level.csv')


Unnamed: 0,Metro Code of Geography A,Metro Code of Geography B,Metro Statistical Area of Geography A,Metro Statistical Area of Geography B,Flow from Geography B to Geography A_Estimate,Flow from Geography B to Geography A_MOE,Counterflow from Geography A to Geography B_Estimate,Counterflow from Geography A to Geography B_MOE,Net Migration from Geography B to Geography A_Estimate,Net Migration from Geography B to Geography A_MOE,Gross Migration between Geography A and Geography B_Estimate,Gross Migration between Geography A and Geography B_MOE
0,10180,10740,"Abilene, TX Metro Area","Albuquerque, NM Metro Area",55.0,65.0,0.0,,55.0,65.0,55.0,65.0
1,10180,11100,"Abilene, TX Metro Area","Amarillo, TX Metro Area",461.0,292.0,118.0,88.0,343.0,294.0,579.0,316.0
2,10180,11260,"Abilene, TX Metro Area","Anchorage, AK Metro Area",0.0,,35.0,34.0,-35.0,34.0,35.0,34.0
3,10180,12060,"Abilene, TX Metro Area","Atlanta-Sandy Springs-Alpharetta, GA Metro Area",14.0,20.0,83.0,120.0,-69.0,121.0,97.0,122.0
4,10180,12220,"Abilene, TX Metro Area","Auburn-Opelika, AL Metro Area",16.0,20.0,0.0,,16.0,20.0,16.0,20.0


In [3]:
with open("../../config/preprocessing.yaml", "r") as f:
    config = yaml.safe_load(f)
    
exclude_regions = config['migration']['exclude_region']
print(exclude_regions)

# Applying exclude_regions
filtered_df = migration_raw_data[
    ~migration_raw_data["Metro Code of Geography A"].isin(exclude_regions) &
    ~migration_raw_data["Metro Code of Geography B"].isin(exclude_regions)
]

filtered_df['Flow from Geography B to Geography A_Estimate'].isnull().sum()

['AFR--', 'ASI--', 'CAM--', 'CAR--', 'EUR--', 'ISL--', 'NAM--', 'OCE--', 'SAM--', '99999']


0

In [4]:
# 처음 6개 컬럼만 선택
filtered_df = filtered_df.iloc[:, :6].copy()

# 컬럼명 바꾸기
filtered_df.columns = [
    "Destination_Code",       # Metro Code of Geography A
    "Origin_Code",  # Metro Code of Geography B
    "Destination_Name",       # Metro Statistical Area of Geography A
    "Origin_Name",  # Metro Statistical Area of Geography B
    "Flow",           # Flow from Geography B to Geography A_Estimate
    "Flow_MOE"        # Flow from Geography B to Geography A_MOE
]

filtered_df.head()

Unnamed: 0,Destination_Code,Origin_Code,Destination_Name,Origin_Name,Flow,Flow_MOE
0,10180,10740,"Abilene, TX Metro Area","Albuquerque, NM Metro Area",55.0,65.0
1,10180,11100,"Abilene, TX Metro Area","Amarillo, TX Metro Area",461.0,292.0
2,10180,11260,"Abilene, TX Metro Area","Anchorage, AK Metro Area",0.0,
3,10180,12060,"Abilene, TX Metro Area","Atlanta-Sandy Springs-Alpharetta, GA Metro Area",14.0,20.0
4,10180,12220,"Abilene, TX Metro Area","Auburn-Opelika, AL Metro Area",16.0,20.0


## Import Crosswalk Data

In [5]:
version = 2023
crosswalk_path = f'../../data/interim/data2_msa_level_{version}.csv'
crosswalk_data = pd.read_csv(crosswalk_path)
crosswalk_data

Unnamed: 0,Crosswalk2023_CBSA Code,Crosswalk2023_CBSA Title,Crosswalk2023_State Name
0,10180,"Abilene, TX",Texas
1,10380,"Aguadilla, PR",Puerto Rico
2,10420,"Akron, OH",Ohio
3,10500,"Albany, GA",Georgia
4,10540,"Albany, OR",Oregon
...,...,...,...
388,49420,"Yakima, WA",Washington
389,49620,"York-Hanover, PA",Pennsylvania
390,49660,"Youngstown-Warren, OH",Ohio
391,49700,"Yuba City, CA",California


## Merge

In [6]:
# Ensure both codes are strings
filtered_df['Origin_Code'] = filtered_df['Origin_Code'].astype(str)
filtered_df['Destination_Code'] = filtered_df['Destination_Code'].astype(str)
crosswalk_data['Crosswalk2023_CBSA Code'] = crosswalk_data['Crosswalk2023_CBSA Code'].astype(str)

# Origin merge
filtered_df = filtered_df.merge(
    crosswalk_data[['Crosswalk2023_CBSA Code', 'Crosswalk2023_State Name']],
    left_on='Origin_Code',
    right_on='Crosswalk2023_CBSA Code',
    how='left'
).rename(columns={'Crosswalk2023_State Name': 'Origin_Crosswalk2023_State Name'}) \
 .drop(columns=['Crosswalk2023_CBSA Code'])

# Destination merge
filtered_df = filtered_df.merge(
    crosswalk_data[['Crosswalk2023_CBSA Code', 'Crosswalk2023_State Name']],
    left_on='Destination_Code',
    right_on='Crosswalk2023_CBSA Code',
    how='left'
).rename(columns={'Crosswalk2023_State Name': 'Destination_Crosswalk2023_State Name'}) \
 .drop(columns=['Crosswalk2023_CBSA Code'])


In [7]:
filtered_df.sample(10)

Unnamed: 0,Destination_Code,Origin_Code,Destination_Name,Origin_Name,Flow,Flow_MOE,Origin_Crosswalk2023_State Name,Destination_Crosswalk2023_State Name
49076,40060,34980,"Richmond, VA Metro Area","Nashville-Davidson--Murfreesboro--Franklin, TN...",87.0,59.0,Tennessee,Virginia
34660,31180,13380,"Lubbock, TX Metro Area","Bellingham, WA Metro Area",49.0,57.0,Washington,Texas
5262,13220,12060,"Beckley, WV Metro Area","Atlanta-Sandy Springs-Alpharetta, GA Metro Area",81.0,75.0,Georgia,West Virginia
20884,22540,40340,"Fond du Lac, WI Metro Area","Rochester, MN Metro Area",16.0,24.0,Minnesota,Wisconsin
46648,38940,27180,"Port St. Lucie, FL Metro Area","Jackson, TN Metro Area",0.0,,Tennessee,Florida
5204,13140,35620,"Beaumont-Port Arthur, TX Metro Area","New York-Newark-Jersey City, NY-NJ-PA Metro Area",149.0,151.0,"New Jersey, New York",Texas
50414,40900,15680,"Sacramento-Roseville-Folsom, CA Metro Area","California-Lexington Park, MD Metro Area",27.0,42.0,,California
49887,40380,21660,"Rochester, NY Metro Area","Eugene-Springfield, OR Metro Area",0.0,,Oregon,New York
15044,19100,19500,"Dallas-Fort Worth-Arlington, TX Metro Area","Decatur, IL Metro Area",85.0,100.0,Illinois,Texas
10765,16820,13820,"Charlottesville, VA Metro Area","Birmingham-Hoover, AL Metro Area",47.0,96.0,Alabama,Virginia


## Divide the volume of flow into ratio (Multiple States)

In [8]:
assign_flow_df = filtered_df.copy()

# Origin split & explode
assign_flow_df = assign_flow_df.assign(Origin_State=assign_flow_df["Origin_Crosswalk2023_State Name"].str.split(",")).explode("Origin_State")

# Destination split & explode
assign_flow_df = assign_flow_df.assign(Destination_State=assign_flow_df["Destination_Crosswalk2023_State Name"].str.split(",")).explode("Destination_State")

# 공백 제거
assign_flow_df["Origin_State"] = assign_flow_df["Origin_State"].str.strip()
assign_flow_df["Destination_State"] = assign_flow_df["Destination_State"].str.strip()

# Flow 균등 분할
counts = assign_flow_df.groupby(assign_flow_df.index).size()
assign_flow_df["Flow"] = assign_flow_df["Flow"] / counts
assign_flow_df["Flow_MOE"] = assign_flow_df["Flow_MOE"] / counts


In [9]:
assign_flow_df[21120:21125]

Unnamed: 0,Destination_Code,Origin_Code,Destination_Name,Origin_Name,Flow,Flow_MOE,Origin_Crosswalk2023_State Name,Destination_Crosswalk2023_State Name,Origin_State,Destination_State
15314,19100,48660,"Dallas-Fort Worth-Arlington, TX Metro Area","Wichita Falls, TX Metro Area",1012.0,281.0,Texas,Texas,Texas,Texas
15315,19100,48700,"Dallas-Fort Worth-Arlington, TX Metro Area","Williamsport, PA Metro Area",65.0,92.0,Pennsylvania,Texas,Pennsylvania,Texas
15316,19100,48900,"Dallas-Fort Worth-Arlington, TX Metro Area","Wilmington, NC Metro Area",0.0,,North Carolina,Texas,North Carolina,Texas
15317,19100,49020,"Dallas-Fort Worth-Arlington, TX Metro Area","Winchester, VA-WV Metro Area",11.0,11.5,"Virginia, West Virginia",Texas,Virginia,Texas
15317,19100,49020,"Dallas-Fort Worth-Arlington, TX Metro Area","Winchester, VA-WV Metro Area",11.0,11.5,"Virginia, West Virginia",Texas,West Virginia,Texas


## Map Region

In [10]:
import yaml

with open("../../config/Analysis.yaml", "r") as f:
    config = yaml.safe_load(f)

mapping = config['subgroup']['us_census']

In [11]:
assign_flow_df["Origin_Region"] = assign_flow_df["Origin_State"].map( 
        lambda s: mapping.get(s, {}).get("region")
)

assign_flow_df["Destination_Region"] = assign_flow_df["Destination_State"].map( 
        lambda s: mapping.get(s, {}).get("region")
)

In [12]:
assign_flow_df.groupby(["Origin_Region", "Destination_Region"], as_index=False).sum()[['Origin_Region', 'Destination_Region', 'Flow']]

Unnamed: 0,Origin_Region,Destination_Region,Flow
0,Midwest,Midwest,843004.56
1,Midwest,Northeast,67503.25
2,Midwest,South,389184.53
3,Midwest,West,219001.17
4,Northeast,Midwest,87937.17
5,Northeast,Northeast,681243.5
6,Northeast,South,475040.83
7,Northeast,West,163302.0
8,South,Midwest,284923.28
9,South,Northeast,277033.25


In [13]:
assign_flow_df.columns

Index(['Destination_Code', 'Origin_Code', 'Destination_Name', 'Origin_Name',
       'Flow', 'Flow_MOE', 'Origin_Crosswalk2023_State Name',
       'Destination_Crosswalk2023_State Name', 'Origin_State',
       'Destination_State', 'Origin_Region', 'Destination_Region'],
      dtype='object')

In [14]:
cols = ['Origin_Code', 'Destination_Code', 'Origin_Name', 'Destination_Name', 
        'Origin_State', 'Destination_State', 'Origin_Region', 'Destination_Region',
        'Flow', 'Flow_MOE']

assign_flow_df = assign_flow_df[cols]

assign_flow_df.to_csv('../../data/interim/data4_migration_origin_destination.csv')

In [15]:
assign_flow_df

Unnamed: 0,Origin_Code,Destination_Code,Origin_Name,Destination_Name,Origin_State,Destination_State,Origin_Region,Destination_Region,Flow,Flow_MOE
0,10740,10180,"Albuquerque, NM Metro Area","Abilene, TX Metro Area",New Mexico,Texas,West,South,55.00,65.00
1,11100,10180,"Amarillo, TX Metro Area","Abilene, TX Metro Area",Texas,Texas,South,South,461.00,292.00
2,11260,10180,"Anchorage, AK Metro Area","Abilene, TX Metro Area",Alaska,Texas,West,South,0.00,
3,12060,10180,"Atlanta-Sandy Springs-Alpharetta, GA Metro Area","Abilene, TX Metro Area",Georgia,Texas,South,South,14.00,20.00
4,12220,10180,"Auburn-Opelika, AL Metro Area","Abilene, TX Metro Area",Alabama,Texas,South,South,16.00,20.00
...,...,...,...,...,...,...,...,...,...,...
65127,48300,49740,"Wenatchee, WA Metro Area","Yuma, AZ Metro Area",Washington,Arizona,West,West,14.00,23.00
65128,48620,49740,"Wichita, KS Metro Area","Yuma, AZ Metro Area",Kansas,Arizona,Midwest,West,7.00,11.00
65129,48660,49740,"Wichita Falls, TX Metro Area","Yuma, AZ Metro Area",Texas,Arizona,South,West,19.00,29.00
65130,49420,49740,"Yakima, WA Metro Area","Yuma, AZ Metro Area",Washington,Arizona,West,West,39.00,44.00
