# Feature Engineering

In [1]:
import pandas as pd

merged_df_clean  = pd.read_csv('Cleaned_merged_dataset.csv')

In [2]:
# Relative income
merged_df_clean['state_income_mean'] = merged_df_clean.groupby('State_name')['Median_household_income'].transform('mean')
merged_df_clean['state_income_std'] = merged_df_clean.groupby('State_name')['Median_household_income'].transform('std')

merged_df_clean['income_relative'] = (merged_df_clean['Median_household_income'] - merged_df_clean['state_income_mean']) / merged_df_clean['state_income_std']
merged_df_clean.head(5)

Unnamed: 0,State_name,State_code,Year,Population,Sex,Median_household_income,Age_label,Age_group,state_income_mean,state_income_std,income_relative
0,Alabama,1,2021,5050380,0,53913.0,All ages,All ages,58599.666667,3474.419749,-1.348906
1,Alabama,1,2022,5073903,0,59674.0,All ages,All ages,58599.666667,3474.419749,0.309212
2,Alabama,1,2023,5108468,0,62212.0,All ages,All ages,58599.666667,3474.419749,1.039694
3,Alabama,1,2021,57503,0,53913.0,1,0-18,58599.666667,3474.419749,-1.348906
4,Alabama,1,2022,57244,0,59674.0,1,0-18,58599.666667,3474.419749,0.309212


The relative income measures deviation from long-term state income level.

In [3]:
# Demographic decomposition
merged_df_clean['state_year_total_pop'] = merged_df_clean.groupby(['State_name', 'Year'])['Population'].transform('sum')

merged_df_clean['population_share'] = merged_df_clean['Population'] / merged_df_clean['state_year_total_pop']
merged_df_clean.head(5)

Unnamed: 0,State_name,State_code,Year,Population,Sex,Median_household_income,Age_label,Age_group,state_income_mean,state_income_std,income_relative,state_year_total_pop,population_share
0,Alabama,1,2021,5050380,0,53913.0,All ages,All ages,58599.666667,3474.419749,-1.348906,20088038,0.251412
1,Alabama,1,2022,5073903,0,59674.0,All ages,All ages,58599.666667,3474.419749,0.309212,20180280,0.251429
2,Alabama,1,2023,5108468,0,62212.0,All ages,All ages,58599.666667,3474.419749,1.039694,20318102,0.251424
3,Alabama,1,2021,57503,0,53913.0,1,0-18,58599.666667,3474.419749,-1.348906,20088038,0.002863
4,Alabama,1,2022,57244,0,59674.0,1,0-18,58599.666667,3474.419749,0.309212,20180280,0.002837


The demographic decomposition captures demographic structure within each state-year.

In [4]:
# Stability indicator
merged_df_clean["population_volatility"] = (
    merged_df_clean.groupby("State_name")["Population"]
    .transform("std")
)

merged_df_clean.head(5)

Unnamed: 0,State_name,State_code,Year,Population,Sex,Median_household_income,Age_label,Age_group,state_income_mean,state_income_std,income_relative,state_year_total_pop,population_share,population_volatility
0,Alabama,1,2021,5050380,0,53913.0,All ages,All ages,58599.666667,3474.419749,-1.348906,20088038,0.251412,381851.938371
1,Alabama,1,2022,5073903,0,59674.0,All ages,All ages,58599.666667,3474.419749,0.309212,20180280,0.251429,381851.938371
2,Alabama,1,2023,5108468,0,62212.0,All ages,All ages,58599.666667,3474.419749,1.039694,20318102,0.251424,381851.938371
3,Alabama,1,2021,57503,0,53913.0,1,0-18,58599.666667,3474.419749,-1.348906,20088038,0.002863,381851.938371
4,Alabama,1,2022,57244,0,59674.0,1,0-18,58599.666667,3474.419749,0.309212,20180280,0.002837,381851.938371


The population volatility measures demographic stability within each state.

In [5]:
# Scale interaction
merged_df_clean["income_per_population"] = merged_df_clean["Median_household_income"] / merged_df_clean["Population"]

merged_df_clean.head(5)

Unnamed: 0,State_name,State_code,Year,Population,Sex,Median_household_income,Age_label,Age_group,state_income_mean,state_income_std,income_relative,state_year_total_pop,population_share,population_volatility,income_per_population
0,Alabama,1,2021,5050380,0,53913.0,All ages,All ages,58599.666667,3474.419749,-1.348906,20088038,0.251412,381851.938371,0.010675
1,Alabama,1,2022,5073903,0,59674.0,All ages,All ages,58599.666667,3474.419749,0.309212,20180280,0.251429,381851.938371,0.011761
2,Alabama,1,2023,5108468,0,62212.0,All ages,All ages,58599.666667,3474.419749,1.039694,20318102,0.251424,381851.938371,0.012178
3,Alabama,1,2021,57503,0,53913.0,1,0-18,58599.666667,3474.419749,-1.348906,20088038,0.002863,381851.938371,0.937568
4,Alabama,1,2022,57244,0,59674.0,1,0-18,58599.666667,3474.419749,0.309212,20180280,0.002837,381851.938371,1.04245


This captures income scale relative to population size.