In [2]:
import requests
import pandas as pd

# 1. API endpoint
url = "https://api.census.gov/data/2023/pep/charv"

# 2. Parameters
params = {
    "get": "NAME,STATE,YEAR,POP,AGE,SEX",
    "for": "state:*"
}

# 3. Send request
response = requests.get(url, params=params)

# 4. Parse JSON
data = response.json()

# 5. Convert to DataFrame
pop_df = pd.DataFrame(data[1:], columns=data[0])

pop_df.sample(5)


Unnamed: 0,NAME,STATE,YEAR,POP,AGE,SEX,state
67154,Virginia,51,2023,54403,6200,1,51
13550,Delaware,10,2020,6076,2600,2,10
67822,Virginia,51,2021,29115,7600,2,51
22096,Idaho,16,2020,12083,600,2,16
44190,North Carolina,37,2020,72092,1300,1,37


In [3]:
pop_df.to_csv(
    "population_dataset.csv",
    index=False
)

In [4]:
pop_df.sample(5)

Unnamed: 0,NAME,STATE,YEAR,POP,AGE,SEX,state
70543,West Virginia,54,2022,10783,4800,1,54
19632,Hawaii,15,2021,77374,7074,0,15
43006,New York,36,2020,113291,900,2,36
62176,Texas,48,2020,212159,1400,2,48
57804,South Dakota,46,2023,9527,5100,0,46


In [5]:
print(pop_df.shape)
print(type(pop_df.AGE[0]))

(90480, 7)
<class 'str'>


In [6]:
def fetch_acs_median_income_state(year):
    """
    grab ACS 1-year meadian household income in past 12 months for all states.
    endpoint: https://api.census.gov/data/{year}/acs/acs1
    variable: B19013_001E
    """
    url = f"https://api.census.gov/data/{year}/acs/acs1"
    params = {
        "get": "NAME,B19013_001E",
        "for": "state:*",
    }

    r = requests.get(url, params=params, timeout=60)

    # raise error if not 200
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code}: {r.text[:300]}")

    data = r.json()
    df = pd.DataFrame(data[1:], columns=data[0])

    # clean
    df["YEAR"] = year
    df = df.rename(columns={"state": "STATE", "B19013_001E": "median_hh_income"})
    df["STATE"] = df["STATE"].astype(str).str.zfill(2)
    df["median_hh_income"] = pd.to_numeric(df["median_hh_income"], errors="coerce")

    return df[["STATE", "YEAR", "NAME", "median_hh_income"]]


years = [2021, 2022, 2023]
acs_income = pd.concat([fetch_acs_median_income_state(y) for y in years], ignore_index=True)

acs_income.head()


Unnamed: 0,STATE,YEAR,NAME,median_hh_income
0,1,2021,Alabama,53913
1,72,2021,Puerto Rico,22237
2,4,2021,Arizona,69056
3,5,2021,Arkansas,52528
4,6,2021,California,84907


In [7]:
acs_income.to_csv(
    "median_income_state.csv",
    index=False
)

In [8]:
print(len(acs_income.STATE.unique()))
print(acs_income.STATE.unique())


52
<StringArray>
['01', '72', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16',
 '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29',
 '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42',
 '44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56', '02']
Length: 52, dtype: str


In [9]:
pop_df["STATE"] = pop_df["STATE"].astype(str).str.zfill(2)
acs_income["STATE"] = acs_income["STATE"].astype(str).str.zfill(2)

pop_df["YEAR"] = pop_df["YEAR"].astype(int)
acs_income["YEAR"] = acs_income["YEAR"].astype(int)

merged_df = pop_df.merge(
    acs_income[["STATE", "YEAR", "median_hh_income"]],
    on=["STATE", "YEAR"],
    how="left"
)

merged_df.to_csv(
    "merged_dataset.csv",
    index=False
)

In [10]:
merged_df.sample(5)

Unnamed: 0,NAME,STATE,YEAR,POP,AGE,SEX,state,median_hh_income
83677,Iowa,19,2020,21286,2400,1,19,
17661,Georgia,13,2020,144584,3500,0,13,
46345,North Dakota,38,2020,2104,7400,1,38,
74006,Wyoming,56,2020,72551,4564,1,56,
43893,North Carolina,37,2022,137419,5300,0,37,67481.0


Data Cleaning

In [20]:
print(merged_df.shape)

print(merged_df.columns)

print(merged_df.describe(include='all'))

(90480, 8)
Index(['NAME', 'STATE', 'YEAR', 'POP', 'AGE', 'SEX', 'state',
       'median_hh_income'],
      dtype='str')
           NAME  STATE          YEAR    POP    AGE    SEX  state  \
count     90480  90480  90480.000000  90480  90480  90480  90480   
unique       52     52           NaN  67557    116      3     52   
top     Alabama     01           NaN   6037   0000      0     01   
freq       1740   1740           NaN      9    780  30160   1740   
mean        NaN    NaN   2021.200000    NaN    NaN    NaN    NaN   
std         NaN    NaN      1.166197    NaN    NaN    NaN    NaN   
min         NaN    NaN   2020.000000    NaN    NaN    NaN    NaN   
25%         NaN    NaN   2020.000000    NaN    NaN    NaN    NaN   
50%         NaN    NaN   2021.000000    NaN    NaN    NaN    NaN   
75%         NaN    NaN   2022.000000    NaN    NaN    NaN    NaN   
max         NaN    NaN   2023.000000    NaN    NaN    NaN    NaN   

        median_hh_income  
count       54288.000000  
unique   

In [22]:
merged_df = merged_df.rename(columns=({
    'NAME': 'State_name',
    'STATE': 'State_code',
    'YEAR': 'Year',
    'POP': 'Population',
    'AGE': 'Age',
    'SEX': 'Sex',
    'median_hh_income': 'Median_household_income'
}
))

# Drop duplicate 'state' column
if 'state' in merged_df.columns:
    merged_df = merged_df.drop(columns=['state'])

merged_df.head(5)

Unnamed: 0,State_name,State_code,Year,Population,Age,Sex,Median_household_income
0,Alabama,1,2020,5024294,0,0,
1,Alabama,1,2020,5031864,0,0,
2,Alabama,1,2021,5050380,0,0,53913.0
3,Alabama,1,2022,5073903,0,0,59674.0
4,Alabama,1,2023,5108468,0,0,62212.0


In [29]:
# Alter data type
merged_df['Year'] = merged_df['Year'].astype(int)
merged_df['Population'] = pd.to_numeric(merged_df['Population'], errors='coerce')
merged_df['State_code'] = merged_df['State_code'].astype(str).str.zfill(2)
merged_df['Median_household_income'] = pd.to_numeric(merged_df['Median_household_income'], 
                                                     errors='coerce')

for col in ['State_name', 'Sex']:
    merged_df[col] = merged_df[col].astype('category')

merged_df['Age'] = merged_df['Age'].astype('str')
merged_df['age_int'] = pd.to_numeric(merged_df['Age'], errors='coerce')

merged_df['age_type'] = merged_df['age_int'].apply(
    lambda x: 'single_year_age' if 0 <= x <= 85 else 'group_code'
)

merged_df.head(20)


Unnamed: 0,State_name,State_code,Year,Population,Age,Sex,Median_household_income,age_int,age_type
0,Alabama,1,2020,5024294,0,0,,0,single_year_age
1,Alabama,1,2020,5031864,0,0,,0,single_year_age
2,Alabama,1,2021,5050380,0,0,53913.0,0,single_year_age
3,Alabama,1,2022,5073903,0,0,59674.0,0,single_year_age
4,Alabama,1,2023,5108468,0,0,62212.0,0,single_year_age
5,Alabama,1,2020,57354,1,0,,1,single_year_age
6,Alabama,1,2020,57034,1,0,,1,single_year_age
7,Alabama,1,2021,56741,1,0,53913.0,1,single_year_age
8,Alabama,1,2022,57666,1,0,59674.0,1,single_year_age
9,Alabama,1,2023,57885,1,0,62212.0,1,single_year_age


In [30]:
# Missing value check
merged_df.isna().sum()

State_name                     0
State_code                     0
Year                           0
Population                     0
Age                            0
Sex                            0
Median_household_income    36192
age_int                        0
age_type                       0
dtype: int64

In [38]:
# State-wise interpolation for median household income
merged_df = merged_df.sort_values(by=['State_code', 'Year'])
merged_df['Median_household_income'] = merged_df.groupby('State_code')['Median_household_income'].transform(
                                        lambda x: x.interpolate(method='linear', limit_direction='both')    
                                    )

merged_df['Median_household_income'] = merged_df['Median_household_income'].fillna(
                                        merged_df.groupby('State_code')['Median_household_income'].
                                        transform('median')
                                        )

merged_df.head(25)

Unnamed: 0,State_name,State_code,Year,Population,Age,Sex,Median_household_income,age_int,age_type
0,Alabama,1,2020,5024294,0,0,59674.0,0,single_year_age
1,Alabama,1,2020,5031864,0,0,59674.0,0,single_year_age
5,Alabama,1,2020,57354,1,0,59674.0,1,single_year_age
6,Alabama,1,2020,57034,1,0,59674.0,1,single_year_age
10,Alabama,1,2020,58484,100,0,59674.0,100,group_code
11,Alabama,1,2020,58237,100,0,59674.0,100,group_code
15,Alabama,1,2020,60105,200,0,59674.0,200,group_code
16,Alabama,1,2020,59919,200,0,59674.0,200,group_code
20,Alabama,1,2020,60738,300,0,59674.0,300,group_code
21,Alabama,1,2020,60466,300,0,59674.0,300,group_code


In [None]:
# Remove duplicate rows

merged_df = merged_df.drop_duplicates()

print(merged_df.duplicated().sum())

0


(90417, 9)

In [45]:
# Outlier detection for population
import numpy as np

merged_df['Population_z'] = merged_df.groupby('State_code')['Population'].transform(
                            lambda x: (x - x.mean()) / x.std()
                            )

merged_df_outliers = merged_df[merged_df['Population_z'].abs() > 3]
merged_df_outliers.head()

Unnamed: 0,State_name,State_code,Year,Population,Age,Sex,Median_household_income,age_int,age_type,Population_z
0,Alabama,1,2020,5024294,0,0,59674.0,0,single_year_age,7.684841
1,Alabama,1,2020,5031864,0,0,59674.0,0,single_year_age,7.696959
35,Alabama,1,2020,4787613,499,0,59674.0,499,group_code,7.305944
36,Alabama,1,2020,4796208,499,0,59674.0,499,group_code,7.319704
130,Alabama,1,2020,4023421,1699,0,59674.0,1699,group_code,6.082569


In [47]:
merged_df.to_csv('Cleaned_merged_dataset.csv', index=False)