In [1]:
import pandas as pd
import copy
import numpy as np
import warnings
warnings.simplefilter("ignore")

# Load The Data

In [2]:
# Read the tab-separated PUMS extract from the working directory.
raw_data = pd.read_csv('PUMSsubset_SanDiego-2017-2021.txt', sep='\t')

# Adjust The Household Income 

In [3]:
def multiply_income(row):
    """Scale a row's household income (``HINCP``) by a year-specific ratio.

    The income is multiplied by ``344.416`` and divided by the value
    associated with the row's ``Year``. Only 2019, 2020 and 2021 have an
    associated value.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``"Year"`` and ``"HINCP"``.

    Returns
    -------
    float
        The scaled income, or NaN when ``Year`` has no associated value.
        (Previously such rows fell through and returned ``None`` implicitly.)
    """
    # NOTE(review): these constants look like price-index levels used for an
    # inflation adjustment -- confirm the source and add 2017/2018 if those
    # years are ever analysed.
    target_index = 344.416
    index_by_year = {
        2019: 299.433,
        2020: 303.932,
        2021: 319.761,
    }

    year_index = index_by_year.get(row["Year"])
    if year_index is None:
        # Unsupported year: make the missing value explicit.
        return np.nan

    # Multiply first, then divide, to keep results bit-identical to the
    # original expression.
    return row["HINCP"] * target_index / year_index

# Row-wise apply: each row is passed to multiply_income and the result is
# stored in a new "adjusted_income" column on raw_data (in place).
raw_data["adjusted_income"] = raw_data.apply(multiply_income, axis=1)

# Create Income Categories

In [4]:
def income_cat(x):
    """Map an income value to an ordinal income-bracket code.

    Codes
    -----
    0   missing (NaN/None, or the literal value 9999999)
    1   below 15,000
    2   15,000 - 29,999      3   30,000 - 44,999
    4   45,000 - 59,999      5   60,000 - 74,999
    6   75,000 - 99,999      7   100,000 - 124,999
    8   125,000 - 149,999    9   150,000 - 199,999
    10  200,000 - 299,999    11  300,000 - 499,999
    12  500,000 - 699,999    13  700,000 - 899,999
    14  900,000 and above

    Each bracket includes its lower bound and excludes its upper bound.

    Parameters
    ----------
    x : number
        Income value to classify.

    Returns
    -------
    int
        Bracket code between 0 and 14.
    """
    # NaN fails every comparison below, so without this guard a missing income
    # would fall through to the top bracket (14).
    # NOTE(review): 9999999 is presumably a "not available" sentinel of the raw
    # income field; if this function is fed an already-rescaled income the
    # sentinel may no longer equal 9999999 -- confirm.
    if pd.isna(x) or x == 9999999:
        return 0

    # Exclusive upper bounds of brackets 1..13, in ascending order.
    upper_bounds = (
        15000, 30000, 45000, 60000, 75000,
        100000, 125000, 150000, 200000,
        300000, 500000, 700000, 900000,
    )
    for code, bound in enumerate(upper_bounds, start=1):
        if x < bound:
            return code

    # At or above the last bound.
    return 14

# Create Main Function

In [5]:
def create_frequency_table(df):
    """Return the weighted percentage of records in each income category.

    Side effect: an ``'IncomeCat'`` column is added to (or overwritten on)
    ``df`` itself. This is intentional -- later cells read
    ``processed_data['IncomeCat']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'adjusted_income'`` (classified with ``income_cat``)
        and ``'WGTP'`` (the weight summed per category).

    Returns
    -------
    pandas.DataFrame
        One row per income category with columns ``'IncomeCat'`` and
        ``'frequency_percent'`` (share of total ``WGTP``, multiplied by 100).
    """
    # Create the income categories (written onto df in place, see docstring).
    df['IncomeCat'] = df['adjusted_income'].apply(income_cat)

    # The total weight is the same for every category, so compute it once
    # instead of once per group.
    total_weight = df['WGTP'].sum()

    # Create the weighted frequency with a vectorised grouped sum; unlike
    # groupby.apply with a lambda this also works on an empty frame.
    weighted_freq = (
        df.groupby('IncomeCat')['WGTP']
        .sum()
        .div(total_weight)
        .reset_index(name='frequency_percent')
    )

    # Multiply the weighted frequency so it reads as a percent.
    weighted_freq['frequency_percent'] = weighted_freq['frequency_percent'] * 100

    return weighted_freq

# Run Main Function

In [6]:
# Keep only rows with RELSHIPP == 20, Year == 2019, a positive ADJINC and
# TYPEHUGQ == 1.
# NOTE(review): presumably RELSHIPP 20 is the household reference person (one
# row per household) and TYPEHUGQ 1 is a housing unit -- confirm against the
# PUMS data dictionary.
# .copy() makes the subset an independent frame, so the IncomeCat column that
# create_frequency_table later writes onto it is not a write to a slice of
# raw_data.
processed_data = raw_data[
    (raw_data['RELSHIPP'] == 20)
    & (raw_data['Year'] == 2019)
    & (raw_data['ADJINC'] > 0)
    & (raw_data['TYPEHUGQ'] == 1)
].copy()

In [7]:
# Build the weighted distribution for the filtered rows, save it to Excel,
# and show it as the cell output.
result = create_frequency_table(processed_data)
result.to_excel('weighted_distribution_2019.xlsx')
result

Unnamed: 0,IncomeCat,frequency_percent
0,1,6.412612
1,2,7.981669
2,3,7.82643
3,4,9.258228
4,5,8.204004
5,6,12.035872
6,7,10.98998
7,8,8.441687
8,9,10.847896
9,10,10.98691


# Unweighted Frequency

In [8]:
# Inspect the filtered rows; the IncomeCat column is present because
# create_frequency_table added it to this frame in place.
processed_data

Unnamed: 0,SERIALNO,SPORDER,PUMA,ST,ADJINC,PWGTP,AGEP,COW,MIL,RELSHIPP,...,HHT,HINCP,Year,Age,NonHispanic,RaceCode,RaceName,Gender,adjusted_income,IncomeCat
61440,2019HU0000239,1,7305,6,1070512,23,25,1.0,4.0,20,...,1.0,65000.0,2019,25,0,0,Hispanic,M,74764.772086,5
61444,2019HU0000401,1,7311,6,1070512,16,59,1.0,4.0,20,...,1.0,301900.0,2019,59,1,6,Asian_NH,M,347253.610657,11
61448,2019HU0000404,1,7308,6,1070512,17,67,3.0,4.0,20,...,3.0,65600.0,2019,67,1,1,White_NH,F,75454.908444,6
61451,2019HU0000445,1,7310,6,1070512,12,23,5.0,1.0,20,...,4.0,39000.0,2019,23,1,1,White_NH,M,44858.863252,3
61452,2019HU0000644,1,7313,6,1070512,28,30,1.0,4.0,20,...,1.0,75000.0,2019,30,1,1,White_NH,M,86267.044715,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89864,2019HU1411848,1,7312,6,1070512,9,71,3.0,4.0,20,...,6.0,141000.0,2019,71,1,1,White_NH,F,162182.044063,9
89865,2019HU1412013,1,7320,6,1070512,9,56,1.0,4.0,20,...,2.0,130000.0,2019,56,1,1,White_NH,M,149529.544172,8
89868,2019HU1412139,1,7316,6,1070512,14,68,1.0,4.0,20,...,3.0,65100.0,2019,68,0,0,Hispanic,F,74879.794812,5
89875,2019HU1412145,1,7312,6,1070512,23,86,,4.0,20,...,6.0,12600.0,2019,86,1,1,White_NH,F,14492.863512,1


In [16]:
# Export the unweighted number of rows per income category, ordered by
# category code.
(
    processed_data['IncomeCat']
    .value_counts()
    .sort_index()
    .to_frame()
    .to_excel('frequency_counts_2019.xlsx')
)

In [18]:
# Unweighted share of rows in each income category (proportions, not percent),
# ordered by category code.
processed_data['IncomeCat'].value_counts(normalize=True).to_frame().sort_index()

Unnamed: 0,IncomeCat
1,0.060745
2,0.07465
3,0.077212
4,0.087641
5,0.079682
6,0.119019
7,0.109322
8,0.084805
9,0.112981
10,0.115817
