In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from zip_code import fetch_census_data as fetch

In [2]:
# Load the city/state -> zip-code table from the CSV file and display it.
# Each Zip_Code cell is read as text that looks like a Python list of quoted zip codes.
df = pd.read_csv('city-state_zip-codes.csv')
df

Unnamed: 0,City-State,Zip_Code
0,Clarksburg-MD,['20871']
1,Boyds-MD,['20841']
2,Rockville-MD,"['20847', '20848', '20849', '20850', '20851', ..."
3,Germantown-MD,"['20874', '20876']"
4,Gaithersburg-MD,"['20877', '20878', '20879', '20882']"
5,Frederick-MD,"['21701', '21702', '21703', '21704', '21705', ..."
6,Potomac-MD,"['20817', '20827', '20854', '20859']"
7,NorthPotomac-MD,"['20850', '20878', '20997']"
8,Ashburn-VA,"['20146', '20147', '20148', '20149']"
9,Beaumont-VA,"['23014', '233139']"


In [3]:
# Print the frame summary (row count, column names, dtypes, non-null counts).
# DataFrame.info() prints its report and returns None, so it is called on its
# own line instead of being packed into a tuple that would display a stray None.
df.info()
# Show the column labels as the cell's displayed value.
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   City-State  11 non-null     object
 1   Zip_Code    11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


(Index(['City-State', 'Zip_Code'], dtype='object'), None)

In [4]:
# Helper that peels one wrapping character off each end of a string
def trim_string(word):
    """Return ``word`` without its first and last characters.

    Strings of two characters or fewer yield an empty string.
    """
    has_interior = len(word) > 2
    return word[1:-1] if has_interior else ''

In [5]:
# Remove the surrounding square brackets from each zip-code string.
# Only '[' and ']' are stripped (rather than dropping the first and last
# character unconditionally), which keeps this cell idempotent: re-running it
# leaves already-cleaned values untouched instead of eating into the quotes
# and digits.
df['Zip_Code'] = df['Zip_Code'].str.strip('[]')
df

Unnamed: 0,City-State,Zip_Code
0,Clarksburg-MD,'20871'
1,Boyds-MD,'20841'
2,Rockville-MD,"'20847', '20848', '20849', '20850', '20851', '..."
3,Germantown-MD,"'20874', '20876'"
4,Gaithersburg-MD,"'20877', '20878', '20879', '20882'"
5,Frederick-MD,"'21701', '21702', '21703', '21704', '21705', '..."
6,Potomac-MD,"'20817', '20827', '20854', '20859'"
7,NorthPotomac-MD,"'20850', '20878', '20997'"
8,Ashburn-VA,"'20146', '20147', '20148', '20149'"
9,Beaumont-VA,"'23014', '233139'"


In [6]:
# If there are multiple zip codes, split each into a different row.
# After the split every element is still wrapped in quotes (e.g. "'20871'"),
# so quote characters and stray spaces are stripped explicitly. Unlike dropping
# the first and last character unconditionally, this cannot chop a digit off an
# element that is missing one of its quotes.
explode_df = (
    df.assign(Zip_Code=df['Zip_Code'].str.split(", "))
      .explode('Zip_Code')
      .reset_index(drop=True)
      .assign(Zip_Code=lambda frame: frame['Zip_Code'].str.strip("' "))
)
explode_df

Unnamed: 0,City-State,Zip_Code
0,Clarksburg-MD,20871
1,Boyds-MD,20841
2,Rockville-MD,20847
3,Rockville-MD,20848
4,Rockville-MD,20849
5,Rockville-MD,20850
6,Rockville-MD,20851
7,Rockville-MD,20852
8,Rockville-MD,20853
9,Rockville-MD,2085


In [7]:
# Work on an independent copy so columns added to final_df do not touch explode_df.
final_df = explode_df.copy()

In [8]:
# Summarize the exploded frame: row count, dtypes and non-null counts.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   City-State  40 non-null     object
 1   Zip_Code    40 non-null     object
dtypes: object(2)
memory usage: 768.0+ bytes


In [9]:
# Census variables of interest for this analysis. For more variables, see census_variables.csv,
# which was created by web scraping the US Census webpage using "census_cariable_scrap.py".
variables_dict = {
    'DP05_0044E': 'One race | Asian',
    'DP05_0045E': 'One race | Asian | Asian Indian',
    'DP05_0046E': 'One race | Asian | Chinease',
}
# Comma-separated variable codes, in dictionary insertion order
variables = ','.join(variables_dict)
# Survey year, kept as a string
year = '2022'

In [10]:
# Retrieve the first zip code in the column
final_df['Zip_Code'].iloc[0]

'20871'

In [11]:
# Check that the fetch function works on a single zip code.
# Element [1] of the result is displayed; presumably element [0] is a header row — TODO confirm against zip_code.fetch_census_data.
fetch(year, variables, final_df['Zip_Code'].iloc[0])[1]

['10267', '4626', '1810', '20871']

In [12]:
# Run the function on all zip codes and store the result in the 'combined' column.
# If US census doesn't have a value for a zip code, store None.
def _fetch_combined(zip_code):
    """Return element [1] of the census response for ``zip_code``, or None.

    ``fetch`` is called once and its result is reused, so each zip code costs
    a single lookup instead of one call for the None check plus a second call
    for the value.
    """
    response = fetch(year, variables, zip_code)
    if response is None:
        return None
    return response[1]

final_df['combined'] = final_df['Zip_Code'].apply(_fetch_combined)

Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204
Error fetching data: 204


In [13]:
# Inspect the lookup result; zip codes with no census response have an empty 'combined' value.
final_df

Unnamed: 0,City-State,Zip_Code,combined
0,Clarksburg-MD,20871,"[10267, 4626, 1810, 20871]"
1,Boyds-MD,20841,"[4267, 1455, 1837, 20841]"
2,Rockville-MD,20847,
3,Rockville-MD,20848,
4,Rockville-MD,20849,
5,Rockville-MD,20850,"[13467, 2687, 6657, 20850]"
6,Rockville-MD,20851,"[2357, 217, 725, 20851]"
7,Rockville-MD,20852,"[9654, 2070, 2442, 20852]"
8,Rockville-MD,20853,"[3907, 348, 779, 20853]"
9,Rockville-MD,2085,


In [14]:
# Re-check the frame after adding 'combined'; its non-null count shows how many zip codes returned data.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   City-State  40 non-null     object
 1   Zip_Code    40 non-null     object
 2   combined    26 non-null     object
dtypes: object(3)
memory usage: 1.1+ KB


In [15]:
# Copy final_df so that dropping rows and adding columns on split_df leaves final_df unchanged.
split_df = final_df.copy()

In [16]:
# Drop every row that contains a missing value, then summarize what is left.
split_df = split_df.dropna()
split_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26 entries, 0 to 38
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   City-State  26 non-null     object
 1   Zip_Code    26 non-null     object
 2   combined    26 non-null     object
dtypes: object(3)
memory usage: 832.0+ bytes


In [17]:
# Unpack the positional entries of 'combined' into named integer columns
combined_positions = {'Asian_Total': 0, 'Asian_Indian': 1, 'Asian_Chinease': 2}
for column_name, position in combined_positions.items():
    # Bind the position as a default argument so each pass reads its own entry
    split_df[column_name] = (
        split_df['combined']
        .apply(lambda row, i=position: row[i])
        .astype(int)
    )

In [18]:
# 'combined' is no longer needed, so leave it out of the cleaned frame
df_clean = split_df.drop('combined', axis='columns')
df_clean

Unnamed: 0,City-State,Zip_Code,Asian_Total,Asian_Indian,Asian_Chinease
0,Clarksburg-MD,20871,10267,4626,1810
1,Boyds-MD,20841,4267,1455,1837
5,Rockville-MD,20850,13467,2687,6657
6,Rockville-MD,20851,2357,217,725
7,Rockville-MD,20852,9654,2070,2442
8,Rockville-MD,20853,3907,348,779
10,Germantown-MD,20874,10859,3503,2723
11,Germantown-MD,20876,5279,1580,1443
12,Gaithersburg-MD,20877,6188,1108,1414
13,Gaithersburg-MD,20878,17775,6433,6687


In [19]:
# Confirm the three count columns were stored as integers and contain no nulls.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26 entries, 0 to 38
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   City-State      26 non-null     object
 1   Zip_Code        26 non-null     object
 2   Asian_Total     26 non-null     int64 
 3   Asian_Indian    26 non-null     int64 
 4   Asian_Chinease  26 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 1.2+ KB


In [20]:
# The pivot table is built per city-state, where individual zip codes carry no
# relevant information once rows are summed, so leave that column out first.
df_no_zip = df_clean.drop('Zip_Code', axis='columns')

In [21]:
# List the remaining column labels to confirm Zip_Code is gone.
df_no_zip.columns

Index(['City-State', 'Asian_Total', 'Asian_Indian', 'Asian_Chinease'], dtype='object')

In [22]:
# Sum the three population columns per city-state. Grouping on the index yields
# one row per city-state directly, so no separate transpose step is needed.
pivot = pd.pivot_table(
    df_no_zip,
    index='City-State',
    values=['Asian_Total', 'Asian_Indian', 'Asian_Chinease'],
    aggfunc='sum',
)
pivot

Unnamed: 0_level_0,Asian_Chinease,Asian_Indian,Asian_Total
City-State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ashburn-VA,2566,24600,38831
Boyds-MD,1837,1455,4267
Clarksburg-MD,1810,4626,10267
Frederick-MD,1918,2502,9538
Gaithersburg-MD,8728,8535,28731
Germantown-MD,4166,5083,16138
NorthPotomac-MD,13344,9120,31242
Potomac-MD,7125,4105,16504
Rockville-MD,10603,5322,29385
Sterling-VA,1316,4843,14919
