In [1]:
#Import modules
import pandas as pd

In [2]:
#Read html to get zip code and population 
tables = pd.read_html("https://www.newjersey-demographics.com/zip_codes_by_population")

In [3]:
#Create data frame from the html content
zip_population=pd.DataFrame(tables[0])

In [4]:
#Check header info
zip_population.head(2)

Unnamed: 0,Rank,Zip Code,Population
0,1,8701,100763
1,2,7055,70199


In [5]:
#Check tail end info
zip_population.tail(5)

Unnamed: 0,Rank,Zip Code,Population
587,588,08011,20
588,589 TIE,08074 and 08890,15
589,590,07926,11
590,591,07881,6
591,United States Census Bureau. B01001 SEX BY AGE...,United States Census Bureau. B01001 SEX BY AGE...,United States Census Bureau. B01001 SEX BY AGE...


In [6]:
#lets clean data
#Remove last row
zip_population=zip_population[:-1]

In [7]:
#Check results
zip_population.tail(2)

Unnamed: 0,Rank,Zip Code,Population
589,590,7926,11
590,591,7881,6


In [8]:
#Lets handle tie situation
tied_zip= zip_population["Rank"].str.endswith("TIE")

In [9]:
#Find how many zip codes are tied
zip_population[tied_zip].count()

Rank          7
Zip Code      7
Population    7
dtype: int64

In [10]:
zip_population[tied_zip]

Unnamed: 0,Rank,Zip Code,Population
236,237 TIE,07078 and 08009,13097
480,481 TIE,07756 and 07088,3133
542,543 TIE,07723 and 08346,695
570,571 TIE,08212 and 08348,188
573,574 TIE,08858 and 08321,147
583,584 TIE,07820 and 08095,36
588,589 TIE,08074 and 08890,15


In [11]:
#Lets split good data and data that need cleanup
zip_population_good =zip_population[~tied_zip]
zip_population_tbd =zip_population[tied_zip]

In [12]:
#Checd good data
zip_population_good.head()

Unnamed: 0,Rank,Zip Code,Population
0,1,8701,100763
1,2,7055,70199
2,3,7087,68484
3,4,7002,65300
4,5,7305,64535


In [13]:
#Check data that need cleanup
zip_population_tbd.head()

Unnamed: 0,Rank,Zip Code,Population
236,237 TIE,07078 and 08009,13097
480,481 TIE,07756 and 07088,3133
542,543 TIE,07723 and 08346,695
570,571 TIE,08212 and 08348,188
573,574 TIE,08858 and 08321,147


In [14]:
#Lets work on tbd data
rank=[]
zipcode=[]
population=[]

#Loop through dataframe
for index,row in zip_population_tbd.iterrows():
    iRank=str(row["Rank"]).split(" ")[0]
    iZip=str(row["Zip Code"]).split("and")
    iPop=row["Population"]
    #Because its tie means we need to create 2 records from 1
    for i in range(len(iZip)):
        rank.append(iRank)
        population.append(iPop)
        zipcode.append(iZip[i])
   

In [15]:
#Check if got array of 7 dups to 14 row
print(len(rank))
print(len(population))
print(len(zipcode))

14
14
14


In [16]:
#Create cleaned data df
zip_population_clean=pd.DataFrame({"Rank":rank,"Zip Code":zipcode,"Population":population})

In [17]:
#Check results
zip_population_clean.head()

Unnamed: 0,Rank,Zip Code,Population
0,237,7078,13097
1,237,8009,13097
2,481,7756,3133
3,481,7088,3133
4,543,7723,695


In [18]:
#Merge good and clean data frames
zip_population=pd.concat([zip_population_good,zip_population_clean])

In [19]:
#Check results
zip_population.head()

Unnamed: 0,Rank,Zip Code,Population
0,1,8701,100763
1,2,7055,70199
2,3,7087,68484
3,4,7002,65300
4,5,7305,64535


In [20]:
#Export data to csv
filepath = os.path.join("..","Final Output Data","ZipPopulation_Data.csv")
zip_population.to_csv(filepath, index = False)