# Import Library

In [212]:
import pandas as pd

import warnings
warnings.simplefilter("ignore")

# Grab the data

In [40]:
# we will be using these columns
use_cols = ["Organization ID", "Organization Name", "Project Type", "address1", "address2", "city", "state", "zip"]

In [41]:
# read the data from HUD
df = pd.read_excel("https://www.huduser.gov/portal/sites/default/files/xls/2019-Housing-Inventory-County-RawFile.xlsx", usecols=use_cols)

In [43]:
df.head()

Unnamed: 0,Organization ID,Organization Name,Project Type,address1,address2,city,state,zip
0,495,Battered Women's Shelter,ES,,,,,
1,495,Battered Women's Shelter,TH,,,,,
2,495,Battered Women's Shelter,RRH,,,,,
3,23705,Crisis Center of Russell County,ES,,,,,
4,1943,Daybreak,ES,,,,,


In [17]:
# all unique values for project type 
df["Project Type"].unique()

array(['ES', 'TH', 'RRH', 'PSH', 'OPH', 'SH'], dtype=object)

# Feature engineering

In [119]:
# one hot encode the "Project Type column"
df2 = pd.concat([df, pd.get_dummies(df["Project Type"])], axis = 1).drop("Project Type", axis = 1)

In [120]:
df2.head()

Unnamed: 0,Organization ID,Organization Name,address1,address2,city,state,zip,ES,OPH,PSH,RRH,SH,TH
0,495,Battered Women's Shelter,,,,,,1,0,0,0,0,0
1,495,Battered Women's Shelter,,,,,,0,0,0,0,0,1
2,495,Battered Women's Shelter,,,,,,0,0,0,1,0,0
3,23705,Crisis Center of Russell County,,,,,,1,0,0,0,0,0
4,1943,Daybreak,,,,,,1,0,0,0,0,0


In [121]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25142 entries, 0 to 25141
Data columns (total 13 columns):
Organization ID      25142 non-null int64
Organization Name    25142 non-null object
address1             19920 non-null object
address2             307 non-null object
city                 20187 non-null object
state                21179 non-null object
zip                  23388 non-null float64
ES                   25142 non-null uint8
OPH                  25142 non-null uint8
PSH                  25142 non-null uint8
RRH                  25142 non-null uint8
SH                   25142 non-null uint8
TH                   25142 non-null uint8
dtypes: float64(1), int64(1), object(5), uint8(6)
memory usage: 1.5+ MB


In [122]:
# an example of an organization with multiple addresses
df2[df2["Organization ID"] == 7]

Unnamed: 0,Organization ID,Organization Name,address1,address2,city,state,zip,ES,OPH,PSH,RRH,SH,TH
159,7,ABCCM,,,,,28801.0,0,0,0,1,0,0
15159,7,ABCCM,30 CUMBERLAND AVE,,Asheville,NC,28801.0,1,0,0,0,0,0
15160,7,ABCCM,30 CUMBERLAND AVE,,ASHEVILLE,NC,28801.0,0,0,0,0,0,1
15161,7,ABCCM,1329 TUNNEL RD,,ASHEVILLE,NC,28805.0,1,0,0,0,0,0
15162,7,ABCCM,1329 TUNNEL RD,,ASHEVILLE,NC,28805.0,0,1,0,0,0,0
15163,7,ABCCM,1329 TUNNEL RD,,ASHEVILLE,NC,28805.0,0,0,0,0,0,1
15164,7,ABCCM,1329 TUNNEL RD,,ASHEVILLE,NC,28805.0,0,0,0,0,0,1
15545,7,ABCCM,1329 TUNNEL RD,,Asheville,ND,28805.0,1,0,0,0,0,0


# Data Cleaning

### Combine and Impute the address and city

In [134]:
# manipulate the string values
df2["city"] = df2["city"].str.title()
df2["address1"] = df2["address1"].str.upper()
df2["address2"] = df2["address2"].str.upper()
df2["zip"] = df2["zip"].fillna("missinggg").apply(lambda x: str(x)[:-2])

In [180]:
# changing nan values into an empty string
def change_nan(text):
    if str(text) == "nan":
        return ""
    else:
        return str(text)

In [187]:
# changing nan values in address2 column
df2["address2"] = df2["address2"].apply(lambda x: change_nan(x))

In [188]:
# combine the address
df2["address"] = df2["address1"] + df2["address2"]

In [191]:
df2 = df2.drop(["address1", "address2"], axis = 1)

In [194]:
df2.head()

Unnamed: 0,Organization ID,Organization Name,city,state,zip,ES,OPH,PSH,RRH,SH,TH,address
19685,1,12 & 12,Tulsa,OK,74119,0,0,0,0,0,1,1214 S. BALTIMORE AVE.
19686,1,12 & 12,Tulsa,OK,74135,0,0,0,0,0,1,6333 E SKELLY DR
19687,1,12 & 12,Tulsa,OK,74135,0,0,0,0,0,1,6333 E SKELLY DR
5667,3,24 Hour Oakland Parent / Teacher Children's Ce...,Oakland,CA,94601,1,0,0,0,0,0,4700 INTERNATIONAL BLVD
12704,6,Abby's House,Worcester,MA,1609,1,0,0,0,0,0,23 CROWN ST


In [210]:
# impute address
df2 = df2.sort_values(["Organization ID", "Organization Name", "zip", "address"]).reset_index(drop = True)
for i in range(1, df2.shape[0]):
    
    # if we found missing value on column address1
    if df2["address"][i] != df2["address"][i]:
        
        if df2["Organization ID"][i] == df2["Organization ID"][i-1] and df2["zip"][i] == df2["zip"][i-1]:
            
            df2["address"][i] = df2["address"][i-1]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [209]:
# impute city
df2 = df2.sort_values(["Organization ID", "Organization Name", "zip", "city"]).reset_index(drop = True)
for i in range(1, df2.shape[0]):
    
    # if we found missing value on column city
    if df2["city"][i] != df2["city"][i]:
        
        if df2["Organization ID"][i] == df2["Organization ID"][i-1] and df2["zip"][i] == df2["zip"][i-1]:
            
            df2["city"][i] = df2["city"][i-1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [211]:
df2[df2["Organization ID"] == 7]

Unnamed: 0,Organization ID,Organization Name,city,state,zip,ES,OPH,PSH,RRH,SH,TH,address
5,7,ABCCM,Asheville,NC,28801,1,0,0,0,0,0,30 CUMBERLAND AVE
6,7,ABCCM,Asheville,NC,28801,0,0,0,0,0,1,30 CUMBERLAND AVE
7,7,ABCCM,Asheville,,28801,0,0,0,1,0,0,30 CUMBERLAND AVE
8,7,ABCCM,Asheville,NC,28805,1,0,0,0,0,0,1329 TUNNEL RD
9,7,ABCCM,Asheville,NC,28805,0,1,0,0,0,0,1329 TUNNEL RD
10,7,ABCCM,Asheville,NC,28805,0,0,0,0,0,1,1329 TUNNEL RD
11,7,ABCCM,Asheville,NC,28805,0,0,0,0,0,1,1329 TUNNEL RD
12,7,ABCCM,Asheville,ND,28805,1,0,0,0,0,0,1329 TUNNEL RD


### Drop duplicates

In [215]:
df3 = df2

In [218]:
col = ["Organization ID", "Organization Name", "address", "city", "state", "zip", "ES", "OPH", "PSH", "RRH", "SH", "TH"]
df3 = df3.drop_duplicates(col)

In [219]:
df3[df3["Organization ID"] == 7]

Unnamed: 0,Organization ID,Organization Name,city,state,zip,ES,OPH,PSH,RRH,SH,TH,address
5,7,ABCCM,Asheville,NC,28801,1,0,0,0,0,0,30 CUMBERLAND AVE
6,7,ABCCM,Asheville,NC,28801,0,0,0,0,0,1,30 CUMBERLAND AVE
7,7,ABCCM,Asheville,,28801,0,0,0,1,0,0,30 CUMBERLAND AVE
8,7,ABCCM,Asheville,NC,28805,1,0,0,0,0,0,1329 TUNNEL RD
9,7,ABCCM,Asheville,NC,28805,0,1,0,0,0,0,1329 TUNNEL RD
10,7,ABCCM,Asheville,NC,28805,0,0,0,0,0,1,1329 TUNNEL RD
12,7,ABCCM,Asheville,ND,28805,1,0,0,0,0,0,1329 TUNNEL RD


In [220]:
col_nostate = ["Organization ID", "Organization Name", "address", "city", "zip", "ES", "OPH", "PSH", "RRH", "SH", "TH"]
df3 = df3.drop_duplicates(col_nostate)

In [225]:
df3[df3["Organization ID"] == 7]

Unnamed: 0,Organization ID,Organization Name,city,state,zip,ES,OPH,PSH,RRH,SH,TH,address
5,7,ABCCM,Asheville,NC,28801,1,0,0,0,0,0,30 CUMBERLAND AVE
6,7,ABCCM,Asheville,NC,28801,0,0,0,0,0,1,30 CUMBERLAND AVE
7,7,ABCCM,Asheville,,28801,0,0,0,1,0,0,30 CUMBERLAND AVE
8,7,ABCCM,Asheville,NC,28805,1,0,0,0,0,0,1329 TUNNEL RD
9,7,ABCCM,Asheville,NC,28805,0,1,0,0,0,0,1329 TUNNEL RD
10,7,ABCCM,Asheville,NC,28805,0,0,0,0,0,1,1329 TUNNEL RD


In [222]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20394 entries, 0 to 25141
Data columns (total 12 columns):
Organization ID      20394 non-null int64
Organization Name    20394 non-null object
city                 17097 non-null object
state                17229 non-null object
zip                  20394 non-null object
ES                   20394 non-null uint8
OPH                  20394 non-null uint8
PSH                  20394 non-null uint8
RRH                  20394 non-null uint8
SH                   20394 non-null uint8
TH                   20394 non-null uint8
address              16861 non-null object
dtypes: int64(1), object(5), uint8(6)
memory usage: 1.2+ MB


In [223]:
'''
Emergency Shelter (ES)
Transitional Housing (TH)
Safe Haven (SH)
Permanent Supportive Housing (PSH)
Rapid Re-housing (RRH)
Other Permanent Housing (OPH)
'''

'\nEmergency Shelter (ES)\nTransitional Housing (TH)\nSafe Haven (SH)\nPermanent Supportive Housing (PSH)\nRapid Re-housing (RRH)\nOther Permanent Housing (OPH)\n'

### Impute the rest of the columns (city, state, address)

In [228]:
df3 = df3.drop(["city", "state"], axis = 1)

In [249]:
df3.columns = ['Organization ID', 'Organization Name', 'Zip Code', 'ES', 'OPH', 'PSH',
       'RRH', 'SH', 'TH', 'address']

In [263]:
df3.set_index("Zip Code")

Unnamed: 0_level_0,Organization ID,Organization Name,ES,OPH,PSH,RRH,SH,TH,address
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
74119,1,12 & 12,0,0,0,0,0,1,1214 S. BALTIMORE AVE.
74135,1,12 & 12,0,0,0,0,0,1,6333 E SKELLY DR
94601,3,24 Hour Oakland Parent / Teacher Children's Ce...,1,0,0,0,0,0,4700 INTERNATIONAL BLVD
1609,6,Abby's House,1,0,0,0,0,0,23 CROWN ST
28801,7,ABCCM,1,0,0,0,0,0,30 CUMBERLAND AVE
28801,7,ABCCM,0,0,0,0,0,1,30 CUMBERLAND AVE
28801,7,ABCCM,0,0,0,1,0,0,30 CUMBERLAND AVE
28805,7,ABCCM,1,0,0,0,0,0,1329 TUNNEL RD
28805,7,ABCCM,0,1,0,0,0,0,1329 TUNNEL RD
28805,7,ABCCM,0,0,0,0,0,1,1329 TUNNEL RD


In [264]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20394 entries, 0 to 25141
Data columns (total 10 columns):
Organization ID      20394 non-null int64
Organization Name    20394 non-null object
Zip Code             20394 non-null object
ES                   20394 non-null uint8
OPH                  20394 non-null uint8
PSH                  20394 non-null uint8
RRH                  20394 non-null uint8
SH                   20394 non-null uint8
TH                   20394 non-null uint8
address              16861 non-null object
dtypes: int64(1), object(3), uint8(6)
memory usage: 916.1+ KB


In [256]:
df_zip = pd.read_csv("USZipCodes202003.csv")

In [257]:
df_zip["Zip Code"] = df_zip["Zip Code"].apply(lambda x: str(x))

In [262]:
df4 = df3.join(df_zip, how = "left", on = "Zip Code")[["Organization ID", "Organization Name", "City", "State", "Zip Code", "ES",
                                                                          "OPH", "PSH", "RRH", "SH", "TH"]]

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [243]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34595 entries, 0 to 34594
Data columns (total 11 columns):
Organization ID      34595 non-null int64
Organization Name    34595 non-null object
City                 33049 non-null object
State                33049 non-null object
Zip Code             33049 non-null object
ES                   34595 non-null uint8
OPH                  34595 non-null uint8
PSH                  34595 non-null uint8
RRH                  34595 non-null uint8
SH                   34595 non-null uint8
TH                   34595 non-null uint8
dtypes: int64(1), object(4), uint8(6)
memory usage: 1.8+ MB
