In [6]:
from uszipcode import SearchEngine
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np


# Load the input data. The ZIP code column must be named "ZipCode"; rename it
# after loading if the file uses a different header.
df = pd.read_csv(r"C:\Users\email\OneDrive\Documents\Python\zip_codes\cluster_data_pregrouping.csv")

se = SearchEngine()

# SimpleZipcode attributes to collect, listed in the same order as the columns
# of `zipped` below (after the leading "ZipCode" column).
zip_fields = [
    # geography:
    "lat", "lng",
    "bounds_north", "bounds_south", "bounds_east", "bounds_west",
    "radius_in_miles", "land_area_in_sqmi", "water_area_in_sqmi",
    # population:
    "population", "population_density",
    # housing:
    "housing_units", "occupied_housing_units",
    "median_home_value", "median_household_income",
]

zip_df = []         # rows with every field populated
bad_zip = []        # rows that have coordinates but at least one missing field;
                    # these are imputed from the nearest reference ZIP later on
unmatched_zip = []  # ZIP codes without usable coordinates; they cannot be imputed

for i in df["ZipCode"].dropna().unique():
    x = se.by_zipcode(i)  # SimpleZipcode, or None when the ZIP code is unknown
    # Unknown ZIPs come back as None (or, in older uszipcode releases, as an
    # object whose fields are all None). Without coordinates the nearest-neighbour
    # imputation cannot run, so keep these out of bad_zip.
    if x is None or x.lat is None or x.lng is None:
        unmatched_zip.append(i)
        continue
    row = [i] + [getattr(x, field) for field in zip_fields]
    if any(value is None for value in row):
        # add ZIPs with missing info to this list to impute them
        bad_zip.append(row)
    else:
        zip_df.append(row)

if unmatched_zip:
    print(f"{len(unmatched_zip)} ZIP code(s) had no coordinates and were left unmatched: {unmatched_zip}")

# converting data into a dataframe:
zipped = pd.DataFrame(zip_df, columns=[
    "ZipCode",      "Latitude",             "Longitude",       "BoundNorth",
    "BoundSouth",   "BoundEast",            "BoundWest",       "RadiusInMiles",
    "LandArea",     "WaterArea",            "Population",      "PopulationDensity",
    "HousingUnits", "OccupiedHousingUnits", "MedianHomeValue", "MedianHouseholdIncome"
    ])

# For bad ZIPs, copy the values of the closest reference ZIP, where "closest"
# is the nearest latitude/longitude pair in the reference file.

# The reference file is read without a header, so its header row is parsed as
# data and then dropped by position together with the first column.
train = pd.read_csv(r"C:\Users\email\OneDrive\Documents\Python\zip_codes\zip_reference.csv", header=None).iloc[1:,1:]

# Because the header row was parsed as data, the columns of `train` hold strings.
# Convert everything after the ZIP code column to numbers so imputed rows carry
# the same numeric types as rows that came straight from the lookup.
# Unparseable entries become NaN.
reference = train.iloc[:, 1:].apply(pd.to_numeric, errors="coerce")

# enter coordinates into model (first two columns of `reference`: latitude, longitude)
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(reference.iloc[:, :2].to_numpy())

# Replacement values per reference row: every column after the coordinates,
# kept as per-column Python scalars so integer columns stay integers.
reference_rows = list(reference.iloc[:, 2:].itertuples(index=False, name=None))

# Only rows that actually have coordinates can be matched; the rest are left as-is.
locatable = [row for row in bad_zip if row[1] is not None and row[2] is not None]
if locatable:
    # search for closest coordinates, all rows in one query
    query_points = np.array([row[1:3] for row in locatable], dtype=float)
    distances, indices = nbrs.kneighbors(query_points)
    for row, nearest in zip(locatable, indices[:, 0]):
        # give bad zip value of closest coordinate (rows are updated in place in bad_zip)
        row[3:] = reference_rows[nearest]

imputed_zips = pd.DataFrame(bad_zip, columns=[
    "ZipCode",      "Latitude",             "Longitude",       "BoundNorth",
    "BoundSouth",   "BoundEast",            "BoundWest",       "RadiusInMiles",
    "LandArea",     "WaterArea",            "Population",      "PopulationDensity",
    "HousingUnits", "OccupiedHousingUnits", "MedianHomeValue", "MedianHouseholdIncome"
    ])

# Stack the fully populated ZIP rows and the imputed ZIP rows into one table.
all_zip = pd.concat([zipped, imputed_zips], axis=0)

# Attach the ZIP-level columns to the original data, df. The left join keeps
# every row of df, including rows whose ZipCode has no entry in all_zip.
output = df.merge(all_zip, how='left', on='ZipCode')

# Export the final file.
output.to_csv("zip_codes\\final_result_2.csv")

In [7]:
# Preview the first rows of the merged table.
output.head()

Unnamed: 0,ProviderID,CCN,Type_Of_Facility,Medicare_Certified_Beds,Total_Staffed_Beds,ZipCode,KeepState,FINAL_Zip_First_3,FINAL_RUCA1,Hospital_Overall_rating,...,BoundWest,RadiusInMiles,LandArea,WaterArea,Population,PopulationDensity,HousingUnits,OccupiedHousingUnits,MedianHomeValue,MedianHouseholdIncome
0,706051,53300,Childrens,358.0,358.0,93720,CA,937,1,,...,-119.79279,3.0,9.96,0.05,45191,4538.0,18236,17368,266300,75200
1,706051,53300,Childrens,358.0,358.0,93230,CA,932,1,,...,-119.774504,15.0,261.81,0.95,65264,249.0,22331,21067,182500,52611
2,2537483,60076,Short Term Acute Care,36.0,25.0,80751,CO,807,4,,...,-103.518795,20.0,469.75,4.21,18857,40.0,7113,6466,120700,38946
3,705834,100075,Short Term Acute Care,1364.0,1218.0,33607,FL,336,1,4.0,...,-82.57569,5.0,9.03,0.56,23541,2607.0,11159,9552,113500,31831
4,705834,100075,Short Term Acute Care,1364.0,1218.0,33614,FL,336,1,4.0,...,-82.527625,3.0,9.78,0.39,46449,4751.0,20488,18346,114600,35098


In [13]:
# Write the merged table to the current working directory.
output.to_csv("final_result_2.csv")

# Number of rows in the merged table with ZIP code 93720. This is the cell's last
# expression so the notebook displays the count instead of discarding it.
int((output["ZipCode"] == 93720).sum())

In [16]:
# Look up a single ZIP code and print the full record to see which fields are populated.
x = se.by_zipcode('36202')
#x = [ x.lat, x.lng, x.population, x.population_density, x.land_area_in_sqmi, x.median_household_income]
print(x)

SimpleZipcode(zipcode='36202', zipcode_type='PO BOX', major_city='Anniston', post_office_city=None, common_city_list=['Anniston'], county='Calhoun County', state='AL', lat=33.66, lng=-85.81, timezone='America/Chicago', radius_in_miles=None, area_code_list='256', population=None, population_density=None, land_area_in_sqmi=None, water_area_in_sqmi=None, housing_units=None, occupied_housing_units=None, median_home_value=None, median_household_income=None, bounds_west=None, bounds_east=None, bounds_north=None, bounds_south=None)


In [18]:
# Query the fitted model with the coordinates of the ZIP record held in `x`
# and show the reference row it is matched to.
query_point = np.array([x.lat, x.lng]).reshape(1, -1)
distances, indices = nbrs.kneighbors(query_point)
train.iloc[indices[0, 0]]

1          36207
2          33.66
3         -85.81
4      33.804138
5      33.588552
6      -85.63838
7     -85.827954
8            9.0
9          73.85
10          0.62
11         19801
12         268.0
13          9372
14          8385
15        131600
16         44176
Name: 10892, dtype: object

In [23]:
# Show the merged rows for ZIP code 36202. The column is selected by name rather
# than by position so the filter keeps working if the input's column order changes.
output.loc[output["ZipCode"] == 36202]

Unnamed: 0,ProviderID,CCN,Type_Of_Facility,Medicare_Certified_Beds,Total_Staffed_Beds,ZipCode,KeepState,FINAL_Zip_First_3,FINAL_RUCA1,Hospital_Overall_rating,...,BoundWest,RadiusInMiles,LandArea,WaterArea,Population,PopulationDensity,HousingUnits,OccupiedHousingUnits,MedianHomeValue,MedianHouseholdIncome
43,2537069,10078,Short Term Acute Care,323.0,287.0,36202,AL,362,1,,...,-85.827954,9.0,73.85,0.62,19801,268.0,9372,8385,131600,44176
10180,2537094,12011,Long Term Care,,,36202,AL,362,1,,...,-85.827954,9.0,73.85,0.62,19801,268.0,9372,8385,131600,44176
11213,2537094,12011,Long Term,38.0,38.0,36202,AL,362,1,,...,-85.827954,9.0,73.85,0.62,19801,268.0,9372,8385,131600,44176


In [1]:
# Scratch check: str(None) produces the string 'None'.
str(None)

'None'