In [105]:
import pandas as pd
import folium
import requests
import geopy
import geopandas as gpd
from geopy.extra.rate_limiter import RateLimiter
from functools import partial
from geopy.geocoders import Nominatim
import json
from shapely.geometry import Point
import re 

from flatten_json import flatten
from folium.plugins import FastMarkerCluster

In [117]:
#clean zip_codes/population data
def clean_zips(state):


    zip_codes = pd.read_csv(f'./data/unformatted_zips/{state}_zip_codes.csv')
    zip_codes.drop([0], axis='index', inplace = True)
    zip_codes.drop(['GEO_ID'], axis='columns', inplace = True)
    zip_codes = zip_codes.rename(columns = {'NAME' : 'zip_code', 'P001001' : 'population'})
    zip_codes['zip_code'] = zip_codes['zip_code'].str[6:11]
    zip_codes.to_csv(f'./data/formatted_zips/{state}_zip_codes.csv')

    zip_code_list = list(zip_codes['zip_code'].astype(str))



In [118]:
#pulling npi data
def npi_pull(state):
    zip_codes = pd.read_csv(f'./data/formatted_zips/{state}_zip_codes.csv')


    zip_code_list = list(zip_codes['zip_code'].astype(str))


    endpoint = 'https://npiregistry.cms.hhs.gov/api/'

    df_small = pd.DataFrame()
    df_large = pd.DataFrame()
    run_count = 1


    for current_zip in zip_code_list:
        percent_complete = 100*run_count/len(zip_code_list)
        print(str(percent_complete) + "percent complete")
        counter = 0
        output = {'result_count' : 0}

        #search for dental taxonomy

        while output['result_count'] > 0 or counter ==0 and counter<= 1000:
            print("dental " + str(counter))
            params = {'version' : '2.1',
                     'taxonomy_description' : 'dental',
                     'state' : f'{state}',
                     'limit' : '200',
                     'address_purpose' : 'LOCATION',
                     'skip' : f'{counter}',
                     'postal_code' : f'{current_zip}'}

            res = requests.get(endpoint, params)

            output = res.json()
            df_small  = df_small.append(pd.DataFrame(res.json()))
            counter = counter+200
            if counter >1000:
                print('>1000')

        #search for dentist taxonomy   

        counter = 0
        output = {'result_count' : 0}


        while output['result_count'] > 0 or counter ==0 and counter<= 1000:
            print('dentist  ' + str(counter))
            params = {'version' : '2.0',
                     'taxonomy_description' : 'dentist',
                     'state' : f'{state}',
                     'limit' : '200',
                     'address_purpose' : 'LOCATION',
                     'skip' : f'{counter}',
                     'postal_code' : f'{current_zip}'}

            res = requests.get(endpoint, params)

            output = res.json()
            df_small  = df_small.append(pd.DataFrame(res.json()))
            counter = counter+200
            if counter >1000:
                print('>1000')
        run_count = run_count+1

    dicta = df_small['results']
    dict_flattened = (flatten(record, '.') for record in dicta)
    df_large = pd.DataFrame(dict_flattened)
    df_large.to_csv(f'./data/State CSVs/{state}_df_large.csv')

    df_small = pd.json_normalize(df_small['results'])
    df_small.to_csv(f'./data/State CSVs/{state}_df_small.csv')



In [108]:
#clean addresses column

def address_extract(state):
    df_small = pd.read_csv(f'./data/State CSVs/{state}_df_small.csv')

    try:
        address_errors = pd.read_csv('./data/addresses_wo_dups/errors.csv').reset_index(drop=True)
    except:
        address_errors = pd.DataFrame()
        address_errors['error'] =""

    df_addresses_wo_dups = pd.DataFrame()
    df_addresses_wo_dups['address_1'] =""
    df_addresses_wo_dups['city'] =""
    df_addresses_wo_dups['state'] =""
    df_addresses_wo_dups['postal_code'] =""
    df_addresses_wo_dups.reset_index(drop = True)
    df_small.reset_index(drop = True)



    for index, elem in df_small.iterrows():
        print(index)

        try:
            dict = eval(df_small.loc[index, 'addresses'])[0]

            df_addresses_wo_dups = df_addresses_wo_dups.append({'address_1': str(dict['address_1']),
                                                                'city' : str(dict['city']),
                                                                'state' : str(dict['state']),
                                                                'postal_code' : str(dict['postal_code'])[:5]},
                                       ignore_index = True)


            if df_small.isna().loc[index, 'practiceLocations'] == False:

                for elem in eval(df_small.loc[index, 'practiceLocations']):

                    dict = elem
                    df_addresses_wo_dups = df_addresses_wo_dups.append({'address_1': str(dict['address_1']),
                                                                        'city' : str(dict['city']),
                                                                        'state' : str(dict['state']),
                                                                        'postal_code' : str(dict['postal_code'])[:5]},
                                               ignore_index = True)
        except:
            address_errors = address_errors.append({'error' : str(df_small.loc[index, 'number'])},
                                                  ignore_index = True)


    df_addresses_wo_dups = df_addresses_wo_dups.drop_duplicates(['address_1', 'city'], keep='first').reset_index(drop=True)
    df_addresses_wo_dups.reset_index(drop=True)
    df_addresses_wo_dups.to_csv(f'./data/addresses_wo_dups/{state}_addresses_wo_dups.csv')
    address_errors.to_csv('./data/addresses_wo_dups/errors.csv')


In [109]:
#single upload Census Retrieval

def census_retrieval(state):

    df_addresses_wo_dups = pd.read_csv(f'./data/addresses_wo_dups{state}_addresses_wo_dups.csv').reset_index(drop=True)

    endpoint2 = 'https://geocoding.geo.census.gov/geocoder/geographies/address'
    df_addresses_wo_dups['output'] = " "
    for index, item in df_addresses_wo_dups.iterrows():
        print(index)
        params2 = {'benchmark' : 'Public_AR_Current',
                   'vintage' : 'Census2010_Current',
                   'street' : str(df_addresses_wo_dups.loc[index, 'address_1']),
                   'city' : str(df_addresses_wo_dups.loc[index, 'city']),
                     'state' : str(df_addresses_wo_dups.loc[index, 'state']),
                       'format': 'json',
                  'layers' : 'all'}

        df_addresses_wo_dups.loc[index, 'output'] = str(requests.get(endpoint2, params2).json())

    df_addresses_wo_dups.to_csv(f'./data/addresses_wo_dups_census/{state}_addresses_wo_dups_census.csv')

In [119]:
#workflow

states = ['AR']

for state_abv in states:
    clean_zips(state_abv)
    npi_pull(state_abv)
    address_extract(state_abv)
    census_retrieval(state_abv)

0.16778523489932887percent complete
dental 0
dentist  0
0.33557046979865773percent complete
dental 0
dentist  0
0.5033557046979866percent complete
dental 0
dentist  0
0.6711409395973155percent complete
dental 0
dentist  0
0.8389261744966443percent complete
dental 0
dentist  0
dentist  200
1.0067114093959733percent complete
dental 0
dental 200
dentist  0
dentist  200
1.174496644295302percent complete
dental 0
dental 200
dentist  0
dentist  200
1.342281879194631percent complete
dental 0
dentist  0
1.5100671140939597percent complete
dental 0
dentist  0
1.6778523489932886percent complete
dental 0
dentist  0
dentist  200
1.8456375838926173percent complete
dental 0
dental 200
dentist  0
dentist  200
2.0134228187919465percent complete
dental 0
dentist  0
dentist  200
2.1812080536912752percent complete
dental 0
dentist  0
dentist  200
2.348993288590604percent complete
dental 0
dentist  0
2.5167785234899327percent complete
dental 0
dentist  0
dentist  200
2.684563758389262percent complete
denta

dentist  0
23.322147651006713percent complete
dental 0
dentist  0
23.48993288590604percent complete
dental 0
dentist  0
dentist  200
23.65771812080537percent complete
dental 0
dentist  0
23.825503355704697percent complete
dental 0
dental 200
dentist  0
dentist  200
23.993288590604028percent complete
dental 0
dentist  0
24.161073825503355percent complete
dental 0
dentist  0
dentist  200
24.328859060402685percent complete
dental 0
dentist  0
24.496644295302012percent complete
dental 0
dentist  0
24.664429530201343percent complete
dental 0
dental 200
dentist  0
dentist  200
24.83221476510067percent complete
dental 0
dentist  0
25.0percent complete
dental 0
dentist  0
25.16778523489933percent complete
dental 0
dentist  0
dentist  200
25.335570469798657percent complete
dental 0
dentist  0
25.503355704697988percent complete
dental 0
dental 200
dentist  0
dentist  200
25.671140939597315percent complete
dental 0
dental 200
dentist  0
dentist  200
25.838926174496645percent complete
dental 0
den

dental 200
dentist  0
dentist  200
45.80536912751678percent complete
dental 0
dental 200
dentist  0
dentist  200
45.97315436241611percent complete
dental 0
dentist  0
46.14093959731544percent complete
dental 0
dental 200
dentist  0
dentist  200
46.308724832214764percent complete
dental 0
dentist  0
46.47651006711409percent complete
dental 0
dentist  0
46.644295302013425percent complete
dental 0
dentist  0
46.81208053691275percent complete
dental 0
dental 200
dentist  0
46.97986577181208percent complete
dental 0
dentist  0
47.147651006711406percent complete
dental 0
dentist  0
47.31543624161074percent complete
dental 0
dentist  0
47.48322147651007percent complete
dental 0
dentist  0
47.651006711409394percent complete
dental 0
dentist  0
47.81879194630873percent complete
dental 0
dentist  0
47.986577181208055percent complete
dental 0
dentist  0
dentist  200
48.15436241610738percent complete
dental 0
dentist  0
48.32214765100671percent complete
dental 0
dentist  0
48.48993288590604percent

dentist  0
dentist  200
69.12751677852349percent complete
dental 0
dentist  0
dentist  200
69.29530201342281percent complete
dental 0
dentist  0
69.46308724832215percent complete
dental 0
dentist  0
69.63087248322148percent complete
dental 0
dentist  0
69.79865771812081percent complete
dental 0
dentist  0
69.96644295302013percent complete
dental 0
dentist  0
dentist  200
70.13422818791946percent complete
dental 0
dentist  0
70.30201342281879percent complete
dental 0
dental 200
dentist  0
dentist  200
70.46979865771812percent complete
dental 0
dentist  0
dentist  200
70.63758389261746percent complete
dental 0
dentist  0
70.80536912751678percent complete
dental 0
dentist  0
70.97315436241611percent complete
dental 0
dentist  0
71.14093959731544percent complete
dental 0
dentist  0
71.30872483221476percent complete
dental 0
dentist  0
71.47651006711409percent complete
dental 0
dentist  0
71.64429530201342percent complete
dental 0
dentist  0
71.81208053691275percent complete
dental 0
dentis

dentist  0
92.11409395973155percent complete
dental 0
dentist  0
92.28187919463087percent complete
dental 0
dentist  0
92.4496644295302percent complete
dental 0
dentist  0
92.61744966442953percent complete
dental 0
dentist  0
92.78523489932886percent complete
dental 0
dentist  0
92.95302013422818percent complete
dental 0
dentist  0
93.12080536912751percent complete
dental 0
dentist  0
dentist  200
93.28859060402685percent complete
dental 0
dentist  0
93.45637583892618percent complete
dental 0
dentist  0
93.6241610738255percent complete
dental 0
dentist  0
dentist  200
93.79194630872483percent complete
dental 0
dentist  0
93.95973154362416percent complete
dental 0
dentist  0
94.12751677852349percent complete
dental 0
dentist  0
94.29530201342281percent complete
dental 0
dental 200
dentist  0
dentist  200
94.46308724832215percent complete
dental 0
dental 200
dentist  0
dentist  200
94.63087248322148percent complete
dental 0
dental 200
dentist  0
dentist  200
94.79865771812081percent comp

1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472


FileNotFoundError: [Errno 2] No such file or directory: './data/addresses_wo_dupsAR_addresses_wo_dups.csv'

df_small = pd.read_csv('./State CSVs/TN_df_small.csv')
df_small['location'] =""
df_small.reset_index(drop = True)

#listofprimaryaddresses
for index, elem in df_small.iterrows():
    try:
        dict = eval(df_small.loc[index, 'addresses'])[0]
        df_small.loc[index, 'addresses'] = str(dict['address_1'] + " " + dict['city'] + " " + dict['state']
                                              + " " + dict["postal_code"])
        
        #get last 5 digs of zip and do value counts and heat map and per capita dentsists per person
    except:
        print('errr')

In [5]:
df_drop = pd.read_csv('TN_cleaned_no_dups_geocoded.csv')

In [6]:
df_drop['geometry'] = ""
counter = 0 

for index, elem in df_drop.iterrows():
    try:    
        df_drop.loc[index, 'geometry'] = Point(pd.json_normalize(pd.json_normalize(
                eval(df_drop.loc[index, 'output']))['result.addressMatches'][0])['coordinates.x'],
        pd.json_normalize(pd.json_normalize(
                eval(df_drop.loc[index, 'output']))['result.addressMatches'][0])['coordinates.y'])
    except:
        print(f"none found at {index}")
        counter = counter+1
 
print(counter)

none found at 0
none found at 12
none found at 13
none found at 21
none found at 30
none found at 53
none found at 84
none found at 87
none found at 88
none found at 89
none found at 90
none found at 103
none found at 104
none found at 108
none found at 122
none found at 132
none found at 133
none found at 143
none found at 148
none found at 158
none found at 213
none found at 216
none found at 217
none found at 224
none found at 226
none found at 228
none found at 231
none found at 233
none found at 235
none found at 246
none found at 247
none found at 248
none found at 278
none found at 279
none found at 280
none found at 282
none found at 283
none found at 284
none found at 286
none found at 287
none found at 288
none found at 298
none found at 305
none found at 312
none found at 320
none found at 321
none found at 324
none found at 327
none found at 335
none found at 344
none found at 350
none found at 351
none found at 352
none found at 355
none found at 363
none found at 373
none

none found at 3020
none found at 3025
none found at 3030
none found at 3033
none found at 3034
none found at 3040
none found at 3056
none found at 3062
none found at 3064
none found at 3065
none found at 3070
none found at 3071
none found at 3072
none found at 3073
none found at 3077
none found at 3099
none found at 3102
none found at 3107
none found at 3108
none found at 3109
none found at 3111
none found at 3125
none found at 3126
none found at 3127
none found at 3128
none found at 3129
none found at 3138
none found at 3139
none found at 3140
none found at 3149
none found at 3157
none found at 3158
none found at 3163
none found at 3164
none found at 3165
none found at 3166
none found at 3167
none found at 3174
none found at 3175
none found at 3176
none found at 3177
none found at 3179
none found at 3180
none found at 3182
none found at 3189
none found at 3190
none found at 3192
none found at 3195
none found at 3198
none found at 3199
none found at 3224
none found at 3225
none found a

In [11]:
geo_df_drop = gpd.GeoDataFrame(df_drop)
geo_df_drop['lat'] = ""
geo_df_drop['long'] = ""

for index, elem in geo_df_drop.iterrows():
    try:
        geo_df_drop.loc[index, 'lat'] = geo_df_drop.loc[index, 'geometry'].y
        geo_df_drop.loc[index, 'long'] = geo_df_drop.loc[index, 'geometry'].x
    
    except:
        pass
                                            

In [10]:
geo_df_drop.drop_duplicates('lat', keep=False, inplace = True)


map_nash = folium.Map(location=[36.1627, -86.7816], zoom_start = 12)


locations = geo_df_drop[['lat', 'long']].values.tolist()
map_nash.add_child(
    FastMarkerCluster(locations)
)



#display our map
map_nash

In [None]:
m = folium.Map(location=[36.1627, -86.7816])
folium.Marker(
    [df_try_geo.geometry[1].y, df_try_geo.geometry[1].x ], popup="<i>ANTIOCH DENTAL</i>").add_to(m)

folium.Marker(
    [df_try_geo.geometry[2].y, df_try_geo.geometry[2].x ], popup="NASHVILLE DENTURES & IMPLANTS, PLLC").add_to(m)
m


In [112]:
#testing

new_df = pd.json_normalize(pd.json_normalize(eval(a))['result.addressMatches'][0])
new_df = new_df.transpose()
new_df


Unnamed: 0,0
matchedAddress,"225 MAIN AVE, DAYTON, TN, 37321"
coordinates.x,-85.01263
coordinates.y,35.49196
tigerLine.tigerLineId,59530811
tigerLine.side,L
addressComponents.fromAddress,171
addressComponents.toAddress,231
addressComponents.preQualifier,
addressComponents.preDirection,
addressComponents.preType,


In [None]:
locator = Nominatim(user_agent='dentist research')

# 1 - conveneint function to delay between geocoding calls
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)


a = locator.geocode('310 GREAT CIRCLE ROAD NASHVILLE TN')

# # 3 - create longitude, laatitude and altitude from location column (returns tuple)
#series_b = df_small['location'].apply(lambda loc: tuple(loc.point) if loc else None)



In [None]:
params = {'version' : '2.0',
             'taxonomy_description' : 'dentist',
             'state' : 'TN',
             'city': 'nashville',
             'limit' : '200',
             'address_purpose' : 'LOCATION',
             'skip' : '0',
         'pretty' : 'on'}


res = requests.get(endpoint, params)

# df = pd.json_normalize(res.json()['results'])

# df_new = pd.DataFrame()
# df_new = pd.json_normalize(res.json()['results'], record_path = ['addresses'], meta = ['enumeration_type',
#                                                                                       'number',
#                                                                                       'last_updated_epoch',
#                                                                                       'created_epoch',
#                                                                                       'other_names',
#                                                                                       'taxonomies',
#                                                                                       'identifiers',
                                                                                    
#                                                                                       'practiceLocations'],errors='ignore')
# df_new
# # df_nested_list = pd.json_normalize(df['results'], record_path = ['taxonomies'], meta = ['enumeration_type',
#                                                                                       'number',
#                                                                                       'last_updated_epoch',
#                                                                                       'created_epoch',
#                                                                                       'other_names',
#                                                                                       'addresses',
#                                                                                       'practiceLocations'],errors='ignore')

# df_nested_list = pd.json_normalize(df_nested_list, record_path = ['identifiers'], meta = ['enumeration_type',
#                                                                                       'number',
#                                                                                       'last_updated_epoch',
#                                                                                       'created_epoch',
#                                                                                       'other_names',
#                                                                                       'taxonomies',
#                                                                                       'addresses',
#                                                                                       'basic',
#                                                                                       'practiceLocations'],errors='ignore')



#df_nested_list


In [None]:
dentists = pd.DataFrame()

for index, elem in df.iterrows():
    dentists = dentists.append({'name' : df.loc[index, 'basic.name'],
                                'address': df.loc[index, 'addresses'][0]['address_1'], 
                                                'city' : df.loc[index, 'addresses'][0]['city'],
                                            'state' : df.loc[index, 'addresses'][0]['state']}, ignore_index=True)

    

dentists

In [126]:
#cleaning for batch upload

df_small = pd.read_csv('./State CSVs/TN_df_small.csv')
df_small['unique_id'] =""
df_small['address_1'] =""
df_small['city'] =""
df_small['state'] =""
df_small['zip_code'] =""
df_small.reset_index(drop = True)

#seperating address into seperate columns and cleaning for batch upload
for index, elem in df_small.iterrows():
    try:
        dict = eval(df_small.loc[index, 'addresses'])[0]
       
        unique_id = ""
        address = str(dict['address_1'])
    
    
        if str(dict['address_1'])[0] in ['0','1','2','3','4','5','6','7','8','9']: 
            try:
                unique_id = (re.match(r'(\d+)(?:-\d+(?=\s))?\s(.*)', str(dict['address_1'])).groups()[0])
                address = (re.match(r'(\d+)(?:-\d+(?=\s))?\s(.*)', str(dict['address_1'])).groups()[1])
            except:
                unique_id = ""
                address = str(dict['address_1'])
        
        df_small.loc[index, 'unique_id'] = unique_id
        df_small.loc[index, 'address_1'] = address
        df_small.loc[index, 'city'] = str(dict['city'])
        df_small.loc[index, 'state'] = str(dict['state'])
        df_small.loc[index, 'zip_code'] = str(dict['postal_code'])[:5]


        #df_small.loc[index, 'city'] = str(dict['address_1'] + " " + dict['city'] + " " + dict['state']
                #                              + " " + dict["postal_code"])
        
        #get last 5 digs of zip and do value counts and heat map and per capita dentsists per person
    except:
        print('errr')
TN_cleaned_addresses = df_small[['unique_id', 'address_1', 'city', 'state', 'zip_code']]

TN_cleaned_addresses.to_csv('TN_cleaned_addresses.csv', index=False)
TN_cleaned_addresses

Unnamed: 0,unique_id,address_1,city,state,zip_code
0,,RICHARDS RD,ANTIOCH,TN,37013
1,616,BELL RD,ANTIOCH,TN,37013
2,504,COLLINS PARK DR,ANTIOCH,TN,37013
3,940,RICHARDS ROAD,ANTIOCH,TN,37013
4,2711,MURFREESBORO RD.,ANTIOCH,TN,37013
...,...,...,...,...,...
5774,2781,AIRWAYS BLVD,MEMPHIS,TN,38132
5775,310,GREAT CIRCLE ROAD,NASHVILLE,TN,37243
5776,,MID-CUMBERLAND REGIONAL HEALTH DEPT,NASHVILLE,TN,37243
5777,710,HART LN,NASHVILLE,TN,37243


In [221]:
flatten(TN_df_small['addresses'])

AssertionError: flatten requires a dictionary input

In [200]:
TN_df_small = pd.read_csv('./State CSVs/TN_df_small.csv')

In [240]:
dictb = TN_df_small['addresses']
dict_flattened = (flatten(record[1], '.') for record in dictb)

pd.DataFrame(dict_flattened)

AssertionError: flatten requires a dictionary input

In [66]:
#drop duplicates

df_drop = pd.read_csv('TN_cleaned_addresses.csv').drop_duplicates(['address_1', 'city'], keep='first').reset_index(drop=True)
df_drop.reset_index(drop=True)
df_drop.shape

df_drop.to_csv('TN_cleaned_addresses.csv')