In [1]:
import pandas as pd

df = pd.read_csv('csv/dental.csv')
df

Unnamed: 0,Clinic Name,Address,"City, State, Zip",Phone,Borough,id
0,Sun River Health - Bay Street,57 Bay St,"Staten Island, NY 10301",855-681-8700,Staten Island,1
1,Community Health Center of Richmond,439 Port Richmond Ave,"Staten Island, NY 10302",917-830-0838,Staten Island,2
2,Metro Community Health Center - Staten Island,2324 Forest Ave,"Staten Island, NY 1030 3",718-447-0200,Staten Island,3
3,Community Health Center of Richmond Stapleton -,"St. George, 135 Canal St, Suite 200","Staten Island, NY 10304",917-830-1950,Staten Island,4
4,Staten Island University Hospital Northwell,475 Seaview Ave,"Staten Island, NY 10305",718-226-9080,Staten Island,5
...,...,...,...,...,...,...
108,"Sunset Terrace Family Health Center, NYU Lango...",514 49th St,,718-431-2622,Brooklyn,109
109,Adapt Community Network,175 Lawrence Ave,"Brooklyn, NY 11230",718-436-7600,Brooklyn,110
110,Joseph P. Addabbo Family Health Center,120 Richard St,,718-945-7150,Brooklyn,111
111,"Wycoff Heights Medical Center, Department of","Dental Medicine, 374 Stockholm St","Brooklyn, NY 11237",718-963-7174,Brooklyn,112


In [2]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
import time
import re

# Initialize geocoder
geolocator = Nominatim(user_agent="my_app")



# Clean the address column
df['Full Address'] = df['Address'] + ', ' + df['City, State, Zip']
df['LONGITUDE'] = None
df['LATITUDE'] = None

# Function to get coordinates with retry logic
def get_coordinates(address, geolocator, retries=3):
    for _ in range(retries):
        try:
            location = geolocator.geocode(address, timeout=10)
            if location:
                return location.latitude, location.longitude
            else:
                return None, None
        except GeocoderTimedOut:
            time.sleep(1)
    return None, None

# Apply geocoding with progress bar
for idx, address in tqdm(enumerate(df['Full Address']), total=len(df), desc="Geocoding Addresses"):
    latitude, longitude = get_coordinates(address, geolocator)
    df.at[idx, 'LATITUDE'] = latitude
    df.at[idx, 'LONGITUDE'] = longitude

# Print the updated DataFrame
print(df[['Full Address', 'LATITUDE', 'LONGITUDE']])

Geocoding Addresses: 100%|██████████| 113/113 [01:20<00:00,  1.40it/s]

                                          Full Address   LATITUDE  LONGITUDE
0                   57 Bay St, Staten Island, NY 10301  40.638722 -74.074764
1       439 Port Richmond Ave, Staten Island, NY 10302   40.63138 -74.138315
2            2324 Forest Ave, Staten Island, NY 1030 3       None       None
3    St. George, 135 Canal St, Suite 200, Staten Is...       None       None
4             475 Seaview Ave, Staten Island, NY 10305  40.585438 -74.084764
..                                                 ...        ...        ...
108                                                NaN  46.314475  11.048029
109               175 Lawrence Ave, Brooklyn, NY 11230  40.630978 -73.973463
110                                                NaN  46.314475  11.048029
111  Dental Medicine, 374 Stockholm St, Brooklyn, N...       None       None
112                                                NaN  46.314475  11.048029

[113 rows x 3 columns]





In [3]:
nan_latitude_count = df['LATITUDE'].isna().sum()
nan_longitude_count = df['LONGITUDE'].isna().sum()

nan_latitude_count, nan_longitude_count

(55, 55)

In [4]:
df_nan_latitude = df[df['LATITUDE'].isna()]
df_nan_latitude

Unnamed: 0,Clinic Name,Address,"City, State, Zip",Phone,Borough,id,Full Address,LONGITUDE,LATITUDE
2,Metro Community Health Center - Staten Island,2324 Forest Ave,"Staten Island, NY 1030 3",718-447-0200,Staten Island,3,"2324 Forest Ave, Staten Island, NY 1030 3",,
3,Community Health Center of Richmond Stapleton -,"St. George, 135 Canal St, Suite 200","Staten Island, NY 10304",917-830-1950,Staten Island,4,"St. George, 135 Canal St, Suite 200, Staten Is...",,
5,161st Street Dental Clinic (Montefiore),"305 East 161st St, Lower Level","Bronx, NY 10451",718-579-2535,Bronx,6,"305 East 161st St, Lower Level, Bronx, NY 10451",,
7,Sun River Health - Inwood,1543- 45 Inwood Ave,"Bronx, NY 10452",855-681-8700,Bronx,8,"1543- 45 Inwood Ave, Bronx, NY 10452",,
8,Morrisania Diagnostic and Treatment Center *,"1225 Gerard Ave, 3rd Floor","Bronx, NY 10452",718-960-2911,Bronx,9,"1225 Gerard Ave, 3rd Floor, Bronx, NY 10452",,
13,Walton Family Health Practice Dental Department,"(The Institute for Family Health), 1894 Walton...","Bronx, NY 10453",718-583-2700,Bronx,14,"(The Institute for Family Health), 1894 Walton...",,
16,"BronxCare Dr. Martin Luther King, Jr. Health C...","1265 Franklin Ave, 3rd Floor","Bronx, NY 10456",718-992-7669,Bronx,17,"1265 Franklin Ave, 3rd Floor, Bronx, NY 10456",,
23,Children’s Aid Society - Bronx Health Services,"910 East 172nd St, 3rd Floor","Bronx, NY 10460",347-767-2000,Bronx,24,"910 East 172nd St, 3rd Floor, Bronx, NY 10460",,
24,Blondell Dental Clinic (Montefiore),"1575 Blondell Ave, Suite 150,, DENT AL CLINICS...","Bronx, NY 10461",718-405-8190,Bronx,25,"1575 Blondell Ave, Suite 150,, DENT AL CLINICS...",,
25,Jarrett Pediatric Clinic (Montefiore),"1516 Jarrett Place, Lower Level, Suite 100","Bronx, NY 10461",718-405-8194,Bronx,26,"1516 Jarrett Place, Lower Level, Suite 100, Br...",,


In [5]:
# Function to remove content inside parentheses
def remove_parentheses(text):
    return re.sub(r'\([^)]*\)', '', text).strip()

# Apply the function to remove parentheses from the 'Full Address' column
df_nan_latitude['Cleaned Address'] = df_nan_latitude['Address'].apply(remove_parentheses)

# Display the cleaned addresses
df_nan_latitude[['Address', 'Cleaned Address']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nan_latitude['Cleaned Address'] = df_nan_latitude['Address'].apply(remove_parentheses)


Unnamed: 0,Address,Cleaned Address
2,2324 Forest Ave,2324 Forest Ave
3,"St. George, 135 Canal St, Suite 200","St. George, 135 Canal St, Suite 200"
5,"305 East 161st St, Lower Level","305 East 161st St, Lower Level"
7,1543- 45 Inwood Ave,1543- 45 Inwood Ave
8,"1225 Gerard Ave, 3rd Floor","1225 Gerard Ave, 3rd Floor"
13,"(The Institute for Family Health), 1894 Walton...",", 1894 Walton Ave, 2nd Floor"
16,"1265 Franklin Ave, 3rd Floor","1265 Franklin Ave, 3rd Floor"
23,"910 East 172nd St, 3rd Floor","910 East 172nd St, 3rd Floor"
24,"1575 Blondell Ave, Suite 150,, DENT AL CLINICS...","1575 Blondell Ave, Suite 150,, DENT AL CLINICS..."
25,"1516 Jarrett Place, Lower Level, Suite 100","1516 Jarrett Place, Lower Level, Suite 100"


In [6]:
# Function to remove leading commas from the Cleaned Address column
def remove_leading_comma(text):
    return text.lstrip(',')

# Apply the function to remove leading commas
df_nan_latitude['Cleaned Address'] = df_nan_latitude['Cleaned Address'].apply(remove_leading_comma)

# Display the first few rows to confirm the changes
df_nan_latitude[['Address', 'Cleaned Address']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nan_latitude['Cleaned Address'] = df_nan_latitude['Cleaned Address'].apply(remove_leading_comma)


Unnamed: 0,Address,Cleaned Address
2,2324 Forest Ave,2324 Forest Ave
3,"St. George, 135 Canal St, Suite 200","St. George, 135 Canal St, Suite 200"
5,"305 East 161st St, Lower Level","305 East 161st St, Lower Level"
7,1543- 45 Inwood Ave,1543- 45 Inwood Ave
8,"1225 Gerard Ave, 3rd Floor","1225 Gerard Ave, 3rd Floor"
13,"(The Institute for Family Health), 1894 Walton...","1894 Walton Ave, 2nd Floor"
16,"1265 Franklin Ave, 3rd Floor","1265 Franklin Ave, 3rd Floor"
23,"910 East 172nd St, 3rd Floor","910 East 172nd St, 3rd Floor"
24,"1575 Blondell Ave, Suite 150,, DENT AL CLINICS...","1575 Blondell Ave, Suite 150,, DENT AL CLINICS..."
25,"1516 Jarrett Place, Lower Level, Suite 100","1516 Jarrett Place, Lower Level, Suite 100"


In [8]:
from tqdm import tqdm

for idx, address in tqdm(enumerate(df_nan_latitude['Cleaned Address']), total=len(df_nan_latitude), desc="Geocoding Addresses"):
    latitude, longitude = get_coordinates(address, geolocator)
    df_nan_latitude.at[idx, 'LATITUDE'] = latitude
    df_nan_latitude.at[idx, 'LONGITUDE'] = longitude

# Print the updated DataFrame with the relevant columns
print(df_nan_latitude[['Cleaned Address', 'LATITUDE', 'LONGITUDE']])

Geocoding Addresses: 100%|██████████| 79/79 [01:03<00:00,  1.25it/s]

                        Cleaned Address   LATITUDE  LONGITUDE
2                       2324 Forest Ave       None       None
3   St. George, 135 Canal St, Suite 200  40.860465 -73.038858
5        305 East 161st St, Lower Level   51.95588   1.326434
7                   1543- 45 Inwood Ave       None       None
8           1225 Gerard Ave, 3rd  Floor       None       None
..                                  ...        ...        ...
69                                  NaN  46.314475  11.048029
75                                  NaN  46.314475  11.048029
76                                  NaN  46.314475  11.048029
77                                  NaN  46.314475  11.048029
78                                  NaN  46.314475  11.048029

[90 rows x 3 columns]





In [12]:
df_nan_latitude

Unnamed: 0,Clinic Name,Address,"City, State, Zip",Phone,Borough,id,Full Address,LONGITUDE,LATITUDE,Cleaned Address
2,Metro Community Health Center - Staten Island,2324 Forest Ave,"Staten Island, NY 1030 3",718-447-0200,Staten Island,3.0,"2324 Forest Ave, Staten Island, NY 1030 3",,,2324 Forest Ave
3,Community Health Center of Richmond Stapleton -,"St. George, 135 Canal St, Suite 200","Staten Island, NY 10304",917-830-1950,Staten Island,4.0,"St. George, 135 Canal St, Suite 200, Staten Is...",-73.038858,40.860465,"St. George, 135 Canal St, Suite 200"
5,161st Street Dental Clinic (Montefiore),"305 East 161st St, Lower Level","Bronx, NY 10451",718-579-2535,Bronx,6.0,"305 East 161st St, Lower Level, Bronx, NY 10451",1.326434,51.95588,"305 East 161st St, Lower Level"
7,Sun River Health - Inwood,1543- 45 Inwood Ave,"Bronx, NY 10452",855-681-8700,Bronx,8.0,"1543- 45 Inwood Ave, Bronx, NY 10452",,,1543- 45 Inwood Ave
8,Morrisania Diagnostic and Treatment Center *,"1225 Gerard Ave, 3rd Floor","Bronx, NY 10452",718-960-2911,Bronx,9.0,"1225 Gerard Ave, 3rd Floor, Bronx, NY 10452",,,"1225 Gerard Ave, 3rd Floor"
...,...,...,...,...,...,...,...,...,...,...
69,,,,,,,,11.048029,46.314475,
75,,,,,,,,11.048029,46.314475,
76,,,,,,,,11.048029,46.314475,
77,,,,,,,,11.048029,46.314475,


In [13]:
df_nan_latitude.dropna(subset=['Clinic Name'], inplace=True)

In [24]:
df_nan_latitude.columns

Index(['Clinic Name', 'City, State, Zip', 'Phone', 'Borough', 'id',
       'Full Address', 'LONGITUDE', 'LATITUDE', 'Address'],
      dtype='object')

In [23]:
df_nan_latitude.drop(columns=['Address'], inplace=True)

# Rename 'Cleaned Address' to 'Address'
df_nan_latitude.rename(columns={'Cleaned Address': 'Address'}, inplace=True)


In [22]:
df.columns

Index(['Clinic Name', 'Address', 'City, State, Zip', 'Phone', 'Borough', 'id',
       'Full Address', 'LONGITUDE', 'LATITUDE'],
      dtype='object')

In [31]:
# Step 1: Create a subset of df without NaN values in LATITUDE and LONGITUDE
df_no_nan = df.dropna(subset=['LATITUDE', 'LONGITUDE'])

# Step 2: Append df_nan_latitude to this subset
df_combined = pd.concat([df_no_nan, df_nan_latitude], ignore_index=True)

# Display the first few rows of the combined DataFrame to confirm
df_combined.head()

Unnamed: 0,Clinic Name,Address,"City, State, Zip",Phone,Borough,id,Full Address,LONGITUDE,LATITUDE
0,Sun River Health - Bay Street,57 Bay St,"Staten Island, NY 10301",855-681-8700,Staten Island,1.0,"57 Bay St, Staten Island, NY 10301",-74.074764,40.638722
1,Community Health Center of Richmond,439 Port Richmond Ave,"Staten Island, NY 10302",917-830-0838,Staten Island,2.0,"439 Port Richmond Ave, Staten Island, NY 10302",-74.138315,40.63138
2,Community Health Center of Richmond Stapleton -,"St. George, 135 Canal St, Suite 200","Staten Island, NY 10304",917-830-1950,Staten Island,4.0,"St. George, 135 Canal St, Suite 200, Staten Is...",-73.038858,40.860465
3,Staten Island University Hospital Northwell,475 Seaview Ave,"Staten Island, NY 10305",718-226-9080,Staten Island,5.0,"475 Seaview Ave, Staten Island, NY 10305",-74.084764,40.585438
4,161st Street Dental Clinic (Montefiore),"305 East 161st St, Lower Level","Bronx, NY 10451",718-579-2535,Bronx,6.0,"305 East 161st St, Lower Level, Bronx, NY 10451",1.326434,51.95588


In [32]:
df_combined

Unnamed: 0,Clinic Name,Address,"City, State, Zip",Phone,Borough,id,Full Address,LONGITUDE,LATITUDE
0,Sun River Health - Bay Street,57 Bay St,"Staten Island, NY 10301",855-681-8700,Staten Island,1.0,"57 Bay St, Staten Island, NY 10301",-74.074764,40.638722
1,Community Health Center of Richmond,439 Port Richmond Ave,"Staten Island, NY 10302",917-830-0838,Staten Island,2.0,"439 Port Richmond Ave, Staten Island, NY 10302",-74.138315,40.63138
2,Community Health Center of Richmond Stapleton -,"St. George, 135 Canal St, Suite 200","Staten Island, NY 10304",917-830-1950,Staten Island,4.0,"St. George, 135 Canal St, Suite 200, Staten Is...",-73.038858,40.860465
3,Staten Island University Hospital Northwell,475 Seaview Ave,"Staten Island, NY 10305",718-226-9080,Staten Island,5.0,"475 Seaview Ave, Staten Island, NY 10305",-74.084764,40.585438
4,161st Street Dental Clinic (Montefiore),"305 East 161st St, Lower Level","Bronx, NY 10451",718-579-2535,Bronx,6.0,"305 East 161st St, Lower Level, Bronx, NY 10451",1.326434,51.95588
...,...,...,...,...,...,...,...,...,...
132,Interfaith Medical Center - Smile Brooklyn Dental,"Clinic, 1545 Atlantic Ave","Brooklyn, NY 11213",718-613-7140,Brooklyn,98.0,"Clinic, 1545 Atlantic Ave, Brooklyn, NY 11213",,
133,NYU Langone Family Health Center Park Slope,"220 13th St,,, August 202 3, *This clinic is u...","Brooklyn, NY 11215",718-832-5980,Brooklyn,99.0,"220 13th St,,, August 202 3, *This clinic is u...",,
134,NY Presbyterian Methodist Brooklyn Dental,"Services, 506 6th St, Kirkwood Pavilion, 1st F...","Brooklyn, NY 11215",718-780-5410,Brooklyn,100.0,"Services, 506 6th St, Kirkwood Pavilion, 1st F...",,
135,Maimonides Medical Center,"4802 10th Ave, Admin Bldg, 2nd Floor","Brooklyn, NY 11219",718-283-2084,Brooklyn,105.0,"4802 10th Ave, Admin Bldg, 2nd Floor, Brooklyn...",,


In [33]:
# Check the number of NaN values in LATITUDE and LONGITUDE in the combined DataFrame
nan_count_latitude = df_combined['LATITUDE'].isna().sum()
nan_count_longitude = df_combined['LONGITUDE'].isna().sum()

print(f"NaN values in LATITUDE: {nan_count_latitude}")
print(f"NaN values in LONGITUDE: {nan_count_longitude}")

NaN values in LATITUDE: 31
NaN values in LONGITUDE: 31


In [34]:
df_combined.to_csv('dental.csv', index=False)