In [27]:
import pandas as pd

# Load csv/food_rows.csv into `df`, the frame that every later cell works on.
df = pd.read_csv('csv/food_rows.csv')

In [28]:
# Coerce both coordinate columns to numeric values; entries that cannot be
# parsed become NaN instead of raising.
for coord_col in ('LATITUDE', 'LONGITUDE'):
    df[coord_col] = pd.to_numeric(df[coord_col], errors='coerce')

In [30]:
# Show each column's dtype (LATITUDE / LONGITUDE should now be float64).
df.dtypes

ID                int64
TYPE             object
PROGRAM          object
ORG PHONE        object
DISTADDDI        object
TB               object
DRIOSTZIP        object
DAYS             object
HOURS            object
FULL_ADDRESS     object
LONGITUDE       float64
LATITUDE        float64
dtype: object

In [31]:
# Collect the rows in which at least one of the two coordinates is missing
# (NaN), then print them for inspection.
invalid_coords = df[df[['LATITUDE', 'LONGITUDE']].isna().any(axis=1)]
print(invalid_coords)

Empty DataFrame
Columns: [ID, TYPE, PROGRAM, ORG PHONE, DISTADDDI, TB, DRIOSTZIP, DAYS, HOURS, FULL_ADDRESS, LONGITUDE, LATITUDE]
Index: []


In [32]:
# Preview only the two coordinate columns.
df.loc[:, ['LATITUDE', 'LONGITUDE']]

Unnamed: 0,LATITUDE,LONGITUDE
0,40.693683,-73.968168
1,40.654492,-73.955831
2,40.668113,-73.912934
3,40.576623,-73.963443
4,40.668113,-73.912934
...,...,...
552,40.660046,-73.922280
553,40.652073,-74.004772
554,40.735740,-73.850750
555,40.668113,-73.912934


In [33]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
from tqdm import tqdm
import pandas as pd

# Initialize geolocator
# A single Nominatim client is created once and shared by every lookup below;
# the user_agent string is what identifies this notebook to the service.
# NOTE(review): "my_app" is a generic identifier. Nominatim's usage policy
# reportedly expects an application-specific user agent — confirm and replace.
geolocator = Nominatim(user_agent="my_app")

# Function to get address from coordinates
def get_address(lat, lon, geolocator, retries=3):
    """Reverse-geocode one coordinate pair, retrying on timeouts.

    Args:
        lat: Latitude of the point to look up.
        lon: Longitude of the point to look up.
        geolocator: Object exposing ``reverse((lat, lon), timeout=...)``,
            e.g. the Nominatim client created in this notebook.
        retries: Maximum number of attempts made before giving up.

    Returns:
        ``(address, address_parts)`` where ``address`` is the formatted
        address string and ``address_parts`` is the ``'address'`` dict from
        the raw response (``{}`` if that key is absent). ``(None, None)`` is
        returned when the service has no result for the point or when every
        attempt timed out.
    """
    for attempt in range(retries):
        try:
            location = geolocator.reverse((lat, lon), timeout=10)
        except GeocoderTimedOut:
            # Pause between attempts only: sleeping after the final failure
            # would just delay the (None, None) result with no retry to follow.
            if attempt < retries - 1:
                time.sleep(1)
            continue
        if location:
            return location.address, location.raw.get('address', {})
        # The service answered but found nothing; retrying would not help.
        return None, None
    return None, None

# Cache to store results, keyed by the (lat, lon) tuple
cache = {}

def cached_address(lat, lon):
    """Return the reverse-geocoded address for a point, memoising successes.

    Many rows share the same coordinates, so each distinct ``(lat, lon)`` pair
    is sent to ``get_address`` at most once after it has succeeded.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns:
        The ``(address, address_parts)`` tuple produced by ``get_address``;
        ``(None, None)`` when the lookup yielded nothing.
    """
    coords = (lat, lon)
    if coords in cache:
        return cache[coords]
    address, address_parts = get_address(lat, lon, geolocator)
    # Only remember successful lookups. A (None, None) result may come from
    # exhausted timeouts, and caching it would leave every later row with the
    # same coordinates blank without ever asking the service again.
    if address is not None:
        cache[coords] = (address, address_parts)
    return address, address_parts

# Make sure both coordinate columns are numeric (unparseable entries -> NaN)
for coord_col in ('LATITUDE', 'LONGITUDE'):
    df[coord_col] = pd.to_numeric(df[coord_col], errors='coerce')

# Create the output columns up front so rows that get skipped keep None
for new_col in ('FULL ADDRESS', 'BOROUGH', 'ZICODE'):
    df[new_col] = None

# Reverse-geocode every row and fill the new columns as results arrive
start_time = time.time()
row_coords = zip(df.index, df['LATITUDE'].tolist(), df['LONGITUDE'].tolist())
for index, latitude, longitude in tqdm(row_coords, total=len(df)):
    if pd.isna(latitude) or pd.isna(longitude):
        print(f"Skipping invalid coordinates at index {index}: ({latitude}, {longitude})")
        continue
    full_address, address_parts = cached_address(latitude, longitude)
    df.at[index, 'FULL ADDRESS'] = full_address
    if not address_parts:
        # No structured address came back; leave BOROUGH / ZICODE as None
        continue
    # The response's 'suburb' field is stored as the borough, 'postcode' as the zip
    df.at[index, 'BOROUGH'] = address_parts.get('suburb')
    df.at[index, 'ZICODE'] = address_parts.get('postcode')

print(f"Processing completed in {time.time() - start_time:.2f} seconds")

100%|██████████| 557/557 [00:54<00:00, 10.21it/s]

Processing completed in 54.57 seconds





In [35]:
# Show the current column names: the loaded columns plus the three added by
# the geocoding cell.
df.columns

Index(['ID', 'TYPE', 'PROGRAM', 'ORG PHONE', 'DISTADDDI', 'TB', 'DRIOSTZIP',
       'DAYS', 'HOURS', 'FULL_ADDRESS', 'LONGITUDE', 'LATITUDE',
       'FULL ADDRESS', 'BOROUGH', 'ZICODE'],
      dtype='object')

In [36]:
# Remove four of the source columns from `df` in place; they are not kept in
# the final output.
cols = ['FULL_ADDRESS', 'DISTADDDI', 'TB', 'DRIOSTZIP']
df.drop(columns=cols, inplace=True)

In [37]:
# Display the frame as it stands after the column drop.
df

Unnamed: 0,ID,TYPE,PROGRAM,ORG PHONE,DAYS,HOURS,LONGITUDE,LATITUDE,FULL ADDRESS,BOROUGH,ZICODE
0,80015,FP,THE BIBLE CHURCH OF CHRIST,718-293-1928,"SUN (2,4)",4-6PM,-73.968168,40.693683,"138, Waverly Avenue, Clinton Hill, Brooklyn, K...",Brooklyn,11205
1,80026,FP,BRONX SEVENTH DAY ADVENTIST CHURCH,646-353-8926,THUR,2-5PM,-73.955831,40.654492,"Bethanie Eglise Adventiste du 7eme Jour, 2059-...",Brooklyn,11226
2,80029,FP,BRONX TEMPLE SEVENTH DAY ADVENTIST CHURCH,718-842-4504,"M,W,THUR",11:30AM-2:30PM,-73.912934,40.668113,"521, Thomas S. Boyland Street, Brownsville, Br...",Brooklyn,11212
3,80065,FP,GETHSEMANE BAPTIST CHURCH,347-948-0772,THUR,8-10AM,-73.963443,40.576623,"Ocean View Jewish Center, 3100, Brighton 4th S...",Brooklyn,11235
4,80081,FP,HOLY TABERNACLE CHURCH INC.,(718) 293-9862,TUE,4:30-5PM,-73.912934,40.668113,"521, Thomas S. Boyland Street, Brownsville, Br...",Brooklyn,11212
...,...,...,...,...,...,...,...,...,...,...,...
552,87090,FP,MOSAIC BEACON COMMUNITY CENTER,917-962-9955,FRI,3-5PM,-73.922280,40.660046,"1089, Clarkson Avenue, Brooklyn Community Dist...",Brooklyn,11212
553,87208,FP,ISAACS HOLMES FOOD PANTRY,212-360-7620,WED,9:30-11AM,-74.004772,40.652073,"443, 39th Street, Brooklyn, Kings County, City...",Brooklyn,11232
554,87233,FP,FOREST HILLS SENIORS CENTER,929-349-9740,THUR,8:30-10:30AM,-73.850750,40.735740,"108-03, 62nd Drive, Forest Hills, Queens, Quee...",Queens,11375
555,87341,FP,CHURCH OF GOD OF SALVATION,718-693-6486,WED/THUR,2-4PM/10AM-4PM,-73.912934,40.668113,"521, Thomas S. Boyland Street, Brownsville, Br...",Brooklyn,11212


In [38]:
# Persist the result without the index column.
# NOTE(review): this writes to the same path the notebook reads from in its
# first cell, so the source CSV is replaced by the geocoded frame (with the
# dropped columns gone). A later re-run from the top would then fail at the
# drop cell, because those columns no longer exist in the file. Consider
# writing to a separate output file — confirm nothing downstream depends on
# this path first.
df.to_csv('csv/food_rows.csv', index=False)