In [5]:
import pandas as pd
import googlemaps
import time
from tqdm import tqdm
import os
from dotenv import load_dotenv

In [6]:
# --- CONFIGURATION ---

# load_dotenv() reads the variables defined in a .env file and places them in
# the process environment, so the secret never has to appear in the notebook.
load_dotenv()

# Fetch the key from the environment; os.getenv() yields None when the
# variable has not been set.
API_KEY = os.getenv('GOOGLE_API_KEY')

# Stop right here when the key is missing or empty instead of continuing
# without credentials.
if API_KEY:
    print("API Key successfully loaded from .env file.")
else:
    raise ValueError("Google API Key not found. Make sure you have a .env file in the root directory with GOOGLE_API_KEY='YourKey'")

API Key successfully loaded from .env file.


In [7]:
# File locations: the input table, the final geocoded output, and the
# intermediate checkpoint that lets an interrupted run resume.
input_path = "../data/processed/nyc_sales_analysis_ready.parquet"
output_path = "../data/processed/manhattan_sales_geocoded.parquet"
checkpoint_path = "../data/processed/nyc_sales_geocoded_checkpoint.parquet"

# Google Maps client authenticated with API_KEY
gmaps = googlemaps.Client(key=API_KEY)

In [8]:
# --- 1. Load the Analysis-Ready Dataset ---
# Read the prepared table from parquet and report how many rows it holds.
df = pd.read_parquet(input_path)
print(f"Loaded {df.shape[0]} properties to geocode.")

# --- 2. Define the Geocoding Function ---
def geocode_address(address, borough):
    """
    Geocodes a single address using the Google Geocoding API.
    Includes the borough for better accuracy in NYC.

    Parameters
    ----------
    address : str
        Street address to look up. A missing (None/NaN/NA) or blank value is
        not sent to the API.
    borough : str
        Borough name appended to the query. When missing or blank it is
        left out of the query instead of being formatted as "nan"/"None".

    Returns
    -------
    tuple
        ``(lat, lng)`` taken from the first API result, or ``(None, None)``
        when the address is missing/blank, the API returns no result, or the
        lookup raises.
    """
    # Without this guard a missing address is formatted into the query as the
    # literal text "None"/"nan" (or as nothing at all when blank), and the
    # request is still sent; the API may then match on the borough alone and
    # hand back coordinates that do not belong to the property.
    if _is_blank(address):
        return None, None

    if _is_blank(borough):
        full_query = f"{address}, NY"
    else:
        full_query = f"{address}, {borough}, NY"

    try:
        geocode_result = gmaps.geocode(full_query)
        if geocode_result:
            location = geocode_result[0]['geometry']['location']
            return location['lat'], location['lng']
        else:
            return None, None
    except Exception as e:
        # Best-effort by design: one failed lookup is reported and recorded
        # as "no coordinates" rather than aborting the whole batch.
        print(f"An error occurred for address '{full_query}': {e}")
        return None, None


def _is_blank(value):
    """Return True for None/NaN/NA, or for a string that is empty once stripped."""
    if isinstance(value, str):
        return not value.strip()
    return bool(pd.isna(value))

# --- 3. Implement Checkpointing to Handle Interruptions ---
# A checkpoint file left by an earlier run lets this run skip every row it
# already covers; when there is none, the whole input is queued.
try:
    df_checkpoint = pd.read_parquet(checkpoint_path)
except FileNotFoundError:
    print("No checkpoint file found. Starting from scratch.")
    df_checkpoint = pd.DataFrame()
    df_to_process = df.copy()
else:
    # Index labels stored in the checkpoint identify the finished rows
    processed_indices = df_checkpoint.index
    # Keep only the input rows whose labels are not in the checkpoint
    df_to_process = df.drop(index=processed_indices)
    print(f"Loaded checkpoint. {len(df_checkpoint)} addresses already geocoded.")
    print(f"Resuming with {len(df_to_process)} remaining addresses.")


# --- 4. RUN ON THE FULL DATASET (WITH CHECKPOINTING) ---
# Number of geocoded rows between checkpoint writes.
CHECKPOINT_EVERY = 500
COORD_COLUMNS = ['latitude', 'longitude']


def _merge_with_checkpoint(results, pending_index, checkpoint):
    """
    Combine this run's results with the rows already held in the checkpoint.

    `results` has one {'latitude', 'longitude'} dict per processed row, in the
    order the rows were iterated, so the first len(results) labels of
    `pending_index` are the labels of those rows.
    """
    batch = pd.DataFrame(results, index=pending_index[:len(results)], columns=COORD_COLUMNS)
    # When no checkpoint existed the placeholder is an empty frame; leave it
    # out of the concat so it cannot influence the result's index or dtypes.
    if checkpoint.empty:
        return batch
    return pd.concat([checkpoint, batch])


if not df_to_process.empty:
    print(f"\n--- Starting geocoding for {len(df_to_process)} properties. This may take a while... ---")
    # Registers DataFrame.progress_apply; not used in this cell.
    tqdm.pandas(desc="Geocoding Progress")

    results = []
    saved_count = 0  # how many entries of `results` are already on disk
    try:
        for index, row in tqdm(df_to_process.iterrows(), total=len(df_to_process)):
            lat, lng = geocode_address(row['address'], row['borough_name'])
            results.append({'latitude': lat, 'longitude': lng})

            # Periodic save so a crash loses at most CHECKPOINT_EVERY lookups
            if len(results) % CHECKPOINT_EVERY == 0:
                _merge_with_checkpoint(results, df_to_process.index, df_checkpoint).to_parquet(checkpoint_path)
                saved_count = len(results)
    finally:
        # Flush whatever has not been written yet. This runs both on normal
        # completion (the trailing partial batch, which the periodic save
        # never reaches) and when the loop is interrupted, so finished
        # lookups are not repeated on the next run.
        if len(results) > saved_count:
            _merge_with_checkpoint(results, df_to_process.index, df_checkpoint).to_parquet(checkpoint_path)

    # Previously checkpointed rows plus everything geocoded in this run
    df_final = _merge_with_checkpoint(results, df_to_process.index, df_checkpoint)

    # --- 5. Save the Final Enriched Data ---
    print(f"\nGeocoding complete. Saving {len(df_final)} rows to {output_path}")
    # Join the lat/lon data back to the original dataframe on the index
    df_geocoded = df.join(df_final[['latitude', 'longitude']])
    df_geocoded.to_parquet(output_path)
    print("Final file saved successfully!")

    # Optional: remove the checkpoint file after successful completion
    # os.remove(checkpoint_path)
else:
    print("\nAll addresses have already been geocoded. Loading final file.")
    if os.path.exists(output_path):
        df_geocoded = pd.read_parquet(output_path)
    else:
        # The checkpoint covers every row but the output file is absent
        # (e.g. a previous run stopped after the checkpoint write and before
        # the final save): rebuild the output from the checkpoint.
        df_geocoded = df.join(df_checkpoint.reindex(columns=COORD_COLUMNS))
        df_geocoded.to_parquet(output_path)

display(df_geocoded.head())

Loaded 6505 properties to geocode.
Loaded checkpoint. 6500 addresses already geocoded.
Resuming with 5 remaining addresses.

--- Starting geocoding for 5 properties. This may take a while... ---


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.29it/s]


Geocoding complete. Saving 6505 rows to ../data/processed/manhattan_sales_geocoded.parquet
Final file saved successfully!





Unnamed: 0,borough,neighborhood,building_class_category,tax_class_at_present,block,lot,easement,building_class_at_present,address,apartment_number,...,year_built,tax_class_at_time_of_sale,building_class_at_time_of_sale,sale_price,sale_date,borough_name,age,price_per_sqft,latitude,longitude
7664,1.0,ALPHABET CITY,01 ONE FAMILY DWELLINGS,1,376,43,,S1,743 EAST 6TH STREET,,...,1940,1.0,S1,3200000.0,2019-07-24,Manhattan,85.0,869.565217,40.722937,-73.977391
7667,1.0,ALPHABET CITY,03 THREE FAMILY DWELLINGS,1,377,56,,C0,263 EAST 7TH STREET,,...,1899,1.0,C0,6300000.0,2019-04-30,Manhattan,126.0,1750.0,40.723652,-73.977354
7801,1.0,CHELSEA,01 ONE FAMILY DWELLINGS,1,720,6,,A4,483 WEST 22ND STREET,,...,1901,1.0,A4,6725000.0,2019-09-03,Manhattan,124.0,1620.481928,40.746989,-74.004069
7802,1.0,CHELSEA,01 ONE FAMILY DWELLINGS,1,764,49,,A9,218 WEST 15TH STREET,,...,1910,1.0,A9,2421900.0,2019-05-22,Manhattan,115.0,615.476493,40.739534,-74.000271
7803,1.0,CHELSEA,01 ONE FAMILY DWELLINGS,1,768,8,,A4,253 WEST 18TH STREET,,...,1901,1.0,A4,12709140.0,2019-04-11,Manhattan,124.0,1819.750859,40.74214,-74.000014
