In [10]:
import pandas as pd
import requests
from tqdm import tqdm
import os
from dotenv import load_dotenv
import time

In [11]:
# --- 1. CONFIGURATION & SETUP ---
# Load variables from the .env file, then read the Walk Score credential.
load_dotenv()
API_KEY = os.environ.get('WALKSCORE_API_KEY')

# Fail fast: the API calls further down cannot work without a key.
if API_KEY:
    print("Walk Score API Key successfully loaded.")
else:
    raise ValueError("Walk Score API Key not found in .env file.")

# Parquet locations: input read by this notebook, enriched output it writes,
# and the intermediate scores checkpoint.
input_path, output_path, checkpoint_path = (
    '../data/processed/manhattan_sales_transit.parquet',
    '../data/processed/manhattan_sales_walkscore.parquet',
    '../data/processed/walkscore_checkpoint.parquet',
)

Walk Score API Key successfully loaded.


In [12]:
# --- 2. LOAD DATA ---
# Read the input parquet; a missing file is re-raised with a hint about which
# notebook has to be run first.
try:
    df = pd.read_parquet(input_path)
except FileNotFoundError:
    raise FileNotFoundError("Run the 03_... transit notebook first.")
else:
    print(f"Loaded {df.shape[0]} properties with transit data.")

Loaded 6505 properties with transit data.


In [13]:
def get_all_scores(address, lat, lon):
    """
    Look up the Walk Score, Transit Score and Bike Score for one property.

    A single request is sent to the Walk Score API with ``transit=1`` and
    ``bike=1`` so that all three scores are requested together.

    Parameters
    ----------
    address : str
        Street address of the property, sent as the ``address`` parameter.
    lat, lon : float
        Coordinates of the property. If either one is missing (NaN/None) no
        request is made.

    Returns
    -------
    tuple
        ``(walk_score, transit_score, bike_score)``. A score the response does
        not contain is ``None``. ``(None, None, None)`` is returned when the
        coordinates are missing, when the response status is not 1, or when
        the request or JSON decoding fails.

    Notes
    -----
    Failures are printed and reported as ``None`` instead of being raised, so
    one bad row does not abort a long batch run. The API key is masked in the
    printed message because ``requests`` error text includes the request URL.
    """
    no_scores = (None, None, None)

    if pd.isna(lat) or pd.isna(lon):
        return no_scores

    # Passing the query as `params` lets requests do the URL encoding, and it
    # happens inside the try block below, so an unencodable value is handled
    # like any other per-row failure.
    query = {
        'format': 'json',
        'address': address,
        'lat': lat,
        'lon': lon,
        'transit': 1,
        'bike': 1,
        'wsapikey': API_KEY,
    }

    try:
        # HTTPS: the query string carries the API key.
        response = requests.get('https://api.walkscore.com/score', params=query, timeout=10)
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        message = str(e)
        if API_KEY:
            # Keep the credential out of the (often shared) notebook output.
            message = message.replace(API_KEY, '***')
        print(f"An error occurred for address '{address}': {message}")
        return no_scores

    if not isinstance(data, dict) or data.get('status') != 1:
        return no_scores

    # 'transit' / 'bike' may be absent or null for a given location; neither
    # case should cost us the walk score.
    transit = data.get('transit')
    bike = data.get('bike')
    transit_score = transit.get('score') if isinstance(transit, dict) else None
    bike_score = bike.get('score') if isinstance(bike, dict) else None

    return data.get('walkscore'), transit_score, bike_score


In [14]:
# --- 4. IMPLEMENT CHECKPOINTING ---
# Resume support: scores saved by an earlier run are loaded here, and only the
# properties whose index label is not in that file are queued for the API.
try:
    df_checkpoint = pd.read_parquet(checkpoint_path)
except FileNotFoundError:
    print("No Walk Score checkpoint file found. Starting fresh.")
    # Bind an empty frame so that a stale `df_checkpoint` left in the kernel
    # by an earlier run cannot be picked up by later cells.
    df_checkpoint = pd.DataFrame(columns=['walk_score', 'transit_score', 'bike_score'])
    df_to_process = df.copy()
else:
    processed_indices = df_checkpoint.index
    # Boolean mask instead of df.drop(): drop() raises KeyError when the
    # checkpoint holds labels that are absent from df (e.g. the input file
    # was regenerated since the checkpoint was written).
    df_to_process = df.loc[~df.index.isin(processed_indices)].copy()
    print(f"Loaded checkpoint. {len(df_checkpoint)} properties already processed for Walk Score.")
    print(f"Resuming with {len(df_to_process)} remaining properties.")

No Walk Score checkpoint file found. Starting fresh.


In [15]:
# --- 5. EXECUTE ON REMAINING PROPERTIES ---
# The remaining properties are scored in batches and the checkpoint file is
# rewritten after every batch, so an interrupted run loses at most one batch
# of API calls instead of the whole run.
SCORE_COLUMNS = ['walk_score', 'transit_score', 'bike_score']
CHECKPOINT_EVERY = 250  # properties scored between two checkpoint writes

if not df_to_process.empty:
    print(f"\n--- Starting Walk Score enrichment for {len(df_to_process)} properties. ---")

    # Seed the result with whatever an earlier run already saved.
    if 'df_checkpoint' in globals() and not df_checkpoint.empty:
        score_frames = [df_checkpoint]
    else:
        score_frames = []

    with tqdm(total=len(df_to_process), desc="Getting All Scores") as progress:
        for start in range(0, len(df_to_process), CHECKPOINT_EVERY):
            batch = df_to_process.iloc[start:start + CHECKPOINT_EVERY]

            batch_scores = []
            for address, lat, lon in zip(batch['address'], batch['latitude'], batch['longitude']):
                batch_scores.append(get_all_scores(address, lat, lon))
                progress.update(1)

            # Force float64 so the dtype is the same for every batch; a batch
            # in which every lookup returned None would otherwise produce
            # object columns.
            score_frames.append(
                pd.DataFrame(batch_scores, index=batch.index, columns=SCORE_COLUMNS).astype('float64')
            )

            scores_combined_df = pd.concat(score_frames)

            # Write to a temporary file and swap it in, so an interruption
            # during the write cannot leave a truncated checkpoint behind.
            tmp_checkpoint_path = checkpoint_path + '.tmp'
            scores_combined_df.to_parquet(tmp_checkpoint_path)
            os.replace(tmp_checkpoint_path, checkpoint_path)

    print("All processing complete. Final checkpoint saved.")
else:
    print("All properties already processed. Loading from checkpoint.")
    scores_combined_df = pd.read_parquet(checkpoint_path)


--- Starting Walk Score enrichment for 6505 properties. ---


Getting All Scores:  65%|███████████████████████████████████████████████▌                         | 4236/6505 [14:45<59:52,  1.58s/it]

An error occurred for address '400 EAST 67': HTTPConnectionPool(host='api.walkscore.com', port=80): Read timed out. (read timeout=10)


Getting All Scores: 100%|█████████████████████████████████████████████████████████████████████████| 6505/6505 [28:26<00:00,  3.81it/s]

All processing complete. Final checkpoint saved.





In [16]:
# --- 6. JOIN AND SAVE THE ENRICHED DATASET ---
# Index-aligned join: each row of df gains the score columns whose index
# label matches.
df_enriched = df.join(scores_combined_df)

n_rows = df_enriched.shape[0]
print(f"\nWalk Score analysis complete. Saving {n_rows} rows to {output_path}")
df_enriched.to_parquet(output_path)
print("File saved successfully!")

# --- 7. FINAL INSPECTION ---
score_columns = ['walk_score', 'transit_score', 'bike_score']
preview_columns = ['address', 'subway_distance_meters'] + score_columns

print("\n--- Final DataFrame Head with New Walk Score Features ---")
display(df_enriched[preview_columns].head())

# Count how many properties are missing each of the new scores.
print("\nNull count for new columns:")
print(df_enriched[score_columns].isna().sum())


Walk Score analysis complete. Saving 6505 rows to ../data/processed/manhattan_sales_walkscore.parquet
File saved successfully!

--- Final DataFrame Head with New Walk Score Features ---


Unnamed: 0,address,subway_distance_meters,walk_score,transit_score,bike_score
7664,743 EAST 6TH STREET,1202,96.0,100.0,98.0
7667,263 EAST 7TH STREET,1106,96.0,100.0,98.0
7801,483 WEST 22ND STREET,561,98.0,100.0,95.0
7802,218 WEST 15TH STREET,160,100.0,100.0,95.0
7803,253 WEST 18TH STREET,198,100.0,100.0,95.0



Null count for new columns:
walk_score       1
transit_score    1
bike_score       7
dtype: int64
