*In this notebook, we geocode the eviction addresses we have to obtain their latitude and longitude. This lets us compute the distance between each address and our cluster centroids, and decide for each cluster whether it can be assigned to an eviction address (many clusters will not be).*

# Get longitude and latitude of eviction addresses with geocoding API

In [None]:
import pandas as pd
import requests
import time
import random
import os
from tqdm import tqdm
from dotenv import load_dotenv

# ----------------------------
# Load API Key
# ----------------------------
# Load variables from a local .env file into the environment, then read the key.
load_dotenv()
api_key = os.getenv("GOOGLE_MAPS_API_KEY")
if not api_key:
    # Fail fast: every geocoding request below needs the key.
    raise ValueError("❌ GOOGLE_MAPS_API_KEY not found in .env file")

# ----------------------------
# Load CSV
# ----------------------------
# NOTE: the whole file is loaded; no row limit is applied here.
df = pd.read_csv("../datasets/mobile_data_addresses_feb.csv")

# ----------------------------
# Geocoding function
# ----------------------------
def get_coords(address, api_key, retries=3):
    """Geocode a single address with the Google Geocoding API.

    Args:
        address: Free-form address string. Missing values (``None``, NaN or a
            blank string) are never sent to the API.
        api_key: Google Maps API key.
        retries: Maximum number of HTTP attempts for this address.

    Returns:
        ``(lat, lng)`` taken from the first result, or ``(None, None)`` when
        the address is missing, the API reports a status that retrying cannot
        fix, or every attempt fails.
    """
    # pandas loads empty CSV cells as float NaN; building a request from a
    # non-string would raise and abort the whole batch, so treat it as
    # "no result" instead.
    if not isinstance(address, str) or not address.strip():
        return None, None

    url = "https://maps.googleapis.com/maps/api/geocode/json"
    # Let requests do the URL encoding of the query string.
    params = {"address": address, "key": api_key}
    # Statuses for which repeating the identical request cannot succeed.
    terminal_statuses = {
        "ZERO_RESULTS",
        "INVALID_REQUEST",
        "REQUEST_DENIED",
        "OVER_DAILY_LIMIT",
    }

    for _ in range(retries):
        try:
            res = requests.get(url, params=params, timeout=10)
            if res.status_code != 200:
                print(f"⚠️ HTTP error {res.status_code}")
                continue

            result = res.json()
            status = result.get("status")
            if status == "OK":
                loc = result["results"][0]["geometry"]["location"]
                return loc["lat"], loc["lng"]
            if status == "OVER_QUERY_LIMIT":
                wait = 5 + random.random()
                print(f"⚠️ Rate limit hit. Waiting {wait:.1f}s...")
                time.sleep(wait)
                continue
            if status in terminal_statuses:
                # ZERO_RESULTS is an ordinary outcome; the others point at a
                # key or request problem worth surfacing.
                if status != "ZERO_RESULTS":
                    print(f"⚠️ Geocoding status {status}")
                return None, None
            # Any other status (e.g. UNKNOWN_ERROR) falls through to a retry.
        except (
            requests.RequestException,
            ValueError,
            KeyError,
            IndexError,
            TypeError,
        ) as e:
            # Exception text can contain the request URL, which includes the
            # API key; redact it before printing.
            message = str(e)
            if api_key:
                message = message.replace(str(api_key), "***")
            print(f"❌ Error: {message}")
    return None, None

# ----------------------------
# Batch geocoding with tqdm
# ----------------------------
def batch_geocode(data, address_col="x", batch_size=10, delay=1.5):
    """Geocode every address in ``data[address_col]``, pausing between batches.

    Addresses are looked up one at a time with ``get_coords`` using the
    module-level ``api_key``. After each batch of ``batch_size`` rows the
    function sleeps for ``delay`` seconds plus a random 0.3-0.7 s.

    Args:
        data: DataFrame holding the addresses.
        address_col: Name of the column containing the address strings.
        batch_size: Number of rows geocoded between two pauses.
        delay: Base pause, in seconds, after each batch.

    Returns:
        ``(latitudes, longitudes)``: two lists with one entry per row of
        ``data``, in row order.
    """
    lat_values, lng_values = [], []

    batch_starts = range(0, len(data), batch_size)
    for start in tqdm(batch_starts, desc="Geocoding"):
        chunk = data.iloc[start:start + batch_size]
        coords = [get_coords(addr, api_key) for addr in chunk[address_col]]
        lat_values.extend(lat for lat, _ in coords)
        lng_values.extend(lng for _, lng in coords)

        # Jittered pause so requests do not arrive in regular bursts.
        pause = delay + random.uniform(0.3, 0.7)
        time.sleep(pause)

    return lat_values, lng_values

# ----------------------------
# Run geocoding
# ----------------------------
# One latitude/longitude entry per row of df, in row order
# (None where get_coords could not resolve the address).
lat, lng = batch_geocode(df, address_col="x", batch_size=10, delay=1)

# ----------------------------
# Add to DataFrame and save
# ----------------------------
df["latitude"] = lat
df["longitude"] = lng

# NOTE(review): the file name and message say "sample", but every row of df
# is geocoded and written here — confirm the naming is intended.
df.to_csv("../datasets/mobile_data_addresses_feb_geocoded_sample.csv", index=False)
print("✅ Geocoded sample saved.")



# Assign each cluster to an eviction address based on longitude and latitude (within 50 meters)

In [6]:
import pandas as pd
# Load the cluster quarterly metrics table so it can be inspected below.
cluster_quarterly_metrics = pd.read_csv("../datasets/cluster_quarterly_metrics.csv")

In [7]:
import pandas as pd
# Preview the first rows to check which columns are available.
cluster_quarterly_metrics.head()

Unnamed: 0.1,Unnamed: 0,caid,quarter,cluster,total_pings,unique_days,unique_hours,zipcode,centroid_latitude,centroid_longitude,...,time_window_coverage,total_weekend_days,total_weekday_days,weekend_days,weekday_days,weekend_focus_score,dominance_score,hour_entropy,max_consecutive_hours,median_income_household_2023
0,0,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,0,32,3,3,90020,34.065744,-118.29635,...,0.666667,1.0,3.0,,3.0,,0.666667,0.900256,253,55832
1,1,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,1,9,1,1,90002,33.959281,-118.253437,...,0.333333,1.0,3.0,1.0,,,0.1875,,1,56158
2,2,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,2,7,1,1,91606,34.182663,-118.383647,...,0.333333,1.0,3.0,,1.0,,0.145833,,1,66884
3,3,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,0,106,2,12,90020,34.065744,-118.29635,...,0.666667,,3.0,,2.0,,0.946429,2.257982,87,55832
4,4,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,3,6,1,2,90020,34.065623,-118.2925,...,0.333333,,3.0,,1.0,,0.053571,0.450561,4,55832


In [8]:
"""
We have a file with the list of eviction addresses. Each cluster is assigned to its nearest eviction address if that address is close enough (within 50 meters of the cluster centroid). Many clusters will not be assigned to any eviction address.
"""
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load data
df = pd.read_csv("../datasets/cluster_quarterly_metrics.csv")
address_df = pd.read_csv("../datasets/mobile_data_addresses_feb_geocoded_sample.csv")

# Addresses without coordinates (empty latitude/longitude cells load as NaN)
# can never be matched. Worse, a NaN column in the distance matrix is what
# argmin() selects, so a single such row would stop every cluster from
# matching. Drop those rows before building the coordinate arrays; the
# positional order of address_df stays aligned with address_lat/address_lon.
n_addresses = len(address_df)
address_df = address_df.dropna(subset=["latitude", "longitude"]).reset_index(drop=True)
if len(address_df) < n_addresses:
    print(f"⚠️ Ignoring {n_addresses - len(address_df)} addresses without coordinates")

# Convert to radians (the haversine formula works on angles in radians)
cluster_lat = np.radians(df["centroid_latitude"].values)
cluster_lon = np.radians(df["centroid_longitude"].values)
address_lat = np.radians(address_df["latitude"].values)
address_lon = np.radians(address_df["longitude"].values)

# Earth radius (meters), so distances come out in meters
R = 6371000

# Result containers: one entry per cluster row of df, filled in row order
matched_flags = []
matched_addresses = []
# Haversine function
def haversine(lat1, lon1, lat2, lon2, radius=None):
    """Return the pairwise great-circle distances between two point sets.

    All coordinates must be given in radians.

    Args:
        lat1: Latitudes of the first point set (length n).
        lon1: Longitudes of the first point set (length n).
        lat2: Latitudes of the second point set (length m).
        lon2: Longitudes of the second point set (length m).
        radius: Sphere radius; the result is expressed in the same unit.
            Defaults to the module-level ``R``.

    Returns:
        An ``(n, m)`` array whose entry ``[i, j]`` is the distance between
        point ``i`` of the first set and point ``j`` of the second set.
    """
    if radius is None:
        radius = R

    # Column vectors for the first set so that broadcasting against the
    # second set produces the full (n, m) matrix.
    lat1 = np.atleast_1d(np.asarray(lat1, dtype=float))[:, np.newaxis]
    lon1 = np.atleast_1d(np.asarray(lon1, dtype=float))[:, np.newaxis]
    lat2 = np.asarray(lat2, dtype=float)
    lon2 = np.asarray(lon2, dtype=float)

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make arcsin return NaN. NaN inputs stay NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

# Chunk size
batch_size = 500  # adjust depending on memory

# Maximum centroid-to-address distance for a match, in the unit returned by
# haversine() (meters when R is given in meters).
MAX_MATCH_DISTANCE_M = 50

# Hoisted out of the loop: address labels as a plain array, so the nearest
# label of a whole chunk is fetched with a single fancy-indexing operation
# instead of one DataFrame row lookup per cluster.
address_names = address_df["x"].to_numpy(dtype=object)

# Process in chunks so that only a (batch_size x n_addresses) distance
# matrix is held in memory at any time
for i in tqdm(range(0, len(cluster_lat), batch_size), desc="Matching addresses"):
    batch_lat = cluster_lat[i:i+batch_size]
    batch_lon = cluster_lon[i:i+batch_size]

    if address_names.size == 0:
        # No address to compare against: argmin() would raise on an empty
        # axis, so mark the whole chunk as unmatched.
        matched_flags.extend([0] * len(batch_lat))
        matched_addresses.extend([None] * len(batch_lat))
        continue

    dist_matrix = haversine(batch_lat, batch_lon, address_lat, address_lon)
    closest_idx = dist_matrix.argmin(axis=1)
    closest_dist = dist_matrix[np.arange(len(batch_lat)), closest_idx]

    # NaN distances compare as False and are therefore left unmatched
    within_range = closest_dist <= MAX_MATCH_DISTANCE_M
    matched_flags.extend(within_range.astype(int).tolist())
    matched_addresses.extend(
        np.where(within_range, address_names[closest_idx], None).tolist()
    )

# Assign to DataFrame
df["matches_known_address"] = matched_flags
df["matched_address"] = matched_addresses

# Save
df.to_csv("../datasets/filtered_data_with_matched_addresses.csv", index=False)
print("✅ Saved safely with batching!")



Matching addresses: 100%|██████████| 2335/2335 [01:37<00:00, 23.98it/s]


✅ Saved safely with batching!
