# In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import numpy as np
import re

# Geocoder configuration, named so the tuning knobs are visible in one place.
_USER_AGENT = "mi_proyecto_inmobiliario_chile_v5_final"
_REQUEST_TIMEOUT_S = 30
_MIN_DELAY_BETWEEN_CALLS_S = 1.5

# Nominatim client identified by the project's user agent.
geolocator = Nominatim(timeout=_REQUEST_TIMEOUT_S, user_agent=_USER_AGENT)

# Throttled entry point used for every lookup: consecutive calls are spaced
# at least `_MIN_DELAY_BETWEEN_CALLS_S` seconds apart.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=_MIN_DELAY_BETWEEN_CALLS_S,
)

# Load the processed property listings.
df = pd.read_csv('../Data/Procesados/data_propiedades.csv')

# Make sure BOTH coordinate columns exist before geocoding. `DataFrame.update`
# only writes into columns the target frame already has, so a missing
# 'longitud' column would cause every geocoded longitude to be dropped
# silently when the results are merged back into `df`.
for coord_col in ('latitud', 'longitud'):
    if coord_col not in df.columns:
        df[coord_col] = np.nan

# Rows without a latitude are the ones that still have to be geocoded. Work on
# a copy so helper columns added later never leak into `df`.
df_missing = df[df['latitud'].isna()].copy()
if df_missing.empty:
    print("¡Proceso terminado! Todas las direcciones fueron geocodificadas con éxito.")
    # `exit()` is a convenience injected by the `site` module for interactive
    # sessions; raising SystemExit stops the script the same way and is
    # always available.
    raise SystemExit

def clean_address_v5(address):
    """Reduce a raw listing address to a ``"<street>, <comuna>, Chile"`` query.

    The cleaning steps are:

    1. Drop everything from a ``", Región Metropolitana"`` / ``", RM"`` suffix
       onwards (case-insensitive).
    2. Remove comma-separated 7-8 digit numbers.
    3. Keep the first comma-separated segment as the street, stripping the
       street number and anything after it, and the last segment as the comuna.

    Args:
        address: Raw address value (a scalar). Missing values (``None``, NaN,
            ``pd.NA``), the literal string ``"nan"`` and blank strings are
            treated as "no address".

    Returns:
        The cleaned query string, or ``np.nan`` when there is no usable
        address text.
    """
    if pd.isna(address):
        return np.nan

    address = str(address).strip()
    if not address or address.lower() == 'nan':
        return np.nan

    cleaned = re.sub(r',\s*(Región Metropolitana|RM)\s*.*', '', address, flags=re.IGNORECASE)
    cleaned = re.sub(r',\s*\d{7,8}\s*', ', ', cleaned)

    # Discard empty segments: the substitutions above can leave ", ," or a
    # trailing ", " behind, which would otherwise yield queries such as
    # "Los Leones, , Chile" or ", Chile".
    parts = [segment for segment in (p.strip() for p in cleaned.split(',')) if segment]
    if not parts:
        return np.nan

    # Street name without its number (and anything following the number).
    calle_sin_numero = re.sub(r'\s+\d+.*', '', parts[0]).strip()
    comuna = parts[-1] if len(parts) > 1 else calle_sin_numero

    # Avoid repeating the same name twice when street and comuna coincide.
    if calle_sin_numero.lower() == comuna.lower():
        cleaned_final = f"{calle_sin_numero}, Chile"
    else:
        cleaned_final = f"{calle_sin_numero}, {comuna}, Chile"

    # Collapse every run of whitespace, not just a single double space.
    return ' '.join(cleaned_final.split())

def get_lat_lon(cleaned_address):
    """Geocode a cleaned address and return its coordinates.

    Args:
        cleaned_address: Query passed to the module-level ``geocode``
            callable. ``None``, NaN and blank strings are never sent to the
            geocoder.

    Returns:
        ``(latitude, longitude)`` read from the geocoder result, or
        ``(np.nan, np.nan)`` when the query is missing/blank, the geocoder
        returns no result, or the lookup raises.
    """
    not_found = (np.nan, np.nan)

    # Skip queries that carry no address: each call to the geocoder costs a
    # network request, so there is no point in asking for "nothing".
    if cleaned_address is None:
        return not_found
    if isinstance(cleaned_address, float) and np.isnan(cleaned_address):
        return not_found
    if isinstance(cleaned_address, str) and not cleaned_address.strip():
        return not_found

    try:
        location = geocode(cleaned_address)
    except Exception as e:
        # Best effort: one failing lookup must not abort the whole batch, but
        # leave a trace so failures can be told apart from "no match".
        print(f"No se pudo geocodificar {cleaned_address!r}: {e}")
        return not_found

    if location:
        return location.latitude, location.longitude
    return not_found


# Build the geocoder query for every row that still lacks coordinates.
df_missing['direccion_limpia'] = df_missing['direccion'].map(clean_address_v5)

# Look up each query in turn and store the (latitude, longitude) pairs in two
# temporary columns aligned on the rows' original index.
coord_pairs = [get_lat_lon(query) for query in df_missing['direccion_limpia']]
df_missing[['latitud_nueva', 'longitud_nueva']] = pd.DataFrame(
    coord_pairs,
    index=df_missing.index,
    columns=['latitud_nueva', 'longitud_nueva'],
)


# Copy the freshly geocoded coordinates back into the full table. `update`
# aligns on the index and skips NaN values, so rows whose lookup failed keep
# whatever they had before.
new_coords = df_missing[['latitud_nueva', 'longitud_nueva']].rename(columns={
    'latitud_nueva': 'latitud',
    'longitud_nueva': 'longitud'})
df.update(new_coords)

# The helper column is not part of the output; drop it if it is present.
df = df.drop(columns=['direccion_limpia'], errors='ignore')

# Write next to the input file. The path needs the separator after '..';
# without it pandas looks for a directory literally named '..Data'.
output_filename = '../Data/Procesados/data_propiedades_loc.csv'
df.to_csv(output_filename, index=False)

# Number of rows that have a latitude after this run.
total_geocoded = df['latitud'].count()
