In [5]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib

# 1. Cargar datos
data = pd.read_csv('zonaprop_propiedades.csv')


In [6]:
# 2. Procesar tipo de vivienda
def obtener_clase(valor):
    if pd.isnull(valor):
        return None
    return valor.split('·')[0].strip()
data['vivienda'] = data['title'].apply(obtener_clase)


In [7]:
# 3. Procesar precio
ARS_TO_USD = 1 / 1200 # Tasa de conversión aproximada, ajustar según sea necesario
def convertir_precio(valor):
    if pd.isnull(valor):
        return None
    try:
        partes = valor.split()
        moneda = partes[0]
        numero_str = partes[1].replace('.', '').replace(',', '')
        numero = int(numero_str)
        if moneda == 'USD':
            return numero
        elif moneda == 'ARS':
            return int(numero * ARS_TO_USD)
        else:
            return None
    except:
        return None
data['price'] = data['rent_price'].apply(convertir_precio)


In [8]:
import time
from geopy.geocoders import OpenCage

api_key = "2da76b0a220c4d239c9c251fc20bab83"
geolocator = OpenCage(api_key)

def geocodear(direccion):
    try:
        ubicacion = geolocator.geocode(direccion + ', Argentina')
        if ubicacion:
            print(f'{ubicacion.latitude}, {ubicacion.longitude}')
            return ubicacion.latitude, ubicacion.longitude
        else:
            return None, None
    except Exception as e:
        print(f"Error geocodificando la dirección {direccion}: {e}")
        return None, None

data[['latitud', 'longitud']] = data['location'].apply(lambda x: pd.Series(geocodear(x)))
time.sleep(0.25)  # Esperar para evitar exceder el límite de solicitudes
data[['location', 'latitud', 'longitud']].head()

data_old = data.copy()

-34.4003895, -58.653931
-34.4925042, -58.5289983
-34.5900661, -58.4229274
-34.34905, -58.79308
-34.0, -64.0
-34.4842308, -58.4850135
-34.6127624, -58.3922796
-35.67502, -58.43456
-31.4135, -64.18105
-34.54969, -58.48572
-34.583527, -58.4294069
-34.50936, -58.519579
-34.54955, -58.46684
-34.42603, -58.57962
-32.94682, -60.63932
-31.61689, -61.93256
-34.5582702, -58.4601674
-31.389976, -64.2576766
-31.4135, -64.18105
-31.61689, -61.93256
-34.5857544, -58.4936798
-34.0, -64.0
-34.41427, -58.65171
-34.600734, -58.570615
-34.0, -64.0
-34.6418643, -58.668896
-34.0, -64.0
-34.5627, -58.45829
-34.58856, -58.43053
-38.0114306, -57.5554303
-34.6042823, -58.4469588
-31.4135, -64.18105
-34.5782395, -58.4255992
-32.94682, -60.63932
-31.38945, -64.17566
-31.4135, -64.18105
-34.0, -64.0
-34.62264, -58.44104
-34.60001, -58.44735
-34.5900661, -58.4229274
-31.4492883, -64.212081
-34.5656222, -58.4466359
-34.5565347, -58.4711526
-31.61689, -61.93256
-34.60064, -58.51231
-34.37079, -58.86977
-34.6614447, 

In [9]:
# 4. Procesar expensas
def convertir_expensas(valor):
    if pd.isnull(valor):
        return None
    if "No disponible" in valor:
        return None
    try:
        numero_str = valor.replace('Expensas $', '').strip().replace('.', '').replace(',', '')
        numero = int(numero_str)
        return numero
    except:
        return None
data['expenses'] = data['expenses_price'].apply(convertir_expensas)


In [10]:
# 5. Extraer variables numéricas
def extraer_numero_regex(valor):
    if pd.isnull(valor):
        return None
    try:
        match = re.search(r'(\d+(?:[.,]\d+)?)', valor)
        if match:
            numero_str = match.group(1).replace(',', '.')
            if '.' in numero_str:
                return float(numero_str)
            else:
                return int(numero_str)
        else:
            return None
    except:
        return None

columnas_mapeo = {
    'icon-stotal': 'm2_totales',
    'icon-scubierta': 'm2_cubiertos',
    'icon-ambiente': 'ambientes',
    'icon-bano': 'baños',
    'icon-cochera': 'cocheras',
    'icon-dormitorio': 'dormitorios'
}
for col_original, col_nueva in columnas_mapeo.items():
    data[col_nueva] = data[col_original].apply(extraer_numero_regex)


In [11]:
# 6. Procesar antigüedad
def convertir_antiguedad(valor):
    if pd.isnull(valor):
        return None
    if 'A estrenar' in valor:
        return 0
    try:
        match = re.search(r'\d+', valor)
        if match:
            return int(match.group(0))
        else:
            return None
    except:
        return None
data['antiguedad'] = data['icon-antiguedad'].apply(convertir_antiguedad)


In [12]:
# 7. Procesar features generales
data['general_features'] = data['general_features'].fillna('').astype(str)
def extract_plants(text):
    match = re.search(r'Cantidad plantas\s*:\s*(\d+|5 o más)', text)
    if match:
        if match.group(1) == '5 o más':
            return 5
        return int(match.group(1))
    return 1
def has_pool(text):
    return 'sí' if 'Pileta' in text else 'no'
def is_credit_compatible(text):
    return 'sí' if 'Apto profesional' in text else 'no'
data['Cantidad_plantas'] = data['general_features'].apply(extract_plants)
data['Pileta'] = data['general_features'].apply(has_pool)
data['Apto_credito'] = data['general_features'].apply(is_credit_compatible)


In [13]:
# 8. Seleccionar columnas relevantes

columns_to_keep = [
    'Cantidad_plantas', 'Pileta', 'Apto_credito', 'antiguedad', 'dormitorios',
    'cocheras', 'baños', 'ambientes', 'm2_totales', 'm2_cubiertos', 'expenses',
    'price', 'vivienda','latitud', 'longitud'
]
data = data[columns_to_keep]


In [14]:
# 9. Eliminar outliers extremos (3*IQR)
def get_extreme_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers
columns_numeric = ['price', 'm2_totales', 'm2_cubiertos', 'dormitorios', 'baños', 'antiguedad']
extreme_outliers = pd.DataFrame()
for column in columns_numeric:
    outliers = get_extreme_outliers(data, column)
    extreme_outliers = pd.concat([extreme_outliers, outliers])
extreme_outliers = extreme_outliers.drop_duplicates()
data = data.drop(index=extreme_outliers.index)


In [15]:
# 10. Eliminar duplicados
data = data.drop_duplicates()


In [16]:
# 11. Eliminar filas con "No disponible" en vivienda
data = data[~data["vivienda"].str.strip().str.lower().eq("no disponible")]


In [17]:
# 12. Imputar valores nulos
columnas_media_round = ['antiguedad', 'dormitorios', 'baños', 'ambientes']
columnas_media = ['m2_totales', 'm2_cubiertos','expenses','latitud', 'longitud']

mask_casas = data['vivienda'].str.contains('Casa', case=False, na=False)
data.loc[mask_casas, 'expenses'] = data.loc[mask_casas, 'expenses'].fillna(0)
for columna in columnas_media_round:
    data[columna] = data.groupby('vivienda')[columna].transform(lambda x: x.fillna(round(x.mean(), 0)))
for columna in columnas_media:
    data[columna] = data.groupby('vivienda')[columna].transform(lambda x: x.fillna(x.mean()))
data['cocheras'] = data['cocheras'].fillna(0)
data = data.dropna(subset=['price'])


In [18]:
# 13. Estandarizar variables numéricas, menos precio porque es la variable objetivo y es logarítmica
columnas_salida = ['Cantidad_plantas', 'antiguedad', 'dormitorios', 'cocheras', 'baños',
                   'ambientes', 'm2_totales', 'm2_cubiertos', 'expenses','latitud', 'longitud']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[columnas_salida])
df_scaled = pd.DataFrame(scaled_values, columns=columnas_salida, index=data.index)
data[columnas_salida] = df_scaled

#Logaritmo en la variable objetivo 'price'
data['price'] = np.log1p(data['price'])


In [22]:
#13.5 Clusterización de latitud y longitud
import folium
from sklearn.cluster import DBSCAN

X = data[['latitud', 'longitud']].values

dbscan = DBSCAN(eps=0.2, min_samples=20)
clusters = dbscan.fit_predict(X)

data['cluster'] = clusters

n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
n_noise = list(clusters).count(-1)

print(f"\n🎯 RESULTADOS DBSCAN:")
print(f"Clusters encontrados: {n_clusters}")
print(f"Puntos en clusters: {len(data) - n_noise}")
print(f"Outliers (ruido): {n_noise}")


🎯 RESULTADOS DBSCAN:
Clusters encontrados: 4
Puntos en clusters: 789
Outliers (ruido): 20


In [24]:
# 14. Codificar variables categóricas
columnas_categoricas = ["Pileta","Apto_credito","vivienda","cluster"]
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop="first")
encoded_data = onehot_encoder.fit_transform(data[columnas_categoricas])
feature_names = onehot_encoder.get_feature_names_out(columnas_categoricas)
encoded_df = pd.DataFrame(encoded_data, columns=feature_names, index=data.index)
data = data.drop(columns=columnas_categoricas)
data = pd.concat([data, encoded_df], axis=1)


In [25]:
data

Unnamed: 0,Cantidad_plantas,antiguedad,dormitorios,cocheras,baños,ambientes,m2_totales,m2_cubiertos,expenses,price,latitud,longitud,Pileta_sí,Apto_credito_sí,vivienda_Departamento,vivienda_PH,cluster_0,cluster_1,cluster_2,cluster_3
0,-0.117407,-0.135918,1.289863,-0.050272,1.121699,0.792867,0.516783,1.969679,-0.715738,13.287880,-0.089146,0.186911,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.117407,1.071301,0.331645,-0.050272,-1.003487,0.143647,-0.785713,-0.728626,0.411538,11.931642,-0.105846,0.211397,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,8.517362,-0.890430,0.331645,-0.050272,1.121699,0.143647,3.371502,0.932830,1.936677,12.847929,-0.079839,0.159640,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,-0.117407,-0.940731,2.248080,0.309643,5.372073,2.091308,1.212101,3.556183,-0.715738,13.038767,-0.016557,-0.860854,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,-0.117407,1.574310,0.331645,-0.050272,0.059106,0.143647,-0.462538,0.145825,0.942021,12.100718,0.452367,-0.896337,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1169,-0.117407,0.316789,0.331645,-0.410188,0.059106,0.792867,0.301332,0.220778,-0.715738,11.849405,-0.129598,0.176348,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1170,-0.117407,0.568293,0.331645,0.309643,-1.003487,0.143647,0.448230,0.220778,-0.715738,11.775297,-0.016557,-0.860854,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1171,-0.117407,0.065285,0.331645,0.309643,0.059106,0.792867,0.448230,-0.528752,-0.715738,11.849405,-0.016557,-0.860854,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1172,-0.117407,-0.739528,0.331645,0.309643,-1.003487,-1.154794,0.203400,-0.528752,-0.715738,11.759793,-0.135455,0.186043,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [26]:
# 15. Guardar el dataset final y los objetos de transformación
data.to_csv("dataset_geo.csv", index=False)
joblib.dump(scaler, 'models/standardscaler.joblib')
joblib.dump(onehot_encoder, 'models/onehotencoder.joblib')
joblib.dump(dbscan, 'models/dbscan.joblib')
print("Preprocesamiento finalizado y archivos guardados.")


Preprocesamiento finalizado y archivos guardados.
