# Carga de datos

In [1]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

# Ruta del archivo
file_path = '/content/drive/My Drive/ar_properties.csv'

# Cargo el archivo en un DataFrame
ar_properties_df = pd.read_csv(file_path)

Mounted at /content/drive


In [2]:
ar_properties_df.head()

Unnamed: 0,id,ad_type,start_date,end_date,created_on,lat,lon,l1,l2,l3,...,bedrooms,bathrooms,surface_total,surface_covered,price,currency,price_period,title,property_type,operation_type
0,S0we3z3V2JpHUJreqQ2t/w==,Propiedad,2019-04-14,2019-06-14,2019-04-14,-34.943312,-54.929656,Uruguay,Maldonado,Punta del Este,...,,1.0,45.0,40.0,13000.0,UYU,Mensual,Departamento - Roosevelt,Departamento,Alquiler
1,kMxcmAS8NvrynGBVbMOEaQ==,Propiedad,2019-04-14,2019-04-16,2019-04-14,-34.63181,-58.420599,Argentina,Capital Federal,Boedo,...,,,,,0.0,,Mensual,PH - Boedo,PH,Venta
2,Ce3ojF+ZTOkB8d+LI9dpxg==,Propiedad,2019-04-14,9999-12-31,2019-04-14,,,Argentina,Bs.As. G.B.A. Zona Norte,,...,,1.0,200.0,,,,,Ituzaingo 1100 - $ 1 - Casa Alquiler,Casa,Alquiler
3,AUGpj3raGmOCiulSMGIBPA==,Propiedad,2019-04-14,9999-12-31,2019-04-14,-34.654705,-58.790894,Argentina,Bs.As. G.B.A. Zona Oeste,Moreno,...,,2.0,460.0,100.0,,,Mensual,Dr. Vera 300 - Consulte precio - Casa en Venta,Casa,Venta
4,m+MwZmJl3OoxmfWcB//sBA==,Propiedad,2019-04-14,2019-07-09,2019-04-14,-34.654949,-58.787117,Argentina,Bs.As. G.B.A. Zona Oeste,Moreno,...,,3.0,660.0,148.0,,,Mensual,L. N. Alem 2400 - Consulte precio - Casa en ...,Casa,Venta


# 1- Filtre únicamente las propiedades en Argentina.


In [3]:
# Creo un nuevo DataFrame solo con las propiedades de Argentina
df_argentina = ar_properties_df[ar_properties_df['l1'] == 'Argentina']


In [4]:
#df_argentina.head()

# Análisis exploratorio

In [5]:
df_argentina.shape

(374977, 24)

In [6]:
df_argentina.info()

<class 'pandas.core.frame.DataFrame'>
Index: 374977 entries, 1 to 388890
Data columns (total 24 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               374977 non-null  object 
 1   ad_type          374977 non-null  object 
 2   start_date       374977 non-null  object 
 3   end_date         374977 non-null  object 
 4   created_on       374977 non-null  object 
 5   lat              325591 non-null  float64
 6   lon              325591 non-null  float64
 7   l1               374977 non-null  object 
 8   l2               374977 non-null  object 
 9   l3               368952 non-null  object 
 10  l4               115268 non-null  object 
 11  l5               2406 non-null    object 
 12  l6               0 non-null       float64
 13  rooms            235830 non-null  float64
 14  bedrooms         152764 non-null  float64
 15  bathrooms        282784 non-null  float64
 16  surface_total    302852 non-null  float64
 

In [7]:
#Convierto las columnas de fechas a objetos de tipo datetime y elimino timezone
def a_fecha(dataframe, columna):
  dataframe.loc[:, columna] = pd.to_datetime(dataframe[columna], errors='coerce')

a_fecha(df_argentina,'start_date')
a_fecha(df_argentina,'end_date')

In [8]:
#Me quedo con las columnas nùmericas y el tag de salida para aplicar el modelo
columnas_num= ['lat', 'lon', 'l6', 'rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered', 'price', 'operation_type']
df_modelo = df_argentina[columnas_num]
df_modelo.head()

Unnamed: 0,lat,lon,l6,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,operation_type
1,-34.63181,-58.420599,,,,,,,0.0,Venta
2,,,,2.0,,1.0,200.0,,,Alquiler
3,-34.654705,-58.790894,,2.0,,2.0,460.0,100.0,,Venta
4,-34.654949,-58.787117,,2.0,,3.0,660.0,148.0,,Venta
5,-32.935473,-60.683979,,4.0,,1.0,,89.0,,Venta


In [9]:
df_modelo.isnull().sum()

lat                 49386
lon                 49386
l6                 374977
rooms              139147
bedrooms           222213
bathrooms           92193
surface_total       72125
surface_covered     94755
price               20529
operation_type          0
dtype: int64

In [10]:
#Elimino aquellas columnas que tienen mas del 50% de sus valores nulos
thresh = df_modelo.shape[0] * 0.5
df_modelo = df_modelo.dropna(axis=1, thresh=thresh)
df_modelo.head()

Unnamed: 0,lat,lon,rooms,bathrooms,surface_total,surface_covered,price,operation_type
1,-34.63181,-58.420599,,,,,0.0,Venta
2,,,2.0,1.0,200.0,,,Alquiler
3,-34.654705,-58.790894,2.0,2.0,460.0,100.0,,Venta
4,-34.654949,-58.787117,2.0,3.0,660.0,148.0,,Venta
5,-32.935473,-60.683979,4.0,1.0,,89.0,,Venta


In [11]:
# Imputo valores nulos con la media en las columnas numéricas
for column in df_modelo.select_dtypes(include=['float64']).columns:
    df_modelo[column].fillna(df_modelo[column].mean(), inplace=True)
df_modelo.isnull().sum()

lat                0
lon                0
rooms              0
bathrooms          0
surface_total      0
surface_covered    0
price              0
operation_type     0
dtype: int64

# 2- Utilice algún modelo para clasificar las propiedades en venta y alquiler que tome como variables de entrada las características de la propiedad.


In [12]:
#Cargo librerias para aplicar regresión lineal
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

#Separo los predictores del tag de salida
X_logistica = df_modelo.drop('operation_type', axis=1)
y_logistica= df_modelo['operation_type']


In [13]:
#Separo los datos en conjuntos de entrenamiento y prueba
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_logistica, y_logistica, test_size=0.2, random_state=42)

In [14]:
# Normalizo las características
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_log)
X_test_scaled = scaler.transform(X_test_log)

In [15]:
#Entreno el modelo
model = LogisticRegression()
model.fit(X_train_scaled, y_train_log)

In [16]:
# Hacer predicciones en el conjunto de prueba
y_pred_log = model.predict(X_test_scaled)

In [17]:
# Evaluo el rendimiento del modelo
accuracy = accuracy_score(y_test_log, y_pred_log)
print("Precisión del modelo:", accuracy)

Precisión del modelo: 0.8650061336604619


In [18]:
# Imprimir el reporte de clasificación
print(classification_report(y_test_log, y_pred_log))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

         Alquiler       0.71      0.88      0.79     20334
Alquiler temporal       0.00      0.00      0.00      2928
            Venta       0.94      0.91      0.92     51734

         accuracy                           0.87     74996
        macro avg       0.55      0.60      0.57     74996
     weighted avg       0.84      0.87      0.85     74996



  _warn_prf(average, modifier, msg_start, len(result))


Este valor de precisión dice que el modelo es bastante bueno para distinguir entre propiedades en venta y en alquiler. Sin embargo, mientras es muy eficaz prediciendo propiedades en venta, tiene grandes dificultades para clasificar propiedades de alquiler temporal. Esta diferencia de rendimiento parece deberse a la disparidad entre número de instancias para cada una.

# 3- Ajuste un modelo para describir el precio en función de las características de una propiedad


In [19]:
#Preparo el dataset para realizar una regresión lineal
#Copio el df con las columnas numericas, la columna de localidades y  el tag anterior a otro dataframe
columnas_lineal= ['l2','lat', 'lon', 'rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered', 'price', 'operation_type']
df_lineal = df_argentina[columnas_lineal]
df_lineal.head()

Unnamed: 0,l2,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,operation_type
1,Capital Federal,-34.63181,-58.420599,,,,,,0.0,Venta
2,Bs.As. G.B.A. Zona Norte,,,2.0,,1.0,200.0,,,Alquiler
3,Bs.As. G.B.A. Zona Oeste,-34.654705,-58.790894,2.0,,2.0,460.0,100.0,,Venta
4,Bs.As. G.B.A. Zona Oeste,-34.654949,-58.787117,2.0,,3.0,660.0,148.0,,Venta
5,Santa Fe,-32.935473,-60.683979,4.0,,1.0,,89.0,,Venta


In [20]:
#Trnasformo las variables categóricas a numericas
df_encoded = pd.get_dummies(df_lineal, columns=['l2','operation_type'])
df_encoded.shape
df_encoded.head()

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,l2_Bs.As. G.B.A. Zona Norte,l2_Bs.As. G.B.A. Zona Oeste,...,l2_San Juan,l2_San Luis,l2_Santa Cruz,l2_Santa Fe,l2_Santiago Del Estero,l2_Tierra Del Fuego,l2_Tucumán,operation_type_Alquiler,operation_type_Alquiler temporal,operation_type_Venta
1,-34.63181,-58.420599,,,,,,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
2,,,2.0,,1.0,200.0,,,True,False,...,False,False,False,False,False,False,False,True,False,False
3,-34.654705,-58.790894,2.0,,2.0,460.0,100.0,,False,True,...,False,False,False,False,False,False,False,False,False,True
4,-34.654949,-58.787117,2.0,,3.0,660.0,148.0,,False,True,...,False,False,False,False,False,False,False,False,False,True
5,-32.935473,-60.683979,4.0,,1.0,,89.0,,False,False,...,False,False,False,True,False,False,False,False,False,True


In [21]:
#Reemplazo los valores nulos en todas las columnas por sus medias
medias = df_encoded.mean()
df_sin_nulos = df_encoded.fillna(medias)


In [22]:
#Importo librerías
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [23]:
#Separo predictores del tag de salida
X_lineal = df_sin_nulos.drop('price', axis=1)
y_lineal = df_sin_nulos['price']

In [24]:
#Separo los datos en conjuntos de entrenamiento y prueba
X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(X_lineal, y_lineal, test_size=0.2, random_state=42)

In [25]:
# Normalizo las características
scaler = StandardScaler()
X_train_scaled_lin = scaler.fit_transform(X_train_lin)
X_test_scaled_lin = scaler.transform(X_test_lin)

In [26]:
#Entreno el modelo
model = LinearRegression()
model.fit(X_train_lin, y_train_lin)

In [27]:
# Realizo predicciones en el conjunto de prueba
y_pred_lin = model.predict(X_test_lin)

In [28]:
# Evaluo el modelo
mse = mean_squared_error(y_test_lin, y_pred_lin)
r2 = r2_score(y_test_lin, y_pred_lin)

print("Error Cuadrático Medio (MSE):", mse)
print("R cuadrado:", r2)

Error Cuadrático Medio (MSE): 6897176265995.297
R cuadrado: 0.0036527426408550223


Los valores del error cuadrático medio y el R cuadrado muestran que el modelo no es bueno. Esto podrìa deberse a una baja relaciòn lineal entre las variables

In [29]:
#Observo la correlaciòn y veo que el precio tiene muy baja colilinealidad con los otros predictores
df_encoded.corr()




Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,l2_Bs.As. G.B.A. Zona Norte,l2_Bs.As. G.B.A. Zona Oeste,...,l2_San Juan,l2_San Luis,l2_Santa Cruz,l2_Santa Fe,l2_Santiago Del Estero,l2_Tierra Del Fuego,l2_Tucumán,operation_type_Alquiler,operation_type_Alquiler temporal,operation_type_Venta
lat,1.0,-0.152257,0.057904,0.027013,0.006354,0.009385,0.008657,0.020189,0.014381,-0.015345,...,0.050262,0.039756,-0.124249,0.244343,0.0428,-0.264475,0.301324,0.007997,-0.014482,-0.001406
lon,-0.152257,1.0,-0.108939,-0.023686,0.00357,-0.067799,-0.000743,-0.011263,0.1423,0.088583,...,-0.125568,-0.116378,-0.073641,-0.141154,-0.024044,-0.094635,-0.188911,0.004075,0.049049,-0.025116
rooms,0.057904,-0.108939,1.0,0.841634,0.618151,0.088419,0.053325,0.013738,0.081606,8.6e-05,...,0.033121,0.002672,0.002741,-0.007883,-0.004734,0.003393,0.017483,-0.106051,-0.104877,0.15125
bedrooms,0.027013,-0.023686,0.841634,1.0,0.668809,0.097768,0.046879,0.043487,0.110781,-0.007111,...,0.005596,0.00179,-0.000103,-0.042911,-0.00163,-0.000687,0.000313,-0.049579,-0.026539,0.057789
bathrooms,0.006354,0.00357,0.618151,0.668809,1.0,0.081148,0.061721,0.026677,0.248115,-0.044524,...,0.001576,-0.00429,0.003471,-0.085449,-0.002209,-0.009749,0.008217,-0.060126,-0.068097,0.08889
surface_total,0.009385,-0.067799,0.088419,0.097768,0.081148,1.0,0.269234,0.008166,0.00986,0.000971,...,0.018226,0.020134,0.003706,-0.00588,0.00582,-0.001388,-0.002482,-0.030504,-0.016111,0.035858
surface_covered,0.008657,-0.000743,0.053325,0.046879,0.061721,0.269234,1.0,0.006969,0.000858,-0.000606,...,0.0002,-3.8e-05,0.000749,-0.002464,1.7e-05,-0.00019,-0.000346,-0.001987,-0.002587,0.002993
price,0.020189,-0.011263,0.013738,0.043487,0.026677,0.008166,0.006969,1.0,0.001649,-0.006553,...,0.006633,-5.6e-05,-0.000129,0.013779,0.009307,0.000225,0.012345,-0.031043,-0.010341,0.034171
l2_Bs.As. G.B.A. Zona Norte,0.014381,0.1423,0.081606,0.110781,0.248115,0.00986,0.000858,0.001649,1.0,-0.14012,...,-0.018509,-0.02324,-0.009628,-0.159471,-0.006374,-0.013805,-0.039114,0.025001,-0.055502,-0.000583
l2_Bs.As. G.B.A. Zona Oeste,-0.015345,0.088583,8.6e-05,-0.007111,-0.044524,0.000971,-0.000606,-0.006553,-0.14012,1.0,...,-0.010897,-0.013682,-0.005669,-0.093886,-0.003753,-0.008127,-0.023028,0.031883,-0.05483,-0.007471
