In [None]:
# initial setup
try:
    # settings colab:
    import google.colab
        
except ModuleNotFoundError:    
    # settings local:
    %run "common/0_notebooks_base_setup.py"

In [1]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
from sklearn.metrics import r2_score
import sklearn.metrics as metrics
import sklearn.linear_model as linear_model
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Definimos parámetros globales para matplotlib.
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 16

In [2]:
# LECTURA DEL ARCHIVO DE DATOS
data_clean = pd.read_csv("CleanCSV/Clean08_OultiersxZona.csv")

In [3]:
# SÓLO PARA USO EN COMPARACIONES
data_clean_ORIGINAL = data_clean.copy(deep=True) 

In [4]:
data_clean.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,País,Provincia,Área,Localidad,Zona,Lugar,operation,property_type,...,baulera,piscina_house,piscina_depart,desc_banios,tipo_propiedad_Fondo de Comercio,tipo_propiedad_PH,tipo_propiedad_apartment,tipo_propiedad_house,tipo_propiedad_store,Outliers
0,0,0,Argentina,Capital Federal,Capital Federal,Mataderos,Mataderos,Mataderos,sell,PH,...,0.0,0,0,1.0,0,1,0,0,0,
1,1,1,Argentina,Buenos Aires,Zona Sur,La Plata,La Plata,La Plata,sell,apartment,...,0.0,0,0,1.0,0,0,1,0,0,
2,2,2,Argentina,Capital Federal,Capital Federal,Mataderos,Mataderos,Mataderos,sell,apartment,...,0.0,0,0,1.0,0,0,1,0,0,


In [5]:
display(data_clean.shape)
data_clean.columns

(114088, 45)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'País', 'Provincia', 'Área', 'Localidad',
       'Zona', 'Lugar', 'operation', 'property_type', 'geonames_id', 'lat',
       'lon', 'price_aprox_usd', 'surface_total_in_m2',
       'surface_covered_in_m2', 'price_usd_per_m2', 'floor', 'rooms',
       'expenses', 'url_clean', 'url_terraza', 'url_quincho', 'desc_terraza',
       'desc_quincho', 'garage', 'lavadero', 'balcon', 'parrilla', 'jardin',
       'patio', 'amenities', 'estrenar', 'gimnasio', 'subte', 'baulera',
       'piscina_house', 'piscina_depart', 'desc_banios',
       'tipo_propiedad_Fondo de Comercio', 'tipo_propiedad_PH',
       'tipo_propiedad_apartment', 'tipo_propiedad_house',
       'tipo_propiedad_store', 'Outliers'],
      dtype='object')

#### BINARIZACIÓN DE LAT-LOT (PARCELAS)

In [6]:
data_clean.lat.describe()
data_clean.lat.isnull().sum()
data_clean.lon.describe()
data_clean.lon.isnull().sum()

48510

In [7]:
data_clean.dropna(subset=['lat', 'lon'], how="any", axis=0, inplace= True)

In [8]:
data_clean.lat.describe()
data_clean.lat.isnull().sum()
data_clean.lon.describe()
data_clean.lon.isnull().sum()

0

In [9]:
data_clean.shape

(65578, 45)

In [10]:
data_clean.property_type.value_counts()

apartment            38773
house                21033
PH                    3620
store                 2059
Fondo de Comercio       93
Name: property_type, dtype: int64

In [11]:
lat_categories = pd.cut(data_clean.lat, 4000, right=False, labels=False) # labels=False implica que me devuelve el bin al que pertenece
lat_categories

0         1358
1         1342
2         1359
3         1359
4         1133
          ... 
114021    1362
114026    1363
114028    1365
114077    1365
114085    1364
Name: lat, Length: 65578, dtype: int64

In [12]:
lat_categories = lat_categories.astype("str", copy=True)
type(lat_categories[0])

str

In [13]:
lon_categories = pd.cut(data_clean.lon, 3500, right=False, labels=False) # labels=False impluca que me devuelve el bin al que pertenece
lon_categories.value_counts()

2891    1794
2749    1697
2892    1624
2756    1461
2755    1279
        ... 
2079       1
2463       1
2911       1
2526       1
0          1
Name: lon, Length: 1124, dtype: int64

In [14]:
lon_categories = lon_categories.astype("str", copy=True)
type(lon_categories[0])

str

In [15]:
lat_lon = lat_categories + "/" + lon_categories

lat_lon.value_counts()

1133/2891    1043
1133/2892     953
1374/2727     498
1132/2892     447
1365/2746     423
             ... 
1367/2648       1
1338/2837       1
1138/2852       1
1426/2541       1
1381/2697       1
Length: 5334, dtype: int64

In [16]:
lat_lon.shape

(65578,)

In [17]:
data_clean = pd.concat([data_clean, lat_lon],axis=1)

In [18]:
data_clean.columns

Index([                      'Unnamed: 0',                     'Unnamed: 0.1',
                                   'País',                        'Provincia',
                                   'Área',                        'Localidad',
                                   'Zona',                            'Lugar',
                              'operation',                    'property_type',
                            'geonames_id',                              'lat',
                                    'lon',                  'price_aprox_usd',
                    'surface_total_in_m2',            'surface_covered_in_m2',
                       'price_usd_per_m2',                            'floor',
                                  'rooms',                         'expenses',
                              'url_clean',                      'url_terraza',
                            'url_quincho',                     'desc_terraza',
                           'desc_quincho',          

In [19]:
data_clean.rename(columns={0: "lat_lon"}, inplace=True)


In [20]:
data_clean.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'País', 'Provincia', 'Área', 'Localidad',
       'Zona', 'Lugar', 'operation', 'property_type', 'geonames_id', 'lat',
       'lon', 'price_aprox_usd', 'surface_total_in_m2',
       'surface_covered_in_m2', 'price_usd_per_m2', 'floor', 'rooms',
       'expenses', 'url_clean', 'url_terraza', 'url_quincho', 'desc_terraza',
       'desc_quincho', 'garage', 'lavadero', 'balcon', 'parrilla', 'jardin',
       'patio', 'amenities', 'estrenar', 'gimnasio', 'subte', 'baulera',
       'piscina_house', 'piscina_depart', 'desc_banios',
       'tipo_propiedad_Fondo de Comercio', 'tipo_propiedad_PH',
       'tipo_propiedad_apartment', 'tipo_propiedad_house',
       'tipo_propiedad_store', 'Outliers', 'lat_lon'],
      dtype='object')

In [21]:
data_clean.drop(['Unnamed: 0', 'Unnamed: 0.1', 'operation', 'property_type', 'geonames_id', 'lat', 'lon', 'floor', 'rooms', 'expenses', 
                 'url_clean', 'url_terraza', 'url_quincho', 'desc_terraza', 'desc_quincho', 'desc_banios', 'tipo_propiedad_Fondo de Comercio',
                 'Outliers'], axis=1, inplace=True)

In [22]:
data_clean.columns

Index(['País', 'Provincia', 'Área', 'Localidad', 'Zona', 'Lugar',
       'price_aprox_usd', 'surface_total_in_m2', 'surface_covered_in_m2',
       'price_usd_per_m2', 'garage', 'lavadero', 'balcon', 'parrilla',
       'jardin', 'patio', 'amenities', 'estrenar', 'gimnasio', 'subte',
       'baulera', 'piscina_house', 'piscina_depart', 'tipo_propiedad_PH',
       'tipo_propiedad_apartment', 'tipo_propiedad_house',
       'tipo_propiedad_store', 'lat_lon'],
      dtype='object')

In [23]:
dummies_lat_lon = pd.get_dummies(data_clean['lat_lon'], drop_first=True)  # acá usa drop para generar N-1 columnas
dummies_lat_lon.shape

(65578, 5333)

In [24]:
data_clean = pd.concat([data_clean, dummies_lat_lon],axis=1)

In [25]:
data_clean.columns

Index(['País', 'Provincia', 'Área', 'Localidad', 'Zona', 'Lugar',
       'price_aprox_usd', 'surface_total_in_m2', 'surface_covered_in_m2',
       'price_usd_per_m2',
       ...
       '989/701', '989/702', '990/695', '990/700', '990/707', '990/714',
       '991/717', '993/697', '994/695', '994/734'],
      dtype='object', length=5361)

In [26]:
data_clean.isnull().sum().sum()

52667

In [27]:
dataf = data_clean.dropna(how="any", axis=0)

In [28]:
dataf.isnull().sum().sum()

0

In [29]:
X = dataf.drop(['País', 'Provincia', 'Área', 'Localidad', 'Zona', 'Lugar', 'price_aprox_usd', 'price_usd_per_m2', 'lat_lon'], axis=1)

In [30]:
X.shape

(43068, 5352)

In [31]:
y = dataf['price_usd_per_m2']

In [32]:
y.shape

(43068,)

In [33]:
print("Shape X:", X.shape)
print("Type X:", type(X))
print("Shape y:", y.shape)
print("Type y:", type(y))

Shape X: (43068, 5352)
Type X: <class 'pandas.core.frame.DataFrame'>
Shape y: (43068,)
Type y: <class 'pandas.core.series.Series'>


<hr id="Ubicaciones">

In [34]:
# importamos el modelo lineal y algunas funciones para calcular la bondad de ajuste.
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

### Primer aproximación con STATS MODEL - OLS

In [35]:
# USANDO statsmodel
# Tenemos que agregar explícitamente a una constante:
X_const = sm.add_constant(X)
X_const.head(3)

Unnamed: 0,const,surface_total_in_m2,surface_covered_in_m2,garage,lavadero,balcon,parrilla,jardin,patio,amenities,...,989/701,989/702,990/695,990/700,990/707,990/714,991/717,993/697,994/695,994/734
0,1.0,55.0,40.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,55.0,55.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,35.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


#### usando "X_const"

In [36]:
# USANDO TRAIN / TEST / SPLIT
X_train, X_test, y_train, y_test = train_test_split(X_const, y)

In [37]:
# USANDO statsmodel

model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model      
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: TEST:", r2_score(y_test, predictions2))  # PREDICT


EMC train: 162816.67354338834
EMC train: 189079.4523683028
R2:  0.7775183909820169
R2: train: 0.7775183909820169
R2: TEST: 0.7358147664837686


In [38]:
print (model.summary())

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


                            OLS Regression Results                            
Dep. Variable:       price_usd_per_m2   R-squared:                       0.778
Model:                            OLS   Adj. R-squared:                  0.752
Method:                 Least Squares   F-statistic:                     30.26
Date:                Sat, 20 Jun 2020   Prob (F-statistic):               0.00
Time:                        15:50:30   Log-Likelihood:            -2.3965e+05
No. Observations:               32301   AIC:                         4.860e+05
Df Residuals:                   28956   BIC:                         5.140e+05
Df Model:                        3344                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

#### Primer eliminación de features con STATS MODEL - OLS

In [39]:
not_feature_cols = model.pvalues[model.pvalues>0.01].index.values

In [40]:
not_feature_cols

array(['surface_covered_in_m2', 'lavadero', 'parrilla', ..., '989/699',
       '989/701', '990/707'], dtype=object)

In [41]:
X2_const = X_const.drop(not_feature_cols, axis=1)

In [42]:
X2_const.head(3)

Unnamed: 0,const,surface_total_in_m2,garage,balcon,patio,amenities,estrenar,gimnasio,subte,baulera,...,989/694,989/700,989/702,990/695,990/700,990/714,991/717,993/697,994/695,994/734
0,1.0,55.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### SEGUNDA aproximación con STATS MODEL - OLS

#### usando "X2_const"

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X2_const, y)

In [44]:
# USANDO statsmodel

model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: TEST:", r2_score(y_test, predictions2))  # PREDICT


EMC train: 187267.847338627
EMC train: 197236.16684845684
R2:  0.7427641368768956
R2: train: 0.7427641368768956
R2: TEST: 0.7287475372344738


In [45]:
model.summary()

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,price_usd_per_m2,R-squared:,0.743
Model:,OLS,Adj. R-squared:,0.738
Method:,Least Squares,F-statistic:,149.8
Date:,"Sat, 20 Jun 2020",Prob (F-statistic):,0.0
Time:,15:51:05,Log-Likelihood:,-241900.0
No. Observations:,32301,AIC:,485000.0
Df Residuals:,31689,BIC:,490200.0
Df Model:,611,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1561.3144,6.503,240.081,0.000,1548.568,1574.061
surface_total_in_m2,-2.6802,0.024,-111.195,0.000,-2.727,-2.633
garage,98.0850,5.401,18.162,0.000,87.499,108.671
balcon,73.8042,5.543,13.314,0.000,62.939,84.669
patio,-219.8709,6.650,-33.061,0.000,-232.906,-206.836
amenities,82.0981,10.299,7.972,0.000,61.912,102.284
estrenar,114.1539,8.587,13.293,0.000,97.322,130.986
gimnasio,184.8235,11.776,15.695,0.000,161.742,207.905
subte,-37.0897,11.586,-3.201,0.001,-59.800,-14.380

0,1,2,3
Omnibus:,2393.975,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5471.171
Skew:,0.471,Prob(JB):,0.0
Kurtosis:,4.782,Cond. No.,4.07e+20


In [46]:
X2_const.columns

Index(['const', 'surface_total_in_m2', 'garage', 'balcon', 'patio',
       'amenities', 'estrenar', 'gimnasio', 'subte', 'baulera',
       ...
       '989/694', '989/700', '989/702', '990/695', '990/700', '990/714',
       '991/717', '993/697', '994/695', '994/734'],
      dtype='object', length=1284)

#### SEGUNDA eliminación de features con STATS MODEL - OLS

In [47]:
not_feature_cols = model.pvalues[model.pvalues>0.01].index.values
not_feature_cols

array(['1084/2136', '1096/2699', '1114/2844', '1115/2845', '1126/2886',
       '1127/2891', '1133/2885', '1134/2882', '1135/2879', '1135/2885',
       '1181/2979', '1183/2982', '1190/2998', '1192/2996', '1194/3002',
       '1219/3030', '1221/3029', '1340/2669', '1340/2825', '1341/2738',
       '1341/2739', '1343/2741', '1343/2847', '1345/2790', '1346/2746',
       '1350/2790', '1351/2770', '1355/2774', '1356/2743', '1356/2754',
       '1357/2729', '1357/2762', '1358/2687', '1358/2689', '1360/2693',
       '1360/2714', '1361/2695', '1362/1176', '1362/2681', '1362/2688',
       '1363/2730', '1365/2706', '1365/2728', '1368/2702', '1369/2688',
       '1369/2693', '1369/2735', '1370/2711', '1372/2659', '1372/2670',
       '1372/2682', '1372/2700', '1372/2714', '1372/2720', '1372/2729',
       '1373/2680', '1373/2697', '1375/2721', '1377/2706', '1377/2712',
       '1379/2701', '1380/2700', '1473/1086', '1473/2389', '1474/2393',
       '1475/1101', '1476/1100', '1477/1088', '1481/2386', '1482

In [48]:
X3_const = X2_const.drop(not_feature_cols, axis=1)

In [49]:
X3_const.head(3)

Unnamed: 0,const,surface_total_in_m2,garage,balcon,patio,amenities,estrenar,gimnasio,subte,baulera,...,989/694,989/700,989/702,990/695,990/700,990/714,991/717,993/697,994/695,994/734
0,1.0,55.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


#### con "X3_const"

In [50]:
# USANDO statsmodel
X_train, X_test, y_train, y_test = train_test_split(X3_const, y)
model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: TEST:", r2_score(y_test, predictions2))  # PREDICT


EMC train: 190259.0568356789
EMC train: 199303.56079122197
R2:  0.738465292639384
R2: train: 0.738465292639384
R2: TEST: 0.7265122975489497


In [51]:
model.summary()

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,price_usd_per_m2,R-squared:,0.738
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,185.6
Date:,"Sat, 20 Jun 2020",Prob (F-statistic):,0.0
Time:,15:51:38,Log-Likelihood:,-242160.0
No. Observations:,32301,AIC:,485300.0
Df Residuals:,31816,BIC:,489400.0
Df Model:,484,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1555.4224,6.445,241.355,0.000,1542.791,1568.054
surface_total_in_m2,-2.6870,0.024,-111.756,0.000,-2.734,-2.640
garage,96.0144,5.412,17.740,0.000,85.406,106.623
balcon,78.9541,5.567,14.182,0.000,68.042,89.866
patio,-217.0798,6.701,-32.394,0.000,-230.215,-203.945
amenities,76.8956,10.355,7.426,0.000,56.600,97.192
estrenar,118.7453,8.579,13.841,0.000,101.929,135.561
gimnasio,190.1525,11.797,16.118,0.000,167.029,213.276
subte,-33.4864,11.563,-2.896,0.004,-56.150,-10.822

0,1,2,3
Omnibus:,2513.461,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5973.401
Skew:,0.481,Prob(JB):,0.0
Kurtosis:,4.874,Cond. No.,1.79e+20


## Eligiendo entre modelos

#### con "X3"

In [52]:
X3 = X3_const.drop(["const"], axis=1)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X3, y)

## Ridge Regression

In [54]:
# X_train, X_test, y_train, y_test
# SIEMPRE NORMALIZAMOS CUANDO HACEMOS REGULARIZACIÓN (RIGDE, LASSO, ETC..) PORQUE SINO COMO LOS BETAS DE CADA
# FEATURE ESTÁN EN ESCALAS DIFERENTE, PENALIZARÍA MAS A FEATURES EN RANGOS MUY GRANDES,
# ES DECIR, PENALIZARÍA DIFERENTE A CADA FEATURE
# ADEMÁS, AYUDA AL TIEMPO DE PROCESAMIENTO
# OJO QUE LOS BETA QUE OBTENEMOS ESTARÁN TAMBIÉN EN OTRA ESCALA, TAMBIÉN ESTARÁN NORMALIZADOS

rlm = linear_model.Ridge(alpha=0.01, normalize=True)   # EN GENERAL SIEMPRESE SE HACE NORMALIZACIÓN CON RIDGE

# Ajustamos nuevamente, esta vez con regularizacion (RIDGE)
ridge_model = rlm.fit(X_train, y_train)
predictions = ridge_model.predict(X_train)
# SUPUESTAMENTE EL MÉTODO PREDICT TOMA LOS X Y LOS NORMALIZA ANTES DE PREDECIR, SI ASÍ LO SETIE EN EL MODELO.
# SINO NO FUNCIONARÍA, O SI YO QUISIERA ARMAR EL MODELO CON LOS BETAS Y CORFICIENTES YO DEBERÍA NORMALIZAR LAS X

print ("r^2 Train:", ridge_model.score(X_train, y_train))
print ("r^2 TEST:", ridge_model.score(X_test, y_test))
# EL R2 DE ENTRENAMIENTO SE MANTUVO CON RESPECTO A LA REGRESIÓN LINAL SIN AJUSTE (era 0.73)

r^2 Train: 0.7407368867794366
r^2 TEST: 0.7186578792126608


In [55]:
ridge_model.alpha

0.01

## Lasso



In [56]:
# X_train, X_test, y_train, y_test

lasso = linear_model.Lasso(alpha=0.01, normalize=True) # SIEMPRE HACER NORMALIZACIÓN CON LASSO

# Ajustamos nuevamente, esta vez con regularizacion (LASSO)
lasso_model =lasso.fit(X_train, y_train)

print ("r^2 Train:", lasso_model.score(X_train, y_train))
print ("r^2 TEST:", lasso_model.score(X_test, y_test))


# EL R2 DE ENTRENAMIENTO ES MEJOR QUE RIDGE PERO BAJÓ CON RESPECTO A LA REGRESIÓN LINAL SIN AJUSTE


r^2 Train: 0.7356652283285716
r^2 TEST: 0.7131506661087583


In [57]:
lasso_model.alpha

0.01

### AGREGANDO COLUMNAS CUADRÁTICAS:

In [58]:
X3.shape

(43068, 1087)

In [59]:
X3.columns

Index(['surface_total_in_m2', 'garage', 'balcon', 'patio', 'amenities',
       'estrenar', 'gimnasio', 'subte', 'baulera', 'piscina_house',
       ...
       '989/694', '989/700', '989/702', '990/695', '990/700', '990/714',
       '991/717', '993/697', '994/695', '994/734'],
      dtype='object', length=1087)

In [60]:
sup_x_sup = X3.surface_total_in_m2 * X3.surface_total_in_m2
sup_x_sup.name

'surface_total_in_m2'

In [61]:
sup_x_sup.rename("sup_x_sup", axis=1, inplace=True)
sup_x_sup.name

'sup_x_sup'

In [62]:
X3_cuad = pd.concat([X3, sup_x_sup],axis=1)
X3_cuad.shape

(43068, 1088)

In [63]:
X3_cuad.columns

Index(['surface_total_in_m2', 'garage', 'balcon', 'patio', 'amenities',
       'estrenar', 'gimnasio', 'subte', 'baulera', 'piscina_house',
       ...
       '989/700', '989/702', '990/695', '990/700', '990/714', '991/717',
       '993/697', '994/695', '994/734', 'sup_x_sup'],
      dtype='object', length=1088)

#### con "X3_cuad"

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X3_cuad, y)

### Ridge Regression

In [65]:
# X_train, X_test, y_train, y_test
# SIEMPRE NORMALIZAMOS CUANDO HACEMOS REGULARIZACIÓN (RIGDE, LASSO, ETC..) PORQUE SINO COMO LOS BETAS DE CADA
# FEATURE ESTÁN EN ESCALAS DIFERENTE, PENALIZARÍA MAS A FEATURES EN RANGOS MUY GRANDES,
# ES DECIR, PENALIZARÍA DIFERENTE A CADA FEATURE
# ADEMÁS, AYUDA AL TIEMPO DE PROCESAMIENTO
# OJO QUE LOS BETA QUE OBTENEMOS ESTARÁN TAMBIÉN EN OTRA ESCALA, TAMBIÉN ESTARÁN NORMALIZADOS

rlm = linear_model.Ridge(alpha=0.01, normalize=True)   # EN GENERAL SIEMPRESE SE HACE NORMALIZACIÓN CON RIDGE

# Ajustamos nuevamente, esta vez con regularizacion (RIDGE)
ridge_model = rlm.fit(X_train, y_train)
predictions = ridge_model.predict(X_train)
# SUPUESTAMENTE EL MÉTODO PREDICT TOMA LOS X Y LOS NORMALIZA ANTES DE PREDECIR, SI ASÍ LO SETIE EN EL MODELO.
# SINO NO FUNCIONARÍA, O SI YO QUISIERA ARMAR EL MODELO CON LOS BETAS Y CORFICIENTES YO DEBERÍA NORMALIZAR LAS X

#plt.scatter(X_train.surface_total_in_m2, y_train)
#plt.title("Muestra de datos #1")
#plt.scatter(X_train.surface_total_in_m2, predictions, c='r')

print ("r^2 Train:", ridge_model.score(X_train, y_train))
print ("r^2 TEST:", ridge_model.score(X_test, y_test))
# EL R2 DE ENTRENAMIENTO SE MANTUVO CON RESPECTO A LA REGRESIÓN LINAL SIN AJUSTE (era 0.73)

r^2 Train: 0.7504772350360203
r^2 TEST: 0.7332028236743584


In [66]:
ridge_model.alpha

0.01

### Lasso



In [67]:
# X_train, X_test, y_train, y_test

lasso = linear_model.Lasso(alpha=0.01, normalize=True) # SIEMPRE HACER NORMALIZACIÓN CON LASSO

# Ajustamos nuevamente, esta vez con regularizacion (LASSO)
lasso_model =lasso.fit(X_train, y_train)
#predictions = lasso_model.predict(X2_train)

#plt.scatter(X2_train.surface_total_in_m2, y_train)
#plt.title("Muestra de datos #1")
#plt.scatter(X_train.surface_total_in_m2, predictions, c='r')

print ("r^2 Train:", lasso_model.score(X_train, y_train))
print ("r^2 TEST:", lasso_model.score(X_test, y_test))


# EL R2 DE ENTRENAMIENTO ES MEJOR QUE RIDGE PERO BAJÓ CON RESPECTO A LA REGRESIÓN LINAL SIN AJUSTE


r^2 Train: 0.7459640894681098
r^2 TEST: 0.7303984159075829


In [68]:
lasso_model.alpha

0.01

#### CORSS-VALIDATION Y REGULTARIZACIÓN  -  con "X3_cuad"

In [69]:
from sklearn.model_selection import cross_val_score, train_test_split, KFold
#from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# KFold: Provides train/test indices to split data in train/test sets. 
# Split dataset into k consecutive folds (without shuffling by default).
# OBJETO PARA CROSS-VALIDATION (K-Folds cross-validator) 
cv = KFold(3, shuffle=True)
# LO BUENO DE USAR ESTE OBJETO, ES QUE PODRÍA USARLO PARA HACER SIEMPRE LAS MISMAS PARTICIONES


### Regularización: Lasso y Ridge -  con "X3_cuad"

##### Lasso CV (Cross Validation)

In [70]:
# INSTANCIO Y FITEO EL MODELO LASSO  -- ACÁ PRUEBO SIN NORMALIZAR

model = linear_model.LassoCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean(), zero_coefs=(model.coef_ == 0).sum()))
# se muestra que eliminó ("descartó") cierta cantidad de coeficientes


{'alpha': 0.01, 'scores': array([0.7292935 , 0.73120399, 0.73998111]), 'mean_score': 0.7334928680674201, 'zero_coefs': 610}


In [71]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

0.7303984159075829

##### Ridge CV (Cross Validation)

In [72]:
# INSTANCIO Y FITEO EL MODELO RIDGE  -- ACÁ PRUEBO NORMALIZANDO
model = linear_model.RidgeCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean()))
# se muestra que eliminó ("descartó") cierta cantidad de coeficientes


{'alpha': 0.01, 'scores': array([0.74082895, 0.73084414, 0.73896272]), 'mean_score': 0.7368786009836857}


In [73]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

0.7332028236743633

### AGREGANDO COLUMNAS RELACIONALES:

In [79]:
X3_cuad.shape

(43068, 1088)

In [80]:
X3_cuad.columns[0:20]

Index(['surface_total_in_m2', 'garage', 'balcon', 'patio', 'amenities',
       'estrenar', 'gimnasio', 'subte', 'baulera', 'piscina_house',
       'piscina_depart', 'tipo_propiedad_store', '1004/734', '1066/2294',
       '1066/2295', '1068/1205', '1069/1214', '1070/2298', '1071/1227',
       '1084/2145'],
      dtype='object')

In [81]:
X3_cuad.columns[-1]

'sup_x_sup'

##### SELECCIONAR A MANO LAS QUE NO VOY A MULTIPLICAR

In [82]:
relac = X3_cuad.drop(['surface_total_in_m2', 'garage', 'balcon', 'patio', 'amenities','estrenar', 'gimnasio', 'subte', 'baulera',
                      'piscina_house', 'piscina_depart', 'tipo_propiedad_store', "sup_x_sup"],
                     axis=1)

In [83]:
relac.shape

(43068, 1075)

In [84]:
X3_cuad.surface_total_in_m2.shape

(43068,)

In [85]:
relaciones = relac.mul(X3_cuad.surface_total_in_m2, axis=0)
relaciones.shape

(43068, 1075)

In [86]:
relaciones[relaciones.columns[0]]

0         0.0
2         0.0
4         0.0
6         0.0
7         0.0
         ... 
114019    0.0
114020    0.0
114021    0.0
114026    0.0
114085    0.0
Name: 1004/734, Length: 43068, dtype: float64

In [87]:
relaciones[relaciones.columns[0]].loc[(relaciones[relaciones.columns[0]])>0]

37095    80.0
Name: 1004/734, dtype: float64

In [88]:
# PEQUEÑA VERIFICACIÓN
display(X3_cuad.surface_total_in_m2[(relaciones[relaciones.columns[0]])>0])
print("")
display(relaciones[relaciones.columns[0]].loc[(relaciones[relaciones.columns[0]])>0])  # miro cóm queda

37095    80.0
Name: surface_total_in_m2, dtype: float64




37095    80.0
Name: 1004/734, dtype: float64

In [89]:
columnas = list(range(relaciones.columns.size))
columnas[-1]

1074

In [90]:
relaciones.set_axis(columnas, axis=1, inplace=True)   # renombro las columnas nuevas (las relacionales)

In [91]:
relaciones.columns

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074],
           dtype='int64', length=1075)

In [92]:
relaciones.shape

(43068, 1075)

In [93]:
X3_cuad_relac = pd.concat([X3_cuad, relaciones],axis=1)
X3_cuad_relac.shape

(43068, 2163)

In [94]:
X3_cuad_relac.columns

Index(['surface_total_in_m2',              'garage',              'balcon',
                     'patio',           'amenities',            'estrenar',
                  'gimnasio',               'subte',             'baulera',
             'piscina_house',
       ...
                        1065,                  1066,                  1067,
                        1068,                  1069,                  1070,
                        1071,                  1072,                  1073,
                        1074],
      dtype='object', length=2163)

### AHORA PROBAMOS CON LAS NUEVAS COLUMNAS RELACIONALES

#### con "X3_cuad_relac"

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X3_cuad_relac, y)

### Regularización: Lasso y Ridge -  con "X3_cuad_relac"

#### con "X3_cuad_relac"

##### Lasso CV (Cross Validation)

In [96]:
# INSTANCIO Y FITEO EL MODELO LASSO  -- ACÁ PRUEBO SIN NORMALIZAR

model = linear_model.LassoCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean(), zero_coefs=(model.coef_ == 0).sum()))
# se muestra que eliminó ("descartó") cierta cantidad de coeficientes


{'alpha': 0.020000000000000004, 'scores': array([0.71594841, 0.72611033, 0.72656359]), 'mean_score': 0.7228741111983333, 'zero_coefs': 1577}


In [97]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

0.715832512616752

##### Ridge CV (Cross Validation)

In [98]:
# INSTANCIO Y FITEO EL MODELO RIDGE  -- ACÁ PRUEBO NORMALIZANDO
model = linear_model.RidgeCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean()))


{'alpha': 0.01, 'scores': array([0.7127136 , 0.73458752, 0.72855007]), 'mean_score': 0.7252837325265867}


In [99]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

0.7265750775227637

#### STATS MODEL con "X3_cuad_relac"

In [100]:
# USANDO statsmodel
# Tenemos que agregar explícitamente a una constante:
X3_cuad_relac_const = sm.add_constant(X3_cuad_relac)
X3_cuad_relac_const.head(3)

Unnamed: 0,const,surface_total_in_m2,garage,balcon,patio,amenities,estrenar,gimnasio,subte,baulera,...,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074
0,1.0,55.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
X3_cuad_relac_const = X3_cuad_relac_const.astype("int", copy=True)   # lo paso a int sino no me da la memoria

In [102]:
# USANDO statsmodel
X_train, X_test, y_train, y_test = train_test_split(X3_cuad_relac_const, y)
model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: test:", r2_score(y_test, predictions2))  # PREDICT


EMC train: 175812.8825685604
EMC train: 200013.90647498486
R2:  0.7575360912181817
R2: train: 0.7575360912181817
R2: test: 0.7281720208425049


##### SE OBSERVA QUE BAJÓ EL R2 TEST

In [103]:
print (model.summary())

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


                            OLS Regression Results                            
Dep. Variable:       price_usd_per_m2   R-squared:                       0.758
Model:                            OLS   Adj. R-squared:                  0.751
Method:                 Least Squares   F-statistic:                     109.9
Date:                Sat, 20 Jun 2020   Prob (F-statistic):               0.00
Time:                        15:58:29   Log-Likelihood:            -2.4089e+05
No. Observations:               32301   AIC:                         4.836e+05
Df Residuals:                   31407   BIC:                         4.911e+05
Df Model:                         893                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1684.5509 

### PROBAMOS ELIMINANDO LAS COLUMNAS algunas DUMMIES DE UBICACIÓN y algunas RELACIONALES

#### con "X4_cuad_relac_const"

In [104]:
not_feature_cols = model.pvalues[model.pvalues>0.01].index.values
not_feature_cols

array(['1004/734', '1066/2294', '1127/2890', '1131/2894', '1133/2889',
       '1189/2995', '1190/2996', '1193/3002', '1193/3003', '1194/3004',
       '1195/3000', '1195/3001', '1195/3004', '1220/3030', '1244/3022',
       '1271/2062', '1299/2807', '1339/2817', '1340/2740', '1341/1072',
       '1342/2765', '1343/2730', '1343/2738', '1343/2757', '1343/2844',
       '1344/2739', '1346/2755', '1347/2793', '1348/2775', '1348/2791',
       '1348/2792', '1348/2793', '1349/2758', '1349/2787', '1349/2794',
       '1350/2751', '1350/2793', '1351/2785', '1351/2786', '1352/2756',
       '1352/2767', '1352/2784', '1355/2706', '1355/2710', '1355/2769',
       '1356/2736', '1356/2741', '1358/2729', '1358/2738', '1358/2739',
       '1359/2679', '1359/2706', '1359/2707', '1359/2725', '1359/2744',
       '1359/2745', '1359/2753', '1360/2687', '1360/2728', '1360/2737',
       '1361/2735', '1361/2762', '1362/2675', '1362/2684', '1362/2743',
       '1363/2685', '1364/2729', '1364/2738', '1366/2681', '1366/

In [105]:
X4_cuad_relac_const = X3_cuad_relac_const.drop(not_feature_cols, axis=1)

In [106]:
X4_cuad_relac_const.head(3)

Unnamed: 0,const,surface_total_in_m2,garage,balcon,patio,amenities,estrenar,gimnasio,subte,baulera,...,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074
0,1,55,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,55,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,35,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### STATS MODEL con "X4_cuad_relac_const"

#### con "X4_cuad_relac_const"

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X4_cuad_relac_const, y)

In [108]:
# USANDO statsmodel

model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: test:", r2_score(y_test, predictions2))  # PREDICT

EMC train: 191819.9386583977
EMC train: 200849.837342674
R2:  0.7368436334990339
R2: train: 0.7368436334990339
R2: test: 0.7227235739096772


### TAMBIÉN PROBEMOS RIDGE_CV y LASSO_CV CON con "X4_cuad_relac"

In [109]:
X4_cuad_relac = X4_cuad_relac_const.drop(["const"], axis=1)

In [110]:
X4_cuad_relac.head(3)

Unnamed: 0,surface_total_in_m2,garage,balcon,patio,amenities,estrenar,gimnasio,subte,baulera,piscina_house,...,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074
0,55,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,55,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,35,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### con "X4_cuad_relac"

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X4_cuad_relac, y)

### Regularización: Lasso y Ridge -  con "X4_cuad_relac"

#### con "X4_cuad_relac"

##### Lasso CV (Cross Validation)

In [112]:
# INSTANCIO Y FITEO EL MODELO LASSO  -- ACÁ PRUEBO SIN NORMALIZAR

model = linear_model.LassoCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean(), zero_coefs=(model.coef_ == 0).sum()))
# se muestra que eliminó ("descartó") cierta cantidad de coeficientes


{'alpha': 0.01, 'scores': array([0.72303867, 0.70699083, 0.71947698]), 'mean_score': 0.7165021596160118, 'zero_coefs': 816}


In [113]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

0.7174581238364627

##### Ridge CV (Cross Validation)

In [114]:
# INSTANCIO Y FITEO EL MODELO RIDGE  -- ACÁ PRUEBO NORMALIZANDO
model = linear_model.RidgeCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean()))
# se muestra que eliminó ("descartó") cierta cantidad de coeficientes


{'alpha': 0.01, 'scores': array([0.72818993, 0.71636049, 0.70751161]), 'mean_score': 0.7173540094685483}


In [115]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

0.7216414936209771

<hr id="ToCSV">
<h2 style="color: lightblue">GRAFICOS</h2>