In [None]:
# initial setup
try:
    # settings colab:
    import google.colab
        
except ModuleNotFoundError:    
    # settings local:
    %run "common/0_notebooks_base_setup.py"

In [1]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
from sklearn.metrics import r2_score
import sklearn.metrics as metrics
import sklearn.linear_model as linear_model
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

import re
import unicodedata

%matplotlib inline

# Definimos parámetros globales para matplotlib.
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 16

In [2]:
# LECTURA DEL ARCHIVO DE DATOS
data_clean = pd.read_csv("CleanCSV/Clean08_OultiersxZona.csv")
#data_clean =  pd.read_csv('CleanCSV/Clean08_OultiersxZona_Ale_02.csv')

In [3]:
# SÓLO PARA USO EN COMPARACIONES
data_clean_ORIGINAL = data_clean.copy(deep=True) 

In [4]:
data_clean.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,País,Provincia,Área,Localidad,Zona,Lugar,operation,property_type,...,gimnasio,subte,baulera,piscina_house,piscina_depart,desc_banios,tipo_propiedad_PH,tipo_propiedad_apartment,tipo_propiedad_house,tipo_propiedad_store
0,0,0,Argentina,Capital Federal,Capital Federal,Mataderos,Mataderos,Mataderos,sell,PH,...,0.0,0.0,0.0,0,0,1.0,1,0,0,0
1,1,1,Argentina,Buenos Aires,Zona Sur,La Plata,La Plata,La Plata,sell,apartment,...,0.0,0.0,0.0,0,0,1.0,0,1,0,0
2,2,2,Argentina,Capital Federal,Capital Federal,Mataderos,Mataderos,Mataderos,sell,apartment,...,0.0,0.0,0.0,0,0,1.0,0,1,0,0


In [5]:
display(data_clean.shape)
data_clean.columns

(101993, 44)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'País', 'Provincia', 'Área', 'Localidad',
       'Zona', 'Lugar', 'operation', 'property_type', 'geonames_id', 'lat',
       'lon', 'price_aprox_usd', 'surface_total_in_m2',
       'surface_covered_in_m2', 'price_usd_per_m2', 'floor', 'rooms',
       'expenses', 'description', 'url_clean', 'url_terraza', 'url_quincho',
       'desc_terraza', 'desc_quincho', 'garage', 'lavadero', 'balcon',
       'parrilla', 'jardin', 'patio', 'amenities', 'estrenar', 'gimnasio',
       'subte', 'baulera', 'piscina_house', 'piscina_depart', 'desc_banios',
       'tipo_propiedad_PH', 'tipo_propiedad_apartment', 'tipo_propiedad_house',
       'tipo_propiedad_store'],
      dtype='object')

### 0) Primer aproximación al Modelo con lo datos disponibles sólo en Capital

#### Hacemos esto para comparar los R2 que obtendremos más adelante utilizando otros features que harán que el dataset disponible varíe

In [6]:
Xo_yo = data_clean.loc[(data_clean["Área"] == "Capital Federal")].loc[:, ['surface_total_in_m2', 'surface_covered_in_m2', 'price_usd_per_m2', 
                                                                          'garage', 'lavadero', 'balcon', 'parrilla', 'jardin', 'patio',
                                                                          'amenities', 'estrenar', 'gimnasio', 'subte', 'baulera', 
                                                                          'piscina_house', 'piscina_depart', 'tipo_propiedad_PH', 
                                                                          'tipo_propiedad_apartment', 'tipo_propiedad_house',
                                                                          'tipo_propiedad_store', "Lugar"]]
Xo_yo.isnull().sum()

surface_total_in_m2          744
surface_covered_in_m2       2141
price_usd_per_m2            3528
garage                         0
lavadero                       0
balcon                         0
parrilla                       0
jardin                         0
patio                          0
amenities                      0
estrenar                       0
gimnasio                       0
subte                          0
baulera                        0
piscina_house                  0
piscina_depart                 0
tipo_propiedad_PH              0
tipo_propiedad_apartment       0
tipo_propiedad_house           0
tipo_propiedad_store           0
Lugar                          0
dtype: int64

In [7]:
# Eliminamos con nulos
Xo_yo.dropna(how="any", axis=0, inplace= True)
Xo_yo.shape

(22732, 21)

In [9]:
Xo = Xo_yo[['surface_total_in_m2', 'surface_covered_in_m2',  
            'garage', 'lavadero', 'balcon', 'parrilla', 'jardin', 'patio',
            'amenities', 'estrenar', 'gimnasio', 'subte', 'baulera', 
            'piscina_house', 'piscina_depart', 'tipo_propiedad_PH', 
            'tipo_propiedad_apartment', 'tipo_propiedad_house',
            'tipo_propiedad_store', "Lugar"]]
Xo.shape

(22732, 20)

#### 0-a) Armamos las Dummies de "Lugar" (Para el caso de Capital federal, son los barrios/sub-barrios):

In [10]:
dummies_lugar = pd.get_dummies(Xo['Lugar'], prefix="dumm", drop_first=True)  # acá usa drop para generar N-1 columnas
dummies_lugar.shape

(22732, 61)

In [11]:
Xo = pd.concat([Xo, dummies_lugar],axis=1)
Xo.drop(['Lugar'], axis=1, inplace=True)

In [12]:
Xo.columns

Index(['surface_total_in_m2', 'surface_covered_in_m2', 'garage', 'lavadero',
       'balcon', 'parrilla', 'jardin', 'patio', 'amenities', 'estrenar',
       'gimnasio', 'subte', 'baulera', 'piscina_house', 'piscina_depart',
       'tipo_propiedad_PH', 'tipo_propiedad_apartment', 'tipo_propiedad_house',
       'tipo_propiedad_store', 'dumm_Agronomía', 'dumm_Almagro',
       'dumm_Balvanera', 'dumm_Barracas', 'dumm_Barrio Norte', 'dumm_Belgrano',
       'dumm_Boca', 'dumm_Boedo', 'dumm_Caballito', 'dumm_Capital Federal',
       'dumm_Catalinas', 'dumm_Centro / Microcentro', 'dumm_Chacarita',
       'dumm_Coghlan', 'dumm_Colegiales', 'dumm_Congreso', 'dumm_Constitución',
       'dumm_Flores', 'dumm_Floresta', 'dumm_Las Cañitas', 'dumm_Liniers',
       'dumm_Mataderos', 'dumm_Monserrat', 'dumm_Monte Castro', 'dumm_Nuñez',
       'dumm_Once', 'dumm_Palermo', 'dumm_Palermo Chico',
       'dumm_Palermo Hollywood', 'dumm_Palermo Soho', 'dumm_Palermo Viejo',
       'dumm_Parque Avellaneda', 'du

In [13]:
yo = Xo_yo[['price_usd_per_m2']]
yo.shape

(22732, 1)

In [14]:
print("Shape X:", Xo.shape)
print("Type X:", type(Xo))
print("Shape y:", yo.shape)
print("Type y:", type(yo))

Shape X: (22732, 80)
Type X: <class 'pandas.core.frame.DataFrame'>
Shape y: (22732, 1)
Type y: <class 'pandas.core.frame.DataFrame'>


#### 0-b) Usando STATS MODEL - Armamos "Xo_const":

In [15]:
# Importamos "train_test_split" y algunas funciones para calcular la bondad de ajuste.
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [16]:
# USANDO statsmodel tenemos que agregar explícitamente a una constante para que encuentre la Ordenada al Origen:
Xo_const = sm.add_constant(Xo)
Xo_const.head(3)

Unnamed: 0,const,surface_total_in_m2,surface_covered_in_m2,garage,lavadero,balcon,parrilla,jardin,patio,amenities,...,dumm_Villa Lugano,dumm_Villa Luro,dumm_Villa Ortuzar,dumm_Villa Pueyrredón,dumm_Villa Real,dumm_Villa Riachuelo,dumm_Villa Santa Rita,dumm_Villa Soldati,dumm_Villa Urquiza,dumm_Villa del Parque
0,1.0,55.0,40.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,55.0,55.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,45.0,40.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


#### 0-c) Usando STATS MODEL - con "Xo_const":

In [17]:
# USANDO TRAIN / TEST / SPLIT
X_train, X_test, y_train, y_test = train_test_split(Xo_const, yo)

In [18]:
# USANDO statsmodel

model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model      
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: TEST:", r2_score(y_test, predictions2))  # PREDICT


EMC train: 15775358.353421735
EMC train: 14660939.291077547
R2:  0.2333036270964267
R2: train: 0.23330362709642716
R2: TEST: 0.12688159620842276


In [19]:
print (model.summary())

                            OLS Regression Results                            
Dep. Variable:       price_usd_per_m2   R-squared:                       0.233
Model:                            OLS   Adj. R-squared:                  0.230
Method:                 Least Squares   F-statistic:                     64.54
Date:                Sun, 21 Jun 2020   Prob (F-statistic):               0.00
Time:                        12:37:51   Log-Likelihood:            -1.6548e+05
No. Observations:               17049   AIC:                         3.311e+05
Df Residuals:                   16968   BIC:                         3.317e+05
Df Model:                          80                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

##### Se observa que para los Datos de Capital Federal, incluyendo Dummies por Barrio obtenermos, R2 Train/Test de 0,23 / 0,12.


### AHORA VAMOS A VOLVER A TODO EL DATASET DISPONIBLE PARA TODA LA ARGENTINA:

<hr id="Ubicaciones">

### 1) Completamos info en la Colunna "ROOMS"

In [20]:
print(data_clean.rooms.isnull().sum())

55900


In [21]:
print(data_clean.rooms.isnull().sum())
regex = re.compile("\d\sambientes", flags = re.IGNORECASE)
regexRooms = data_clean[(data_clean.rooms.isnull())].description.apply(lambda x: regex.search(str(x)))
resultado = regexRooms.apply(lambda x: np.NaN if x is None else x.group(0).lower().replace(" ambientes", "").strip()).astype(np.float64)
data_clean.loc[resultado.index, "rooms"]  = resultado
print(data_clean.rooms.isnull().sum())

55900
46074


In [22]:
print(data_clean.rooms.isnull().sum())
regex = re.compile("\d\samb", flags = re.IGNORECASE)
regexRooms = data_clean[(data_clean.rooms.isnull())].description.apply(lambda x: regex.search(str(x)))
resultado = regexRooms.apply(lambda x: np.NaN if x is None else x.group(0).lower().replace(" amb", "").strip()).astype(np.float64)
data_clean.loc[resultado.index, "rooms"]  = resultado
print(data_clean.rooms.isnull().sum())

46074
43158


In [23]:
print(data_clean.rooms.isnull().sum())
regex = re.compile("1\sambiente", flags = re.IGNORECASE)
regexRooms = data_clean[(data_clean.rooms.isnull())].description.apply(lambda x: regex.search(str(x)))
resultado = regexRooms.apply(lambda x: np.NaN if x is None else x.group(0).lower().replace(" ambiente", "").strip()).astype(np.float64)
data_clean.loc[resultado.index, "rooms"]  = resultado
print(data_clean.rooms.isnull().sum())

43158
43158


In [24]:
print(data_clean.rooms.isnull().sum())
regex = re.compile("un\sambiente", flags = re.IGNORECASE)
regexRooms = data_clean[(data_clean.rooms.isnull())].description.apply(lambda x: regex.search(str(x)))
resultado = regexRooms.apply(lambda x: np.NaN if x is None else x.group(0).lower().replace("un ambiente", "1").strip()).astype(np.float64)
data_clean.loc[resultado.index, "rooms"]  = resultado
print(data_clean.rooms.isnull().sum())

43158
42668


In [25]:
data_clean = data_clean.drop(columns=["description"])

### 2) BINARIZACIÓN DE LAT-LOT (ARMADO DE PARCELAS)

In [26]:
data_clean.lat.describe()
print(data_clean.lat.isnull().sum())
data_clean.lon.describe()
data_clean.lon.isnull().sum()

43620


43620

In [27]:
# Eliminamos filas sin LAT o LOT
data_clean.dropna(subset=['lat', 'lon'], how="any", axis=0, inplace= True)

In [28]:
data_clean.lat.describe()
print(data_clean.lat.isnull().sum())
data_clean.lon.describe()
data_clean.lon.isnull().sum()

0


0

In [29]:
data_clean.shape

(58373, 43)

In [30]:
# Vemos cómo nos queda distribuídas geográficamente las muestras luego del Drop anterior
data_clean.Área.value_counts()[0:40]

Capital Federal        19902
Zona Norte              9225
Costa Atlántica         7263
Zona Sur                5010
Zona Oeste              4757
Santa Fe                4596
Córdoba                 4025
Interior                1310
Neuquén                  416
Río Negro                396
Mendoza                  303
Misiones                 296
Entre Ríos               235
Salta                    186
Tucumán                  111
San Luis                  99
La Pampa                  61
Chubut                    49
Chaco                     35
San Juan                  23
Jujuy                     23
Tierra Del Fuego          20
Santa Cruz                16
Corrientes                 8
La Rioja                   4
Catamarca                  3
Santiago Del Estero        1
Name: Área, dtype: int64

In [31]:
# Con 4000 BINS en LAT se obtienen una resolución de 0.015 grados por BIN (unas 14 cuadras)
lat_bins= pd.cut(data_clean.lat, 4000, right=False)
display(lat_bins)
lat_categories = pd.cut(data_clean.lat, 4000, right=False, labels=False) # labels=False implica que me devuelve el bin al que pertenece
lat_categories

0         [-34.668, -34.653)
1         [-34.905, -34.891)
2         [-34.653, -34.638)
3         [-34.653, -34.638)
4         [-38.007, -37.993)
                 ...        
101930    [-34.609, -34.594)
101935    [-34.594, -34.579)
101937    [-34.564, -34.549)
101985    [-34.564, -34.549)
101990    [-34.579, -34.564)
Name: lat, Length: 58373, dtype: category
Categories (4000, interval[float64]): [[-54.824, -54.809) < [-54.809, -54.794) < [-54.794, -54.779) < [-54.779, -54.765) ... [4.486, 4.501) < [4.501, 4.516) < [4.516, 4.531) < [4.531, 4.605)]

0         1358
1         1342
2         1359
3         1359
4         1133
          ... 
101930    1362
101935    1363
101937    1365
101985    1365
101990    1364
Name: lat, Length: 58373, dtype: int64

In [32]:
lat_categories = lat_categories.astype("str", copy=True)
type(lat_categories[0])

str

In [33]:
# Con 2000 BINS en LON se obtienen una resolución de 0.011 grados por BIN (unas 8 cuadras)
lon_bins= pd.cut(data_clean.lon, 2000, right=False)
display(lon_bins)
lon_categories = pd.cut(data_clean.lon, 2000, right=False, labels=False) # labels=False impluca que me devuelve el bin al que pertenece
lon_categories.value_counts()

0         [-58.517, -58.506)
1         [-57.969, -57.958)
2         [-58.528, -58.517)
3         [-58.517, -58.506)
4         [-57.552, -57.541)
                 ...        
101930    [-58.408, -58.397)
101935    [-58.408, -58.397)
101937    [-58.452, -58.441)
101985    [-58.452, -58.441)
101990    [-58.485, -58.474)
Name: lon, Length: 58373, dtype: category
Categories (2000, interval[float64]): [[-75.679, -75.668) < [-75.668, -75.657) < [-75.657, -75.646) < [-75.646, -75.635) ... [-53.777, -53.766) < [-53.766, -53.755) < [-53.755, -53.744) < [-53.744, -53.711)]

1652    3164
1571    2239
1574    2223
1568    1897
1570    1875
        ... 
1298       1
403        1
1171       1
1235       1
0          1
Name: lon, Length: 736, dtype: int64

In [34]:
lon_categories = lon_categories.astype("str", copy=True)
type(lon_categories[0])

str

In [35]:
lat_lon = lat_categories + "/" + lon_categories
lat_lon.value_counts()

1133/1652    1943
1363/1574     633
1365/1569     563
1132/1652     556
1362/1574     556
             ... 
1524/1039       1
1345/1605       1
1346/1562       1
2025/930        1
1573/1047       1
Length: 3669, dtype: int64

In [36]:
lat_lon.shape

(58373,)

In [37]:
data_clean = pd.concat([data_clean, lat_lon],axis=1)

In [38]:
data_clean.columns

Index([              'Unnamed: 0',             'Unnamed: 0.1',
                           'País',                'Provincia',
                           'Área',                'Localidad',
                           'Zona',                    'Lugar',
                      'operation',            'property_type',
                    'geonames_id',                      'lat',
                            'lon',          'price_aprox_usd',
            'surface_total_in_m2',    'surface_covered_in_m2',
               'price_usd_per_m2',                    'floor',
                          'rooms',                 'expenses',
                      'url_clean',              'url_terraza',
                    'url_quincho',             'desc_terraza',
                   'desc_quincho',                   'garage',
                       'lavadero',                   'balcon',
                       'parrilla',                   'jardin',
                          'patio',                'amen

In [39]:
# Renombramos la columna que recién creamos (por defecto la llamó 0)
data_clean.rename(columns={0: "lat_lon"}, inplace=True)

### 3) Eliminamos algunas columnas que ya no utilizaremos

In [40]:
data_clean.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'País', 'Provincia', 'Área', 'Localidad',
       'Zona', 'Lugar', 'operation', 'property_type', 'geonames_id', 'lat',
       'lon', 'price_aprox_usd', 'surface_total_in_m2',
       'surface_covered_in_m2', 'price_usd_per_m2', 'floor', 'rooms',
       'expenses', 'url_clean', 'url_terraza', 'url_quincho', 'desc_terraza',
       'desc_quincho', 'garage', 'lavadero', 'balcon', 'parrilla', 'jardin',
       'patio', 'amenities', 'estrenar', 'gimnasio', 'subte', 'baulera',
       'piscina_house', 'piscina_depart', 'desc_banios', 'tipo_propiedad_PH',
       'tipo_propiedad_apartment', 'tipo_propiedad_house',
       'tipo_propiedad_store', 'lat_lon'],
      dtype='object')

In [41]:
data_clean.drop(['Unnamed: 0', 'Unnamed: 0.1', 'operation', 'property_type', 'geonames_id', 'lat', 'lon', 
                 'floor', 'expenses', 'url_clean', 'url_terraza', 'url_quincho', 'desc_terraza', 'desc_quincho', 
                 'desc_banios'], axis=1, inplace=True)

In [42]:
data_clean.columns

Index(['País', 'Provincia', 'Área', 'Localidad', 'Zona', 'Lugar',
       'price_aprox_usd', 'surface_total_in_m2', 'surface_covered_in_m2',
       'price_usd_per_m2', 'rooms', 'garage', 'lavadero', 'balcon', 'parrilla',
       'jardin', 'patio', 'amenities', 'estrenar', 'gimnasio', 'subte',
       'baulera', 'piscina_house', 'piscina_depart', 'tipo_propiedad_PH',
       'tipo_propiedad_apartment', 'tipo_propiedad_house',
       'tipo_propiedad_store', 'lat_lon'],
      dtype='object')

### 4) Generamos DUMMIES para los pares de LAT-LON que generamos antes

In [43]:
dummies_lat_lon = pd.get_dummies(data_clean['lat_lon'], drop_first=True)  # acá usa drop para generar N-1 columnas
dummies_lat_lon.shape

(58373, 3668)

In [44]:
data_clean = pd.concat([data_clean, dummies_lat_lon],axis=1)

In [57]:
data_clean.columns

Index(['País', 'Provincia', 'Área', 'Localidad', 'Zona', 'Lugar',
       'surface_total_in_m2', 'surface_covered_in_m2', 'price_usd_per_m2',
       'rooms',
       ...
       '989/399', '989/400', '989/401', '990/397', '990/400', '990/404',
       '990/408', '991/410', '993/398', '994/419'],
      dtype='object', length=3696)

### 4) "dataf" es una dataframe intermedio donde verificamos que ya no tengamos nulos en las columnas de interés

In [58]:
data_clean.isnull().sum()[0:20]

País                         0
Provincia                    0
Área                         0
Localidad                    0
Zona                         0
Lugar                        0
surface_total_in_m2       7654
surface_covered_in_m2    10982
price_usd_per_m2         13815
rooms                    14416
garage                       0
lavadero                     0
balcon                       0
parrilla                     0
jardin                       0
patio                        0
amenities                    0
estrenar                     0
gimnasio                     0
subte                        0
dtype: int64

In [59]:
dataf = data_clean.dropna(how="any", axis=0)

In [60]:
dataf.isnull().sum().sum()

0

In [63]:
dataf.shape

(33039, 3696)

In [64]:
# Vemos nuevamente cómo nos quedan los datos disponibles distribuídos geográficamente
dataf.Área.value_counts()[0:40]

Capital Federal        14847
Zona Norte              5288
Costa Atlántica         4595
Zona Sur                2226
Zona Oeste              2109
Santa Fe                1551
Córdoba                 1418
Interior                 488
Neuquén                  146
Río Negro                133
Misiones                  70
Mendoza                   63
Salta                     20
Entre Ríos                15
San Luis                  13
Tucumán                   12
Tierra Del Fuego          11
Chaco                      9
Chubut                     9
Santa Cruz                 6
Corrientes                 3
Jujuy                      3
La Rioja                   2
Santiago Del Estero        1
San Juan                   1
Name: Área, dtype: int64

In [67]:
dataf.columns[0:30]

Index(['País', 'Provincia', 'Área', 'Localidad', 'Zona', 'Lugar',
       'surface_total_in_m2', 'surface_covered_in_m2', 'price_usd_per_m2',
       'rooms', 'garage', 'lavadero', 'balcon', 'parrilla', 'jardin', 'patio',
       'amenities', 'estrenar', 'gimnasio', 'subte', 'baulera',
       'piscina_house', 'piscina_depart', 'tipo_propiedad_PH',
       'tipo_propiedad_apartment', 'tipo_propiedad_house',
       'tipo_propiedad_store', 'lat_lon', '0/669', '1/670'],
      dtype='object')

### 5) Armamos X e y

In [68]:
X = dataf.drop(['País', 'Provincia', 'Área', 'Localidad', 'Zona', 'Lugar', 'price_usd_per_m2', 'lat_lon'], axis=1)

In [69]:
X.shape

(33039, 3688)

In [70]:
y = dataf['price_usd_per_m2']

In [71]:
y.shape

(33039,)

In [72]:
print("Shape X:", X.shape)
print("Type X:", type(X))
print("Shape y:", y.shape)
print("Type y:", type(y))

Shape X: (33039, 3688)
Type X: <class 'pandas.core.frame.DataFrame'>
Shape y: (33039,)
Type y: <class 'pandas.core.series.Series'>


<hr id="Ubicaciones">

### 6) Pimer aproximación con STATS MODEL - OLS

In [73]:
# Importamos "train_test_split" y algunas funciones para calcular la bondad de ajuste.
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [74]:
# USANDO statsmodel tenemos que agregar explícitamente a una constante para que encuentre la Ordenada al Origen:
X_const = sm.add_constant(X)
X_const.head(3)

Unnamed: 0,const,surface_total_in_m2,surface_covered_in_m2,rooms,garage,lavadero,balcon,parrilla,jardin,patio,...,989/399,989/400,989/401,990/397,990/400,990/404,990/408,991/410,993/398,994/419
0,1.0,55.0,40.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,55.0,55.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,45.0,40.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


#### 6-a) Usando "X_const":

In [75]:
# USANDO TRAIN / TEST / SPLIT
X_train, X_test, y_train, y_test = train_test_split(X_const, y)

In [76]:
# USANDO statsmodel

model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model      
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: TEST:", r2_score(y_test, predictions2))  # PREDICT


EMC train: 6900860.7008067975
EMC train: 16431905.418291258
R2:  0.6300048699466014
R2: train: 0.6300048699466014
R2: TEST: 0.2882494425590115


In [77]:
print (model.summary())

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


                            OLS Regression Results                            
Dep. Variable:       price_usd_per_m2   R-squared:                       0.630
Model:                            OLS   Adj. R-squared:                  0.597
Method:                 Least Squares   F-statistic:                     18.91
Date:                Sun, 21 Jun 2020   Prob (F-statistic):               0.00
Time:                        12:53:35   Log-Likelihood:            -2.3026e+05
No. Observations:               24779   AIC:                         4.646e+05
Df Residuals:                   22731   BIC:                         4.812e+05
Df Model:                        2047                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

##### El uso de las parcelas es muy importante para el modelo.
##### Se mostró más arriba que intentamos utilizar dummies de la columna "Lugar" en lugar de las parcelas y no obteníamos R2 mejores a 0,56 / 0,57

### Primer eliminación de features con STATS MODEL - OLS:
##### Se eliminaran Features cuyo Pvalue supere el 1% (es decir que no están afectando a la varianza ni al sesgo del modelo)

In [78]:
not_feature_cols = model.pvalues[model.pvalues>0.01].index.values

In [79]:
not_feature_cols

array(['surface_total_in_m2', 'rooms', 'lavadero', ..., '989/400',
       '989/401', '990/400'], dtype=object)

#### Armamos "X2_const":

In [80]:
X2_const = X_const.drop(not_feature_cols, axis=1)

In [81]:
X2_const.head(3)

Unnamed: 0,const,surface_covered_in_m2,garage,patio,piscina_depart,1133/1652,1332/1542,1358/1539,1360/1569,1360/1570,...,945/372,945/966,947/365,986/390,990/397,990/404,990/408,991/410,993/398,994/419
0,1.0,40.0,0.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,55.0,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,40.0,0.0,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Segunda aproximación con STATS MODEL - OLS

#### Usando "X2_const"

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X2_const, y)

In [83]:
# USANDO statsmodel

model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: TEST:", r2_score(y_test, predictions2))  # PREDICT


EMC train: 4107225.5258183083
EMC train: 26088945.156337705
R2:  0.7393337374835017
R2: train: 0.7393337374835016
R2: TEST: 0.178816000612521


In [84]:
model.summary()

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,price_usd_per_m2,R-squared:,0.739
Model:,OLS,Adj. R-squared:,0.738
Method:,Least Squares,F-statistic:,592.7
Date:,"Sun, 21 Jun 2020",Prob (F-statistic):,0.0
Time:,12:56:29,Log-Likelihood:,-223830.0
No. Observations:,24779,AIC:,447900.0
Df Residuals:,24660,BIC:,448900.0
Df Model:,118,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1860.9661,28.482,65.338,0.000,1805.140,1916.793
surface_covered_in_m2,-3.9153,0.213,-18.341,0.000,-4.334,-3.497
garage,207.5596,27.448,7.562,0.000,153.761,261.359
patio,-388.6926,34.538,-11.254,0.000,-456.390,-320.995
piscina_depart,461.3390,38.128,12.100,0.000,386.606,536.072
1133/1652,197.8080,72.627,2.724,0.006,55.454,340.162
1332/1542,8717.3405,2031.681,4.291,0.000,4735.123,1.27e+04
1358/1539,1.033e+05,407.890,253.365,0.000,1.03e+05,1.04e+05
1360/1569,393.5346,154.590,2.546,0.011,90.528,696.541

0,1,2,3
Omnibus:,63316.761,Durbin-Watson:,1.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27381642384.987
Skew:,27.407,Prob(JB):,0.0
Kurtosis:,5152.545,Cond. No.,1.96e+20


In [85]:
X2_const.columns

Index(['const', 'surface_covered_in_m2', 'garage', 'patio', 'piscina_depart',
       '1133/1652', '1332/1542', '1358/1539', '1360/1569', '1360/1570',
       ...
       '945/372', '945/966', '947/365', '986/390', '990/397', '990/404',
       '990/408', '991/410', '993/398', '994/419'],
      dtype='object', length=814)

#### Segunda eliminación de features con STATS MODEL - OLS
##### Este paso lo realizamos porque próximamente generaremos nuevas features y si la mantenemos todas ahora, luego las PC no logran alojar en memoria el procesamiento necesario


In [86]:
not_feature_cols = model.pvalues[model.pvalues>0.01].index.values
not_feature_cols

array(['1360/1569', '1361/1569', '1361/1576', '1477/1364', '1477/622',
       '1477/624', '1477/626', '1477/628', '1478/1351', '1478/1352',
       '1478/1356', '1478/621', '1478/624', '1478/625', '1478/629',
       '1478/631', '1478/632', '1478/637', '1479/1359', '1479/1365',
       '1479/618', '1479/619', '1479/622', '1479/625', '1479/626',
       '1479/629', '1480/1357', '1480/1363', '1480/621', '1480/623',
       '1480/624', '1481/1364', '1481/620', '1481/623', '1481/626',
       '1482/1302', '1482/1362', '1483/872', '1484/1362', '1485/1356',
       '1486/1361', '1487/1360', '1487/1361', '1487/1362', '1488/946',
       '1489/1359', '1489/1362', '1490/1091', '1490/1236', '1492/1356',
       '1492/1581', '1494/1184', '1494/1354', '1495/1413', '1495/1414',
       '1495/1415', '1496/1342', '1500/1351', '1504/1587', '1504/1590',
       '1504/575', '1505/1285', '1505/1589', '1506/1567', '1507/1088',
       '1507/1132', '1508/1088', '1509/1088', '1509/1131', '1509/1132',
       '1509/1133'

#### Armamos "X3_const":

In [87]:
X3_const = X2_const.drop(not_feature_cols, axis=1)

In [88]:
X3_const.head(3)

Unnamed: 0,const,surface_covered_in_m2,garage,patio,piscina_depart,1133/1652,1332/1542,1358/1539,1360/1570,1360/1571,...,944/966,945/369,945/372,945/966,947/365,986/390,990/397,990/408,993/398,994/419
0,1.0,40.0,0.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,55.0,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,40.0,0.0,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Tercer aproximación con STATS MODEL - OLS

#### Usando "X3_const"

In [92]:
# USANDO statsmodel
X_train, X_test, y_train, y_test = train_test_split(X3_const, y)
model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: TEST:", r2_score(y_test, predictions2))  # PREDICT


EMC train: 11822392.726959178
EMC train: 2392939.3031539377
R2:  0.44230109114362115
R2: train: 0.4423010911436215
R2: TEST: 0.8450701774785399


#### Observamos que el modelo mantiene el R2 de Test y Train luego de haber filtrado parámetros con Pvlues mayores al 1%

In [93]:
model.summary()

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,price_usd_per_m2,R-squared:,0.442
Model:,OLS,Adj. R-squared:,0.441
Method:,Least Squares,F-statistic:,288.2
Date:,"Sun, 21 Jun 2020",Prob (F-statistic):,0.0
Time:,12:57:00,Log-Likelihood:,-236930.0
No. Observations:,24779,AIC:,474000.0
Df Residuals:,24710,BIC:,474600.0
Df Model:,68,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1880.5468,47.644,39.471,0.000,1787.162,1973.932
surface_covered_in_m2,-4.8175,0.360,-13.377,0.000,-5.523,-4.112
garage,295.8551,46.502,6.362,0.000,204.708,387.002
patio,-292.5737,58.360,-5.013,0.000,-406.963,-178.185
piscina_depart,482.7748,64.255,7.513,0.000,356.831,608.719
1133/1652,216.3447,122.900,1.760,0.078,-24.546,457.235
1332/1542,8715.8039,3443.446,2.531,0.011,1966.444,1.55e+04
1358/1539,9.177e+04,720.060,127.446,0.000,9.04e+04,9.32e+04
1360/1570,742.2684,350.864,2.116,0.034,54.554,1429.983

0,1,2,3
Omnibus:,80127.324,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27793919744.928
Skew:,53.619,Prob(JB):,0.0
Kurtosis:,5190.354,Cond. No.,8.2e+18


### 7) Probando diferentes Modelos

#### Armamo "X3" (sólo se requiere eliminar la columna "const")

In [94]:
X3 = X3_const.drop(["const"], axis=1)

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X3, y)

#### 7-a) Ridge Regression

##### probamos varios Alpha manualmente, y dejamos el más bajo pero sin seguir diminuyéndolo luego de que no se obervan mejoras en R2

In [96]:
# X_train, X_test, y_train, y_test
# SIEMPRE NORMALIZAMOS CUANDO HACEMOS REGULARIZACIÓN (RIGDE, LASSO, ETC..) PORQUE SINO COMO LOS BETAS DE CADA
# FEATURE ESTÁN EN ESCALAS DIFERENTE, PENALIZARÍA MAS A FEATURES EN RANGOS MUY GRANDES,
# ES DECIR, PENALIZARÍA DIFERENTE A CADA FEATURE
# ADEMÁS, AYUDA AL TIEMPO DE PROCESAMIENTO
# OJO QUE LOS BETA QUE OBTENEMOS ESTARÁN TAMBIÉN EN OTRA ESCALA, TAMBIÉN ESTARÁN NORMALIZADOS

rlm = linear_model.Ridge(alpha=0.01, normalize=True)   # EN GENERAL SIEMPRESE SE HACE NORMALIZACIÓN CON RIDGE

# Ajustamos nuevamente, esta vez con regularizacion (RIDGE)
ridge_model = rlm.fit(X_train, y_train)
predictions = ridge_model.predict(X_train)
# SUPUESTAMENTE EL MÉTODO PREDICT TOMA LOS X Y LOS NORMALIZA ANTES DE PREDECIR, SI ASÍ LO SETIE EN EL MODELO.
# SINO NO FUNCIONARÍA, O SI YO QUISIERA ARMAR EL MODELO CON LOS BETAS Y CORFICIENTES YO DEBERÍA NORMALIZAR LAS X

print ("r^2 Train:", ridge_model.score(X_train, y_train))
print ("r^2 TEST:", ridge_model.score(X_test, y_test))


r^2 Train: 0.4458038498349003
r^2 TEST: 0.8091958133405134


In [97]:
ridge_model.alpha

0.01

#### 7-b) Lasso Regression

##### probamos varios Alpha manualmente, y dejamos el más bajo pero sin seguir diminuyéndolo luego de que no se obervan mejoras en R2

In [98]:
# X_train, X_test, y_train, y_test

lasso = linear_model.Lasso(alpha=0.01, normalize=True) # SIEMPRE HACER NORMALIZACIÓN CON LASSO

# Ajustamos nuevamente, esta vez con regularizacion (LASSO)
lasso_model =lasso.fit(X_train, y_train)

print ("r^2 Train:", lasso_model.score(X_train, y_train))
print ("r^2 TEST:", lasso_model.score(X_test, y_test))


r^2 Train: 0.4458369868603438
r^2 TEST: 0.8100740591408361


In [99]:
lasso_model.alpha

0.01

### 8) Vamos a crear una Feature que es el cuadrado de la feature "surface_total_in_m2"

#### veremos si mejora la respuesta del modelo

#### Armamo "X3_cuad"

In [100]:
X3.shape

(33039, 685)

In [101]:
X3.columns

Index(['surface_covered_in_m2', 'garage', 'patio', 'piscina_depart',
       '1133/1652', '1332/1542', '1358/1539', '1360/1570', '1360/1571',
       '1360/1572',
       ...
       '944/966', '945/369', '945/372', '945/966', '947/365', '986/390',
       '990/397', '990/408', '993/398', '994/419'],
      dtype='object', length=685)

In [102]:
sup_x_sup = X3.surface_total_in_m2 * X3.surface_total_in_m2
sup_x_sup.name

AttributeError: 'DataFrame' object has no attribute 'surface_total_in_m2'

In [103]:
sup_x_sup.rename("sup_x_sup", axis=1, inplace=True)
sup_x_sup.name

NameError: name 'sup_x_sup' is not defined

In [104]:
X3_cuad = pd.concat([X3, sup_x_sup],axis=1)
X3_cuad.shape

NameError: name 'sup_x_sup' is not defined

In [105]:
X3_cuad.columns

NameError: name 'X3_cuad' is not defined

#### Probamos los diferentes modelos incluyendo nuestra nueva feature ("sup_x_sup"):
### "X3_cuad"

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X3_cuad, y)

#### 8-a) Ridge Regression

##### probamos varios Alpha manualmente, y dejamos el más bajo pero sin seguir diminuyéndolo luego de que no se obervan mejoras en R2

In [None]:
rlm = linear_model.Ridge(alpha=0.01, normalize=True)   # EN GENERAL SIEMPRESE SE HACE NORMALIZACIÓN CON RIDGE

ridge_model = rlm.fit(X_train, y_train)
predictions = ridge_model.predict(X_train)

print ("r^2 Train:", ridge_model.score(X_train, y_train))
print ("r^2 TEST:", ridge_model.score(X_test, y_test))

In [None]:
ridge_model.alpha

#### 8-b) Lasso Regression

##### probamos varios Alpha manualmente, y dejamos el más bajo pero sin seguir diminuyéndolo luego de que no se obervan mejoras en R2

In [None]:
lasso = linear_model.Lasso(alpha=0.01, normalize=True) # SIEMPRE HACER NORMALIZACIÓN CON LASSO

lasso_model =lasso.fit(X_train, y_train)

print ("r^2 Train:", lasso_model.score(X_train, y_train))
print ("r^2 TEST:", lasso_model.score(X_test, y_test))

In [None]:
lasso_model.alpha

#### Para ambos modelos notamos una leve mejoría 1% del R2 Test y Train

####



#### Vamos a verificarlos utilizando CORSS-VALIDATION

CROSS-VALIDATION Y REGULTARIZACIÓN:

#### Importamos algunas librerías más requeridas para este punto

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split, KFold

from sklearn.preprocessing import StandardScaler

# KFold: Provides train/test indices to split data in train/test sets. 
# Split dataset into k consecutive folds (without shuffling by default).
# OBJETO PARA CROSS-VALIDATION (K-Folds cross-validator) 
cv = KFold(3, shuffle=True)
# LO BUENO DE USAR ESTE OBJETO, ES QUE PODRÍA USARLO PARA HACER SIEMPRE LAS MISMAS PARTICIONES

#### 8-c) Ridge Regression CV (Cross Validation)

In [None]:
# INSTANCIO Y FITEO EL MODELO RIDGE  -- ACÁ PRUEBO NORMALIZANDO
model = linear_model.RidgeCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean()))

In [None]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

#### 8-d) Lasso Regression CV (Cross Validation)

In [None]:
# INSTANCIO Y FITEO EL MODELO LASSO  -- ACÁ PRUEBO SIN NORMALIZAR

model = linear_model.LassoCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean(), zero_coefs=(model.coef_ == 0).sum()))
# se muestra que eliminó ("descartó") cierta cantidad de coeficientes


In [None]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

#### Se verificó para diferentes particiones los R2 Test y Train

### 9) Creación de Features Relacionales

#### Vamos a crear Features relacionando las columnas Dummie de las Parcelas con la columnas "surface_total_in_m2"

#### veremos si mejora la respuesta del modelo

#### Armamos "X3_cuad_relac"

In [None]:
X3_cuad.shape

In [None]:
X3_cuad.columns[0:20]

In [None]:
X3_cuad.columns[-1]

#### en base a las dos celdas anteriores SELECCIONAR A MANO LAS QUE NO VOY A MULTIPLICAR (las que no son dummies de posición y que hayan quedado luego de las sucesivas eliminaciones de fueatures con Pvalue > 0,01)

In [None]:
relac = X3_cuad.drop(['surface_total_in_m2', 'garage', 'lavadero', 'balcon', 'patio',
       'amenities', 'estrenar', 'gimnasio', 'piscina_house', 'piscina_depart', 'sup_x_sup'],
                     axis=1)

In [None]:
relac.shape

In [None]:
X3_cuad.surface_total_in_m2.shape

In [None]:
relaciones = relac.mul(X3_cuad.surface_total_in_m2, axis=0)
relaciones.shape

In [None]:
relaciones[relaciones.columns[0]]

In [None]:
relaciones[relaciones.columns[0]].loc[(relaciones[relaciones.columns[0]])>0]

In [None]:
# PEQUEÑA VERIFICACIÓN
display(X3_cuad.surface_total_in_m2[(relaciones[relaciones.columns[0]])>0])
print("")
display(relaciones[relaciones.columns[0]].loc[(relaciones[relaciones.columns[0]])>0])  # miro cóm queda

In [None]:
columnas = list(range(relaciones.columns.size))
columnas[-1]

In [None]:
relaciones.set_axis(columnas, axis=1, inplace=True)   # renombro las columnas nuevas (las relacionales)

In [None]:
relaciones.columns

In [None]:
relaciones.shape

In [None]:
X3_cuad_relac = pd.concat([X3_cuad, relaciones],axis=1)
X3_cuad_relac.shape

In [None]:
X3_cuad_relac.columns

#### Probamos los diferentes modelos incluyendo nuestras nuevas features RELACIONALES:
### "X3_cuad_relac"

#### Vamos a verificarlos utilizando CORSS-VALIDATION

CROSS-VALIDATION Y REGULTARIZACIÓN:

#### con "X3_cuad_relac"

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X3_cuad_relac, y)

#### 9-a) Ridge Regression CV (Cross Validation)

In [None]:
# INSTANCIO Y FITEO EL MODELO RIDGE  -- ACÁ PRUEBO NORMALIZANDO
model = linear_model.RidgeCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean()))


In [None]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

#### 9-b) Lasso Regression CV (Cross Validation)

In [None]:
# INSTANCIO Y FITEO EL MODELO LASSO  -- ACÁ PRUEBO SIN NORMALIZAR

model = linear_model.LassoCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean(), zero_coefs=(model.coef_ == 0).sum()))
# se muestra que eliminó ("descartó") cierta cantidad de coeficientes


In [None]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

#### 9-c) Stats Model Linear

#### Armamos "X3_cuad_relac_const"

In [None]:
# USANDO statsmodel
# Tenemos que agregar explícitamente a una constante:
X3_cuad_relac_const = sm.add_constant(X3_cuad_relac)
X3_cuad_relac_const.head(3)

In [None]:
X3_cuad_relac_const = X3_cuad_relac_const.astype("int", copy=True)   # lo paso a int sino no me da la memoria

In [None]:
# USANDO statsmodel
X_train, X_test, y_train, y_test = train_test_split(X3_cuad_relac_const, y)
model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print("EMC train:", mean_squared_error(y_train, predictions))
print("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: test:", r2_score(y_test, predictions2))  # PREDICT


##### SE OBSERVA QUE BAJÓ EL R2 TEST

In [None]:
print (model.summary())

#### Tercer eliminación de features con STATS MODEL - OLS:
##### Se eliminaran Features cuyo Pvalue supere el 1% (es decir que no están afectando a la varianza ni al sesgo del modelo)

In [None]:
not_feature_cols = model.pvalues[model.pvalues>0.01].index.values
not_feature_cols

#### Armamos "X4_cuad_relac_const"

In [None]:
X4_cuad_relac_const = X3_cuad_relac_const.drop(not_feature_cols, axis=1)

In [None]:
X4_cuad_relac_const.head(3)

#### STATS MODEL con "X4_cuad_relac_const"

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X4_cuad_relac_const, y)

In [None]:
# USANDO statsmodel

model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_train)
predictions2 = model.predict(X_test)

# Imprimimos el MSE y un resumen del modelo
print ("EMC train:", mean_squared_error(y_train, predictions))
print ("EMC train:", mean_squared_error(y_test, predictions2))

print('R2: ', model.rsquared) # model
print("R2: train:", r2_score(y_train, predictions))  # TRAIN
print("R2: test:", r2_score(y_test, predictions2))  # PREDICT

#### También probemos RIDGE_CV y LASSO_CV CON con "X4_cuad_relac"

#### armammos "X4_cuad_relac"

In [None]:
X4_cuad_relac = X4_cuad_relac_const.drop(["const"], axis=1)

In [None]:
X4_cuad_relac.head(3)

#### probamos con "X4_cuad_relac"

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X4_cuad_relac, y)

#### 9-d) Lasso Regression CV (Cross Validation)

In [None]:
# INSTANCIO Y FITEO EL MODELO LASSO  -- ACÁ PRUEBO SIN NORMALIZAR

model = linear_model.LassoCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean(), zero_coefs=(model.coef_ == 0).sum()))
# se muestra que eliminó ("descartó") cierta cantidad de coeficientes


In [None]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

#### 9-e) Ridge Regression CV (Cross Validation)

##### Ridge CV (Cross Validation)

In [None]:
# INSTANCIO Y FITEO EL MODELO RIDGE  -- ACÁ PRUEBO NORMALIZANDO
model = linear_model.RidgeCV(alphas=np.linspace(0.01, 0.1, 10), normalize=True).fit(X_train, y_train)

# HAGO CROSS VALIDATIONS
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# MUESTRA TODOS LOS PARÁMETROS OBTENIDOS
print(dict(alpha=model.alpha_, scores=scores, mean_score=scores.mean()))
# se muestra que eliminó ("descartó") cierta cantidad de coeficientes


In [None]:
#r2 TEST:
model.score(X_test, y_test)  # acá vemos con los datos de TEST

<hr id="ToCSV">
<h2 style="color: lightblue">CONCLUSIONES</h2>