## Generamos y Evaluamos los modelos de ML

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
datos = pd.read_csv("df41d_mapa_feateng_f2.csv", index_col='index')

In [3]:
y=datos.price_usd_per_m2
X=datos.iloc[:,1:]

In [32]:
X.head()

Unnamed: 0_level_0,property_type,place_name,surface_total_in_m2,surface_covered_in_m2,rooms,cercania_subtes,cercania_av,cercania_tren,cercania_bus,cercania_hospi,...,cant_univ,cant_edu,cant_banco,cant_gastro,cant_cult,cant_delito,surface_total_in_m2_t,cocheras,amenities_ex,estado
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,PH,Mataderos,55.0,40.0,2.0,0.050246,0.003009,0.026301,0.027171,0.013671,...,0.0,5.0,0.0,0.0,2.0,89.0,0.13484,0,1.0,ninguno
1,apartment,Liniers,55.0,55.0,2.0,0.057926,0.003574,0.013199,0.013782,0.008378,...,0.0,4.0,1.0,0.0,1.0,86.0,0.13484,1,0.0,nuevo
2,apartment,Belgrano,45.0,40.0,1.0,0.010611,0.00128,0.005623,0.009783,0.006277,...,0.0,9.0,2.0,0.0,4.0,142.0,0.149071,0,0.0,ninguno
3,apartment,Palermo,50.0,30.0,2.0,0.001581,0.000892,0.001965,0.000953,0.01827,...,1.0,15.0,3.0,0.0,9.0,423.0,0.141421,0,1.0,ninguno
4,apartment,Palermo,42.0,31.0,1.0,0.001581,0.000892,0.001965,0.000953,0.01827,...,1.0,15.0,3.0,0.0,9.0,423.0,0.154303,0,0.0,ninguno


In [5]:
X.drop(['lat','lon','amenities'],axis=1,inplace=True)

Analizamos los valores categoricos para luego verificar que al aplicar los procesos de transformacion durante el pipeline existe consistencia con la cantidad de columnas

In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18957 entries, 0 to 20569
Data columns (total 27 columns):
property_type            18957 non-null object
place_name               18957 non-null object
surface_total_in_m2      18957 non-null float64
surface_covered_in_m2    18957 non-null float64
rooms                    18957 non-null float64
cercania_subtes          18957 non-null float64
cercania_av              18957 non-null float64
cercania_tren            18957 non-null float64
cercania_bus             18957 non-null float64
cercania_hospi           18957 non-null float64
cercania_est_edu         18957 non-null float64
cercania_univ            18957 non-null float64
cercania_comisarias      18957 non-null float64
cercania_verde           18957 non-null float64
cant_deptos              18957 non-null float64
cant_subte               18957 non-null float64
cant_hospi               18957 non-null float64
cant_univ                18957 non-null float64
cant_edu                 1895

In [7]:
len(X.columns)

27

In [8]:
X['place_name'].nunique()

48

In [9]:
X['property_type'].nunique()

2

In [10]:
X['estado'].nunique()

4

In [11]:
28-3+48+4+4

81

## Dado que el Dataset con los features tiene un toatl de 16 columnas con 2 variables categoricas de las cuales:
`place_name` contiene 48 valores unicos.  
`property_type` posee 4 valores unicos.
Entonces es de esperarse que la matriz final luego de aplicarle el onehot encoding tenga una total de __81__ (29 - 3 + 48 + 4 + 3) columnas

### Pipelines

Un pipeline de sklearn se define como una secuencia de pasos. Cada paso se define con una tupla de forma `[nombre del paso, transformador]`

Por ejemplo, si queremos crear un pipeline que procese las variables numéricas, primero imputándolas y después estandarizandolas, y adicionalmente otro que procese las variables categoricas aplicandoles un Onehot enconding, para finalmente procesarlos con los estimadores seleccionados, entonces podriamos crear un pipeline de la siguiente manera:

<img src="./media/pipeline.png">

In [12]:
from sklearn import preprocessing, feature_extraction
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn import svm
from sklearn import neighbors
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

In [13]:
# Defino array con las columnas numericas 
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
# Defino array con las columnas categoricas
categorical_features = X.select_dtypes(include=['object']).columns

display(numeric_features)
display(categorical_features)

# generamos un pipeline para trabajar con los features numericos (int y float)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# generamos un pipeline para trabajar con los features categoricos (object)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# creamos un pipeline para separar los datos en numericos y categoricos y aplicarle los correspondientes
# pipelines antes generados.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


Index(['surface_total_in_m2', 'surface_covered_in_m2', 'rooms',
       'cercania_subtes', 'cercania_av', 'cercania_tren', 'cercania_bus',
       'cercania_hospi', 'cercania_est_edu', 'cercania_univ',
       'cercania_comisarias', 'cercania_verde', 'cant_deptos', 'cant_subte',
       'cant_hospi', 'cant_univ', 'cant_edu', 'cant_banco', 'cant_gastro',
       'cant_cult', 'cant_delito', 'surface_total_in_m2_t', 'cocheras',
       'amenities_ex'],
      dtype='object')

Index(['property_type', 'place_name', 'estado'], dtype='object')

In [14]:
# probamos los pipelines recien creados
test = preprocessor.fit(X).transform(X)
len(test[0])

78

El numero anterior coincide con el caclulo que habiamos realizado anteriormente al comienzo de la notebook donde calculabamos el numero final de columnas de la matriz de datos una vez implementado el onehot encoding. Por ende los pipeline estan funcionando correctamente.

### Seleccion de Modelo e hiperparametros
Un pipeline tambien puede utilizarse durante el proceso de seleccion de modelo y ajuste de hiperparametros. El siguiente codigo itera entre varios Regresores, habiendo aplicado previamente las transformaciones a las variables categoricas y numericas.

Mediante el uso de grid search se puede hallar los hiperparametros que obtienen mejorers resultados. 

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1000)

ridge_params = {'alpha':np.linspace(1.5, 2, num=10, endpoint=True)}
lasso_params = {'alpha':np.linspace(0.015, 0.02, num=5, endpoint=False)}
en_params = {'alpha':np.linspace(0.001, 0.01, num=10, endpoint=False)}
dt_params = {'max_depth':[20]}
rf_params = {'n_estimators':np.arange(100,110,5),'max_depth':[20,25]}

classifiers = [
    LinearRegression(), 
    GridSearchCV(cv=5, estimator=Ridge(), param_grid=ridge_params), 
    GridSearchCV(cv=5, estimator=Lasso(), param_grid=lasso_params),
    GridSearchCV(cv=5, estimator=ElasticNet(), param_grid=en_params),
    GridSearchCV(cv=5, estimator=tree.DecisionTreeRegressor(), param_grid=dt_params),
    GridSearchCV(cv=5, estimator=RandomForestRegressor(), param_grid=rf_params)
    #svm.SVR(),
    #tree.DecisionTreeRegressor(max_depth=15)
]

for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)   
    print()
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
model score: 0.612

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.5    , 1.55556, 1.61111, 1.66667, 1.72222, 1.77778, 1.83333,
       1.88889, 1.94444, 2.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
model score: 0.614

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([0.015, 0.016, 0.017, 0.018, 0.019])},


In [16]:
df_ridge = pd.DataFrame(classifiers[1].cv_results_)[['mean_test_score', 'std_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False)
display(df_ridge.head())

Unnamed: 0,mean_test_score,std_test_score,params
0,0.626861,0.010249,{'alpha': 1.5}
1,0.626853,0.010243,{'alpha': 1.5555555555555556}
2,0.626846,0.010238,{'alpha': 1.6111111111111112}
3,0.626838,0.010232,{'alpha': 1.6666666666666667}
4,0.62683,0.010226,{'alpha': 1.7222222222222223}


In [17]:
df_lasso = pd.DataFrame(classifiers[2].cv_results_)[['mean_test_score', 'std_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False)
display(df_lasso.head())

Unnamed: 0,mean_test_score,std_test_score,params
1,0.626956,0.010365,{'alpha': 0.016}
2,0.626956,0.010363,{'alpha': 0.017}
3,0.626956,0.010361,{'alpha': 0.018000000000000002}
0,0.626955,0.010366,{'alpha': 0.015}
4,0.626955,0.010359,{'alpha': 0.019}


In [18]:
df_en = pd.DataFrame(classifiers[3].cv_results_)[['mean_test_score', 'std_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False)
display(df_en.head())

Unnamed: 0,mean_test_score,std_test_score,params
0,0.626086,0.009925,{'alpha': 0.001}
1,0.624547,0.009624,{'alpha': 0.0019000000000000002}
2,0.622584,0.009395,{'alpha': 0.0028000000000000004}
3,0.620313,0.009212,{'alpha': 0.0037}
4,0.617818,0.009061,{'alpha': 0.0046}


In [19]:
df_dt = pd.DataFrame(classifiers[4].cv_results_)[['mean_test_score', 'std_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False)
display(df_dt.head())

Unnamed: 0,mean_test_score,std_test_score,params
0,0.541817,0.027263,{'max_depth': 20}


In [20]:
df_rf = pd.DataFrame(classifiers[5].cv_results_)[['mean_test_score', 'std_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False)
display(df_rf.head())

Unnamed: 0,mean_test_score,std_test_score,params
2,0.720961,0.011387,"{'max_depth': 25, 'n_estimators': 100}"
3,0.720884,0.01198,"{'max_depth': 25, 'n_estimators': 105}"
0,0.717143,0.011897,"{'max_depth': 20, 'n_estimators': 100}"
1,0.716338,0.013791,"{'max_depth': 20, 'n_estimators': 105}"


## Persistiendo el Preprocesador

In [21]:
import pickle

In [22]:
file = open("preprocesador.pkl", "wb")
pickle.dump(preprocessor, file)
file.close()

In [23]:
preprocesador = pickle.load(open("preprocesador.pkl", "rb"))

In [24]:
preprocesador.transform(X[1:2])

array([[-0.40077854, -0.28265435, -0.43957279,  5.10098577,  2.40768707,
         0.78449971,  0.41423218, -0.62439025, -0.83826726,  6.62034613,
         3.92006613,  0.94446158, -0.89540149, -0.68563466, -0.26260594,
        -0.50097111, -1.25049429, -0.62325088, -0.43196328, -0.69910718,
        -1.17107678,  0.15835119,  1.10790977, -1.45256368,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [34]:
X[1:2].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 1 to 1
Data columns (total 27 columns):
property_type            1 non-null object
place_name               1 non-null object
surface_total_in_m2      1 non-null float64
surface_covered_in_m2    1 non-null float64
rooms                    1 non-null float64
cercania_subtes          1 non-null float64
cercania_av              1 non-null float64
cercania_tren            1 non-null float64
cercania_bus             1 non-null float64
cercania_hospi           1 non-null float64
cercania_est_edu         1 non-null float64
cercania_univ            1 non-null float64
cercania_comisarias      1 non-null float64
cercania_verde           1 non-null float64
cant_deptos              1 non-null float64
cant_subte               1 non-null float64
cant_hospi               1 non-null float64
cant_univ                1 non-null float64
cant_edu                 1 non-null float64
cant_banco               1 non-null float64
cant_gastro          

## Persistiendo el modelo optimo

In [25]:
import pickle

In [26]:
file = open("modelo.pkl", "wb")
pickle.dump(classifiers[5], file)
file.close()

In [27]:
modelo = pickle.load(open("modelo.pkl", "rb"))

In [28]:
modelo.best_score_

0.7209609428503836

In [29]:
preprocesador.transform(X[1:2])

array([[-0.40077854, -0.28265435, -0.43957279,  5.10098577,  2.40768707,
         0.78449971,  0.41423218, -0.62439025, -0.83826726,  6.62034613,
         3.92006613,  0.94446158, -0.89540149, -0.68563466, -0.26260594,
        -0.50097111, -1.25049429, -0.62325088, -0.43196328, -0.69910718,
        -1.17107678,  0.15835119,  1.10790977, -1.45256368,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [30]:
modelo.predict(preprocesador.transform(X[1:2]))

array([2077.66725153])