# Herramientas para Machine Learning - Ejemplo (dataset House Information)
En primer lugar se cargan las librerías y el dataset a evaluar (**HouseDataset.csv**). La función `head` muestra las primeras filas del dataset.

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

#Clasificación
from xgboost import XGBClassifier

#Regresión
from xgboost import XGBRegressor
from xgboost import plot_importance
from matplotlib import pyplot

from sklearn.pipeline import Pipeline

#Desactivar los warnings
import warnings
warnings.filterwarnings("ignore")

# Load the housing dataset (path is relative to the working directory)
# and preview its first five rows.
DATA_FILE = 'HouseDataset.csv'
df = pd.read_csv(DATA_FILE)
df.head(5)

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
0,2126,4,1,1969,215355.283618
1,2459,3,2,1980,195014.221626
2,1860,2,1,1970,306891.012076
3,2294,2,1,1996,206786.787153
4,2130,5,2,2001,272436.239065


Se utiliza la función **describe** de Pandas para mostrar diversos parámetros estadísticos de interés de cada columna numérica.

In [10]:
# Summary statistics of the numeric columns, transposed so that each
# row of the displayed table corresponds to one column of df.
summary_stats = df.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SquareFeet,50000.0,2006.37468,575.513241,1000.0,1513.0,2007.0,2506.0,2999.0
Bedrooms,50000.0,3.4987,1.116326,2.0,3.0,3.0,4.0,5.0
Bathrooms,50000.0,1.99542,0.815851,1.0,1.0,2.0,3.0,3.0
YearBuilt,50000.0,1985.40442,20.719377,1950.0,1967.0,1985.0,2003.0,2021.0
Price,50000.0,224827.325151,76141.842966,-36588.165397,169955.860225,225052.141166,279373.630052,492195.259972


### Separar predictoras y target
Se crearán tres subconjuntos a partir de `df`:

- Val: se utilizará para validar el modelo final.
- Train: se utilizará para entrenar el modelo.
- Test: se utilizará para evaluar el modelo durante el entrenamiento.

In [11]:
# Sizes of the two samples taken from df:
#   - the modelling sample, which is later split into Train and Test;
#   - the validation sample, reserved for validating the final model.
N_MODEL_ROWS = 1500
N_VAL_ROWS = 250

# Column used as the target; every other column is a predictor.
TARGET = 'SquareFeet'

# Modelling sample: the first N_MODEL_ROWS rows of df.
model_rows = df.iloc[:N_MODEL_ROWS]
X = model_rows.drop(columns=TARGET)
y = model_rows[TARGET]

# Val: the N_VAL_ROWS rows that come right after the modelling sample.
# It must not overlap the modelling sample, otherwise the final validation
# would be scored on rows the model has already seen during training.
val_rows = df.iloc[N_MODEL_ROWS:N_MODEL_ROWS + N_VAL_ROWS]
Val_x = val_rows.drop(columns=TARGET)
Val_y = val_rows[TARGET]

# Guard against the two samples ever sharing rows.
assert len(X.index.intersection(Val_x.index)) == 0, "Val overlaps the Train/Test sample"

print(X.shape)

(1500, 4)


In [12]:
# Show the size of the validation predictors as (rows, columns).
val_shape = Val_x.shape
print(val_shape)

(250, 4)


Se estandarizan las variables predictoras de la muestra seleccionada con `StandardScaler` (media 0 y desviación estándar 1 por columna):

In [13]:
# Standardise every predictor column of X with StandardScaler.
# NOTE(review): the scaler is fitted on all of X before the Train/Test split,
# so the test rows also contribute to the fitted statistics. Consider fitting
# on the Train rows only.
mms = StandardScaler()
mms.fit(X)
X_mms = mms.transform(X)

# Summary statistics of the scaled columns, one row per column.
pd.DataFrame(X_mms).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,1500.0,-1.219765e-16,1.000334,-1.380805,-0.471583,0.437639,0.437639,1.346861
1,1500.0,-1.065814e-16,1.000334,-1.231986,-1.231986,-0.013806,1.204374,1.204374
2,1500.0,-4.838796e-15,1.000334,-1.716098,-0.851791,0.012516,0.828806,1.693114
3,1500.0,1.19608e-16,1.000334,-3.291711,-0.691574,0.014514,0.675296,2.668747


Se separa la muestra en **train** y **test** (50 % cada uno):

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Label encoder instance, created here so it is available to later cells.
le = LabelEncoder()

# Split the scaled predictors and the target into two halves (Train / Test).
# The fixed random_state keeps the split reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(
    X_mms,
    y,
    test_size=0.5,
    random_state=5567,
)

# Report the size of each half.
for subset in (train_x, test_x):
    print(subset.shape)

(750, 4)
(750, 4)


Se entrena el modelo:

In [22]:
# Instantiate the classifier
xgb = XGBClassifier(use_label_encoder=False) # use_label_encoder=False is passed to avoid a warning
# Encode the target values as integer class labels.
# NOTE(review): the target is the SquareFeet column, which looks continuous
# (describe() reports values from 1000 to 2999). Encoding each distinct value
# as its own class makes this a many-class problem; confirm that a classifier
# is intended here rather than XGBRegressor.
train_y = le.fit_transform(train_y)
# NOTE(review): this second fit_transform refits `le` on the test labels, so
# these integer codes come from a different mapping than the one used for
# train_y. TODO: confirm, and encode both splits with one fitted mapping.
test_y = le.fit_transform(test_y)

# Train
# NOTE(review): early stopping is driven by the Test split, so Test is no
# longer an untouched evaluation set. Verify that a separate hold-out is used
# for the final score.
xgb.fit(train_x,train_y,
        early_stopping_rounds = 4, # number of rounds without improvement in the metric before training stops
        eval_set=[(test_x, test_y)], # test split used for evaluation during training
        eval_metric = 'auc') # evaluation metric

[0]	validation_0-auc:0.56661
[1]	validation_0-auc:0.57459
[2]	validation_0-auc:0.59266
[3]	validation_0-auc:0.60276
[4]	validation_0-auc:0.60942
[5]	validation_0-auc:0.60481
[6]	validation_0-auc:0.61160
[7]	validation_0-auc:0.61391
[8]	validation_0-auc:0.61472
[9]	validation_0-auc:0.61164
[10]	validation_0-auc:0.61585
[11]	validation_0-auc:0.61489
[12]	validation_0-auc:0.61690
[13]	validation_0-auc:0.61812
[14]	validation_0-auc:0.61895
[15]	validation_0-auc:0.61810
[16]	validation_0-auc:0.61821
[17]	validation_0-auc:0.61757
