In [1]:
# Se importan las librerías necesarias para realizar la limpieza y entrenamientos de los modelos seleccionados:
# LinearRegression
# RandomForestRegressor
# GradientBoostingRegressor
# HistGradientBoostingRegressor
# DecisionTreeRegressor
# KNeighborsRegressor

import pandas as pd
import requests
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,explained_variance_score

In [2]:
# Load the training data; the first CSV column is used as the DataFrame index.
TRAIN_CSV_PATH = '../input/train.csv'
train_dataset = pd.read_csv(TRAIN_CSV_PATH, index_col=0)


In [3]:
#predict_dataset = pd.read_csv('./predict.csv', index_col=0)


In [4]:
# Preview the first rows of the data before starting the clean-up

train_dataset.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1.12,Premium,H,SI1,61.6,59.0,6.67,6.63,4.1,5363
1,1.14,Very Good,E,SI2,60.0,54.0,6.74,6.97,4.11,5593
2,0.9,Very Good,D,SI2,60.3,63.0,6.12,6.22,3.72,3534
3,0.71,Ideal,E,VS1,61.9,54.0,5.74,5.76,3.56,3212
4,0.34,Very Good,F,SI2,60.0,62.0,4.51,4.55,2.72,447


In [5]:
# List the dtype of every column; the `object` columns (cut, color, clarity)
# are the ones encoded as numbers further down.
train_dataset.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [6]:
#train_dataset.cut.value_counts()

In [7]:
#train_dataset.color.value_counts()

In [8]:
#train_dataset.clarity.value_counts()

In [9]:
#train_dataset.isna().sum()

In [10]:
# Convert the categorical columns into numeric codes

def num_transform(datasets, columns):
    """Add a ``<column>_num`` integer-coded column for each categorical column.

    One ``LabelEncoder`` is fitted per column on the values of *all* the given
    datasets together and then applied to each dataset, so a category always
    maps to the same integer in every dataset (e.g. train and predict sets).
    Re-fitting an encoder separately on each dataset would assign different
    codes whenever one dataset lacks a category.

    Parameters
    ----------
    datasets : iterable of pandas.DataFrame
        Frames to encode. They are modified in place.
    columns : iterable of str
        Names of the categorical columns to encode.

    Returns
    -------
    dict
        Maps each column name to its fitted ``LabelEncoder`` so the mapping
        can be reused or inverted later.
    """
    datasets = list(datasets)  # iterated more than once below
    encoders = {}
    if not datasets:
        return encoders
    for column in columns:
        encoder = preprocessing.LabelEncoder()
        # Fit on the union of values so every dataset shares one mapping.
        encoder.fit(pd.concat([ds[column] for ds in datasets], ignore_index=True))
        for ds in datasets:
            ds[f"{column}_num"] = encoder.transform(ds[column])
        encoders[column] = encoder
    return encoders


datasets = [train_dataset]
columns = ["cut","color","clarity"]
encoders = num_transform(datasets,columns)
# `le` stays available at module level and, as before, ends up being the
# encoder fitted on the last column of `columns`.
le = encoders[columns[-1]]

In [11]:
#train_dataset.head()

In [12]:
# Build the feature matrix X and the target vector y.
# The raw categorical columns are left out of X (their *_num encodings are
# used instead), and so is the target column itself.

excluded_columns = {"cut", "color", "clarity", "price"}
columnas = [name for name in train_dataset.columns if name not in excluded_columns]
X = train_dataset.loc[:, columnas]
y = train_dataset["price"]

In [1]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# random_state is pinned so the split, and therefore every metric computed
# from it, is identical on each Restart & Run All.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

NameError: name 'train_test_split' is not defined

In [14]:
# Configure the hyper-parameters of each candidate model.
# Every estimator that uses randomness gets a fixed random_state so that
# re-running the notebook reproduces the same fitted models.

models = {
    "linear": LinearRegression(),
    "rfr": RandomForestRegressor(n_estimators=400, max_features=5, max_depth=35, random_state=0),
    # NOTE(review): loss='lad' was renamed to 'absolute_error' in scikit-learn 1.0
    # and the old name was removed in 1.2; switch the value when upgrading.
    "gbr": GradientBoostingRegressor(
        n_estimators=300, max_depth=25, learning_rate=0.2, loss='lad', random_state=0
    ),
    "hgbr": HistGradientBoostingRegressor(
        loss='poisson', max_iter=200, max_leaf_nodes=33, max_depth=5, random_state=0
    ),
    # min_impurity_split=None was dropped: None was already the default, and the
    # parameter no longer exists in scikit-learn >= 1.0 (passing it is a TypeError).
    "dtr": DecisionTreeRegressor(
        splitter='best',
        max_depth=250,
        min_samples_split=50,
        min_samples_leaf=2,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=0,
        max_leaf_nodes=2000,
        min_impurity_decrease=0.0,
        ccp_alpha=0.0,
    ),
    "knr": KNeighborsRegressor(),
}

# Fit every model in the dictionary on the training split

for name, m in models.items():
    print(f"Training {name}...")
    m.fit(X_train, y_train)
print("Train complete")

Training linear...
Training rfr...
Training gbr...
Training hgbr...
Training dtr...
Training knr...
Train complete


In [15]:
# Print the metrics used to evaluate the models and pick the best-performing ones

def printMetric(label, value):
    """Print one metric as an indented ``label: value`` line, rounded to 3 decimals."""
    rounded = round(value, 3)
    print(f"\t {label}: {rounded}")
# Score every fitted model on the held-out split and report its metrics.
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("R2_score", r2_score(y_test, y_pred))
    # RMSE is obtained as the square root of the mean squared error.
    printMetric("Root mean squared error", mean_squared_error(y_test, y_pred) ** 0.5)
    printMetric("Mean absolute error", mean_absolute_error(y_test, y_pred))
    printMetric("explained variance score", explained_variance_score(y_test, y_pred))
# This cell evaluates, it does not train, so the closing message says so.
print("Evaluation complete")

Evaluating model linear
	 R2_score: 0.883
	 Root mean squared error: 1348.754
	 Mean absolute error: 855.801
	 explained variance score: 0.883
Evaluating model rfr
	 R2_score: 0.98
	 Root mean squared error: 559.081
	 Mean absolute error: 274.582
	 explained variance score: 0.98
Evaluating model gbr
	 R2_score: 0.978
	 Root mean squared error: 587.745
	 Mean absolute error: 287.417
	 explained variance score: 0.978
Evaluating model hgbr
	 R2_score: 0.98
	 Root mean squared error: 565.292
	 Mean absolute error: 279.507
	 explained variance score: 0.98
Evaluating model dtr
	 R2_score: 0.973
	 Root mean squared error: 654.666
	 Mean absolute error: 329.532
	 explained variance score: 0.973
Evaluating model knr
	 R2_score: 0.945
	 Root mean squared error: 923.859
	 Mean absolute error: 491.415
	 explained variance score: 0.946
Train complete
