In [1]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict  # Necesaria para validación cruzada
import pandas as pd
import numpy as np
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor
import xgboost as xgb 
import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error
import joblib







### En primer lugar se prueban los modelos más comúnes.

In [2]:
cleaned_data= pd.read_csv("Data/train_2.csv")


In [3]:
# Separar las características y la variable objetivo
X = cleaned_data.drop(columns=['SalePrice'])
y = cleaned_data['SalePrice']

# Dividir en conjuntos de entrenamiento y validación
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalización de los datos usando MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Modelos a probar
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Support Vector Machine': SVR()
}

# Crear un DataFrame para almacenar los resultados
results = pd.DataFrame(columns=['Model', 'RMSLE_train', 'R2_train', 'RMSLE_test', 'R2_test'])

# Función para calcular RMSLE asegurando que los valores sean positivos
def rmsle(y_true, y_pred):
    # Asegurarse de que todos los valores sean mayores o iguales a 0
    y_true = np.where(y_true < 0, 0, y_true)
    y_pred = np.where(y_pred < 0, 0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Entrenar y evaluar cada modelo
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    
    # Predicciones en el conjunto de entrenamiento
    train_predictions = model.predict(X_train_scaled)
    rmsle_train = rmsle(y_train, train_predictions)
    r2_train = r2_score(y_train, train_predictions)
    
    # Predicciones en el conjunto de validación
    test_predictions = model.predict(X_valid_scaled)
    rmsle_test = rmsle(y_valid, test_predictions)
    r2_test = r2_score(y_valid, test_predictions)
    
    # Almacenar los resultados
    results = results.append({
        'Model': model_name,
        'RMSLE_train': rmsle_train,
        'R2_train': r2_train,
        'RMSLE_test': rmsle_test,
        'R2_test': r2_test
    }, ignore_index=True)

# Reorganizar las columnas
results = results[['Model', 'RMSLE_train', 'RMSLE_test', 'R2_train', 'R2_test']]

# Redondear a dos decimales para RMSLE y R2
results[['RMSLE_train', 'RMSLE_test']] = results[['RMSLE_train', 'RMSLE_test']].round(2)
results[['R2_train', 'R2_test']] = results[['R2_train', 'R2_test']].round(2)

# Formatear RMSLE y R2: 
def format_rmsle(value, model_name):
    if model_name == 'Linear Regression':
        return f"{value:.2e}"  # Notación científica
    else:
        return f"{value:,.2f}"  # Formato legible

# Aplicar formato a RMSLE
results['RMSLE_train'] = [format_rmsle(value, model) for value, model in zip(results['RMSLE_train'], results['Model'])]
results['RMSLE_test'] = [format_rmsle(value, model) for value, model in zip(results['RMSLE_test'], results['Model'])]

# Formatear R2: notación científica solo para Linear Regression
def format_r2(value, model_name):
    if model_name == 'Linear Regression':
        return f"{value:.2e}"  # Notación científica
    else:
        return f"{value:.2f}"  # Formato legible

results['R2_train'] = [format_r2(value, model) for value, model in zip(results['R2_train'], results['Model'])]
results['R2_test'] = [format_r2(value, model) for value, model in zip(results['R2_test'], results['Model'])]

# Mostrar los resultados ordenados por RMSLE_train y RMSLE_test
results = results.sort_values(by=['RMSLE_train', 'RMSLE_test']).reset_index(drop=True)
print(results)

# Guardar el escalador en la subcarpeta Data
scaler_filename = 'Data/minmax_scaler.pkl'
joblib.dump(scaler, scaler_filename)

print(f"Escalador guardado como {scaler_filename}")

  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


                    Model RMSLE_train RMSLE_test  R2_train    R2_test
0           Decision Tree        0.00       0.20      1.00       0.71
1           Random Forest        0.06       0.14      0.98       0.89
2       Gradient Boosting        0.08       0.14      0.97       0.90
3  Support Vector Machine        0.40       0.42     -0.05      -0.06
4       Linear Regression    3.40e-01   2.66e+00  8.90e-01  -2.49e+21
Escalador guardado como Data/minmax_scaler.pkl


  results = results.append({


Se usan dos métricas RMSLE que es la que se usará como referencia en el reto y $R^2$ que se medirán tanto en train como en test, con el objetivo de valorar el sobreajuste.
Se observan métricas prometedoras en los modelos más robustos (Gradient Boosting y Random Forest). Los modelos que no funcionan bien con relaciones no lineales tienen un desempeño muy pobre. Todo ello sugiere que las variables se relacionan entre si de forma compleja.

 ### Modelos de ensamblado

In [5]:
# Separar las características y la variable objetivo
X = cleaned_data.drop(columns=['SalePrice'])
y = cleaned_data['SalePrice']

# Dividir en conjuntos de entrenamiento y validación
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalización de los datos usando MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Modelos a probar
models = {
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(n_estimators=100),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100),
    
}

# Crear un DataFrame para almacenar los resultados
results = pd.DataFrame(columns=['Model', 'RMSLE_train', 'R2_train', 'RMSLE_test', 'R2_test'])

# Función para calcular RMSLE asegurando que los valores sean no negativos
def rmsle(y_true, y_pred):
    # Asegurar que todos los valores sean mayores o iguales a 0
    y_true = np.where(y_true < 0, 0, y_true)
    y_pred = np.where(y_pred < 0, 0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Entrenar y evaluar cada modelo
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # Predicciones en el conjunto de entrenamiento
    train_predictions = model.predict(X_train_scaled)
    rmsle_train = rmsle(y_train, train_predictions)
    r2_train = r2_score(y_train, train_predictions)

    # Predicciones en el conjunto de validación
    test_predictions = model.predict(X_valid_scaled)
    rmsle_test = rmsle(y_valid, test_predictions)
    r2_test = r2_score(y_valid, test_predictions)

    # Almacenar los resultados
    results = pd.concat([results, pd.DataFrame({
        'Model': [model_name],
        'RMSLE_train': [rmsle_train],
        'RMSLE_test': [rmsle_test],
        'R2_train': [r2_train],
        'R2_test': [r2_test]
    })], ignore_index=True)

# Mostrar los resultados ordenados por RMSLE de prueba
results = results.sort_values(by='RMSLE_test').reset_index(drop=True)

# Mostrar resultados con dos decimales
pd.options.display.float_format = '{:.2f}'.format
print(results)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2478
[LightGBM] [Info] Number of data points in the train set: 1166, number of used features: 70
[LightGBM] [Info] Start training from score 180976.861063
               Model  RMSLE_train  R2_train  RMSLE_test  R2_test
0  Gradient Boosting         0.08      0.97        0.13     0.90
1           LightGBM         0.05      0.98        0.14     0.89
2            XGBoost         0.01      1.00        0.14     0.88
3           AdaBoost         0.19      0.88        0.22     0.82


found 0 physical cores < 1
  File "C:\Users\oscar\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


Dado los buenos resultados con Gradient Boosting se prueban otros modelos de la misma famila para ver si alguno lo mejora, siendo la diferencia prácticamente nula.

### CatBosst

In [6]:
data= pd.read_csv("Data/train.csv")


In [7]:
# Separar las características y la variable objetivo
X = data.drop(columns=['SalePrice'])  # Cambia 'SalePrice' por tu variable objetivo
y = data['SalePrice']

# Manejar los NaNs en las características categóricas
X = X.fillna("Desconocido")

# Convertir las características categóricas a tipo string
categorical_cols = X.select_dtypes(include=['object']).columns
X[categorical_cols] = X[categorical_cols].astype(str)

# Dividir el conjunto de datos en entrenamiento y validación
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Identificar las características categóricas
categorical_features_indices = [X_train.columns.get_loc(col) for col in categorical_cols]

# Crear y entrenar el modelo
model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, cat_features=categorical_features_indices, verbose=100)
model.fit(X_train, y_train)

# Realizar predicciones
train_preds = model.predict(X_train)
valid_preds = model.predict(X_valid)

# Definir función para RMSLE
def rmsle(y_true, y_pred):
    y_true = np.maximum(0, y_true)  # Ajuste para valores no negativos
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Calcular métricas de evaluación
train_rmsle = rmsle(y_train, train_preds)
valid_rmsle = rmsle(y_valid, valid_preds)

train_r2 = r2_score(y_train, train_preds)
valid_r2 = r2_score(y_valid, valid_preds)

# Resultados
print("CatBoost Metrics:")
print(f"Train RMSLE: {train_rmsle:.2f}, Train R²: {train_r2:.2f}")
print(f"Validation RMSLE: {valid_rmsle:.2f}, Validation R²: {valid_r2:.2f}")

0:	learn: 72162.6819362	total: 300ms	remaining: 5m
100:	learn: 16906.5345257	total: 11.6s	remaining: 1m 43s
200:	learn: 12491.7206160	total: 23.4s	remaining: 1m 32s
300:	learn: 10194.6269635	total: 34.9s	remaining: 1m 20s
400:	learn: 8539.2389198	total: 46.7s	remaining: 1m 9s
500:	learn: 7423.0275130	total: 57.9s	remaining: 57.7s
600:	learn: 6521.4174597	total: 1m 9s	remaining: 46.1s
700:	learn: 5611.5770354	total: 1m 21s	remaining: 34.6s
800:	learn: 4836.5708256	total: 1m 32s	remaining: 23s
900:	learn: 4229.1078623	total: 1m 44s	remaining: 11.4s
999:	learn: 3845.4652330	total: 1m 55s	remaining: 0us
CatBoost Metrics:
Train RMSLE: 0.05, Train R²: 0.99
Validation RMSLE: 0.14, Validation R²: 0.91


También se prueba CatBoost que es un modelo de ensamblaje que tiene la particularidad de admitir datos categóricos. Las métricas en test son ligeramante superiores pero el sobreajuste, que ya era elevado en los modelos numéricos, es inadmisible.

### Stacking

In [8]:
cleaned_data= pd.read_csv("Data/train_2.csv")

In [9]:
# Separar las características y la variable objetivo
X = cleaned_data.drop(columns=['SalePrice'])
y = cleaned_data['SalePrice']

# Dividir en conjuntos de entrenamiento y validación
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalización de los datos usando MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

base_models = [
    ('random_forest', RandomForestRegressor(n_estimators=100)),
    ('gradient_boosting', GradientBoostingRegressor(n_estimators=100)),
    ('lightgbm', LGBMRegressor(n_estimators=100))
]

meta_model = XGBRegressor(
    n_estimators=100,   
    learning_rate=0.1,  
    max_depth=3,        
    random_state=42
)

# Definir el modelo de stacking
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Entrenar el modelo de stacking
stacking_model.fit(X_train_scaled, y_train)

# Predicciones en el conjunto de entrenamiento y validación
train_predictions = stacking_model.predict(X_train_scaled)
test_predictions = stacking_model.predict(X_valid_scaled)

# Definir función para RMSLE
def rmsle(y_true, y_pred):
    y_true = np.maximum(0, y_true)  # Asegurarse de que los valores sean no negativos
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Calcular métricas de evaluación
rmsle_train = rmsle(y_train, train_predictions)
r2_train = r2_score(y_train, train_predictions)

rmsle_test = rmsle(y_valid, test_predictions)
r2_test = r2_score(y_valid, test_predictions)

# Mostrar resultados
print(f"Stacking Model - RMSLE Train: {rmsle_train:.2f}, R2 Train: {r2_train:.2f}")
print(f"Stacking Model - RMSLE Test: {rmsle_test:.2f}, R2 Test: {r2_test:.2f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2478
[LightGBM] [Info] Number of data points in the train set: 1166, number of used features: 70
[LightGBM] [Info] Start training from score 180976.861063
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2251
[LightGBM] [Info] Number of data points in the train set: 932, number of used features: 69
[LightGBM] [Info] Start training from score 181194.222103
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2247
[LightGBM] [Info] Number of data points in the train set: 933, number of used features: 67
[LightGBM] [Info] Star

Se usa un stacking con los modelos que mejor han funcionado como modelos base y XGBBoost como metamodelo para captar las realciones no lineales de forma eficiente. Los resultados son interesantes porque si bien bajamos ligeramente en test, el sobreajuste es considerablemente menor. La parte negativa es que un stacking es mucho más costoso de optimizar que un modelo simple.

### Red Neuronal

In [10]:
# Separar las características y la variable objetivo
X = cleaned_data.drop(columns=['SalePrice'])
y = cleaned_data['SalePrice']

# Dividir en conjuntos de entrenamiento y validación
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalización de los datos
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)


model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),  # Capa de entrada
    layers.Dense(64, activation='relu'),  # Capa oculta
    layers.Dense(32, activation='relu'),  # Capa oculta
    layers.Dense(1)  # Capa de salida
])

# Compilar el modelo
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Entrenar el modelo
history = model.fit(X_train_scaled, y_train, 
                    epochs=100,  
                    batch_size=32,  
                    validation_data=(X_valid_scaled, y_valid),  
                    verbose=1)  

# Hacer predicciones
train_predictions = model.predict(X_train_scaled).flatten()
valid_predictions = model.predict(X_valid_scaled).flatten()

# Definir función para RMSLE
def rmsle(y_true, y_pred):
    y_true = np.maximum(0, y_true)  # Asegurarse de que los valores sean no negativos
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Calcular RMSLE y R²
rmsle_train = rmsle(y_train, train_predictions)
r2_train = r2_score(y_train, train_predictions)

rmsle_test = rmsle(y_valid, valid_predictions)
r2_test = r2_score(y_valid, valid_predictions)

# Mostrar resultados
print(f'RMSLE Train: {rmsle_train:.2f}, R² Train: {r2_train:.2f}')
print(f'RMSLE Test: {rmsle_test:.2f}, R² Test: {r2_test:.2f}')



Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoc

Al tratarse de un dataset complejo con relaciones no lineales era obligado probar también una red neuronal, de forma similar aunque más acentuada los resultados de test son inferiores a Gradient Boosting pero el sobeajuste es menor. En este caso las métricas de test son mejores que las de test, lo que significa que el modelo generaliza muy bien.