## Modelado 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

In [None]:
# Load the dataset from the CSV file located next to the notebook
data = pd.read_csv(filepath_or_buffer='dataset6.csv')

In [None]:
# Separate the target variable from the features:
# `y` is the 'total_alquileres' column, `X` is every remaining column.
y = data['total_alquileres']
X = data.drop(columns=['total_alquileres'])

In [None]:
# Inspect the distribution of the target variable (histogram with a KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(y, bins=50, kde=True, ax=ax)
ax.set_title('Distribución de total_alquileres')
plt.show()

In [None]:
# A logarithmic transform can help when the target distribution is strongly skewed.
# `y_log` holds log(1 + y); it is the target that the train/test split below uses.
y_log = np.log1p(y)

# Same histogram as before, now on the transformed target
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(y_log, bins=50, kde=True, ax=ax)
ax.set_title('Distribución de log(total_alquileres + 1)')
plt.show()

## División de datos

In [None]:
# Split into training and test sets (80/20, fixed seed for reproducibility).
# The target passed here is `y_log`, i.e. the log1p-transformed target.
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

# The frames returned by train_test_split are row selections of `X`; assigning
# columns into them can trigger pandas' SettingWithCopyWarning. Take explicit
# copies so the scaling below operates on independent frames and `X` stays intact.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale the numeric features only (the one-hot encoded columns are left as they are).
numeric_cols = ['temperatura', 'sensacion_termica', 'humedad', 'velocidad_viento']
scaler = StandardScaler()
# Fit the scaler on the training split only, then reuse its statistics on the
# test split so no information from the test data leaks into the scaling.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

## Modelado

In [None]:
# Ordinary least-squares linear regression baseline
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Evaluation metrics on the test split. `y_test` comes from `y_log`, so these
# values are expressed on the log1p scale of the target.
mse_lr, mae_lr, r2_lr = (
    metric(y_test, y_pred_lr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Regresión Lineal - MSE: {mse_lr:.2f}, MAE: {mae_lr:.2f}, R2: {r2_lr:.2f}")

In [None]:
# Ridge regression with a grid search over the regularization strength `alpha`
ridge = Ridge()
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
grid_ridge = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
).fit(X_train, y_train)

# Predict on the test split with the best estimator found by the search
best_ridge = grid_ridge.best_estimator_
y_pred_ridge = best_ridge.predict(X_test)

# Test-split metrics (on the log1p scale of the target, since y_test comes from y_log)
mse_ridge, mae_ridge, r2_ridge = (
    metric(y_test, y_pred_ridge)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Ridge Regression - MSE: {mse_ridge:.2f}, MAE: {mae_ridge:.2f}, R2: {r2_ridge:.2f}")
print(f"Mejor alpha: {grid_ridge.best_params_}")

In [None]:
# Lasso regression with a grid search over the regularization strength `alpha`
lasso = Lasso()
param_grid = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]}
grid_lasso = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
).fit(X_train, y_train)

# Predict on the test split with the best estimator found by the search
best_lasso = grid_lasso.best_estimator_
y_pred_lasso = best_lasso.predict(X_test)

# Test-split metrics (on the log1p scale of the target, since y_test comes from y_log)
mse_lasso, mae_lasso, r2_lasso = (
    metric(y_test, y_pred_lasso)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Lasso Regression - MSE: {mse_lasso:.2f}, MAE: {mae_lasso:.2f}, R2: {r2_lasso:.2f}")
print(f"Mejor alpha: {grid_lasso.best_params_}")

In [None]:
# Random forest regressor tuned with a small grid search (2 x 2 x 2 combinations)
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # use all available cores for the search
).fit(X_train, y_train)

# Predict on the test split with the best estimator found by the search
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Test-split metrics (on the log1p scale of the target, since y_test comes from y_log)
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Random Forest - MSE: {mse_rf:.2f}, MAE: {mae_rf:.2f}, R2: {r2_rf:.2f}")
print(f"Mejores parámetros: {grid_rf.best_params_}")

# Feature importances of the best forest, sorted from most to least important
feature_importances = pd.DataFrame(
    {'importance': best_rf.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)
print(feature_importances.head(10))