In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from datetime import datetime

In [2]:
# Cargar los conjuntos de datos
train_data = pd.read_csv("trainData.csv")
eval_data = pd.read_csv("evalData.csv")

In [3]:
# Limpieza de Datos y Preprocesamiento

# Manejar valores faltantes
train_data['departure_time'] = train_data['departure_time'].fillna(train_data['departure_time'].mode()[0])
eval_data['departure_time'] = eval_data['departure_time'].fillna(eval_data['departure_time'].mode()[0])

In [4]:
# Convertir 'date' y 'departure_time' a tipos de datos de fecha/hora
train_data['date'] = pd.to_datetime(train_data['date'])
eval_data['date'] = pd.to_datetime(eval_data['date'])
train_data['departure_time'] = pd.to_datetime(train_data['departure_time'], format='%H:%M:%S').dt.time
eval_data['departure_time'] = pd.to_datetime(eval_data['departure_time'], format='%H:%M:%S').dt.time


In [5]:
# Extraer características adicionales
train_data['day_of_week'] = train_data['date'].dt.dayofweek
eval_data['day_of_week'] = eval_data['date'].dt.dayofweek
train_data['hour_of_day'] = train_data['departure_time'].apply(lambda x: x.hour)
eval_data['hour_of_day'] = eval_data['departure_time'].apply(lambda x: x.hour)


In [6]:
# Eliminar columna innecesaria
train_data = train_data.drop(columns=['Unnamed: 0'])
eval_data = eval_data.drop(columns=['Unnamed: 0'])


In [7]:
# Preparación de los Datos para los Modelos de Machine Learning

# Dividir los datos
X = train_data.drop(columns=['noshow'])
y = train_data['noshow']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Identificar columnas numéricas y categóricas
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns


In [9]:
# Crear transformadores para variables numéricas y categóricas
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Crear preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])


In [10]:
import optuna

def objective(trial):
    # Definir el espacio de búsqueda de hiperparámetros
    n_neighbors = trial.suggest_int('n_neighbors', 2, 20)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    
    # Crear el modelo con los hiperparámetros sugeridos
    knn_model = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    knn_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('model', knn_model)])
    
    # Entrenar el modelo y evaluar el rendimiento
    knn_pipeline.fit(X_train, y_train)
    y_pred = knn_pipeline.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pred)
    
    return mae

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)  # Puedes ajustar n_trials según tus necesidades

# Resultados
print('Número de pruebas finalizadas:', len(study.trials))
print('Mejores parámetros:', study.best_params)
print('Mejor MAE:', study.best_value)


[I 2023-10-31 13:06:21,898] A new study created in memory with name: no-name-49f0d421-aafd-46c3-95a0-a726db9d0a7b
[I 2023-10-31 14:56:19,963] Trial 0 finished with value: 3.139736796708097 and parameters: {'n_neighbors': 20, 'weights': 'distance'}. Best is trial 0 with value: 3.139736796708097.


In [10]:
# Construcción y Evaluación de Modelos

# Modelo k-Nearest Neighbors (kNN)
knn_model = KNeighborsRegressor()
knn_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', knn_model)])


In [11]:
# Ajustar los hiperparámetros para kNN
param_grid_knn = {
    'model__n_neighbors': [5, 10, 15],
    'model__weights': ['uniform', 'distance']
}
grid_search_knn = GridSearchCV(knn_pipeline, param_grid_knn, cv=3, scoring='neg_mean_absolute_error')
grid_search_knn.fit(X_train, y_train)


In [None]:
# Evaluar el rendimiento de kNN
y_pred_knn = grid_search_knn.predict(X_valid)
mae_knn = mean_absolute_error(y_valid, y_pred_knn)


In [None]:
# Modelo Árboles de Regresión
tree_model = DecisionTreeRegressor()
tree_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('model', tree_model)])

# Ajustar los hiperparámetros para Árboles de Regresión
param_grid_tree = {
    'model__max_depth': [5, 10, 15],
    'model__min_samples_split': [2, 5, 10]
}
grid_search_tree = GridSearchCV(tree_pipeline, param_grid_tree, cv=3, scoring='neg_mean_absolute_error')
grid_search_tree.fit(X_train, y_train)


In [None]:
# Evaluar el rendimiento de Árboles de Regresión
y_pred_tree = grid_search_tree.predict(X_valid)
mae_tree = mean_absolute_error(y_valid, y_pred_tree)


In [None]:
# Resultados
print("Mejores hiperparámetros para kNN:", grid_search_knn.best_params_)
print("MAE para kNN:", mae_knn)
print("Mejores hiperparámetros para Árboles de Regresión:", grid_search_tree.best_params_)
print("MAE para Árboles de Regresión:", mae_tree)


In [None]:
# Realizar Predicciones en el Conjunto de Evaluación
final_model = grid_search_knn if mae_knn < mae_tree else grid_search_tree
predictions = final_model.predict(eval_data)

# Guardar las Predicciones en un Archivo CSV
np.savetxt("predictions.csv", predictions, delimiter=",")