In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data from the working directory.
# NOTE(review): the preview below shows an `Unnamed: 0` column that looks like a
# saved row index -- confirm whether it should be used as a feature downstream.
TRAIN_CSV = 'train.csv'
df = pd.read_csv(TRAIN_CSV)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Preprocesamiento

In [14]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Route columns by dtype: numeric dtypes to one branch, everything else to the other.
num_cols = make_column_selector(dtype_include=np.number)
cat_cols = make_column_selector(dtype_exclude=np.number)

# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# Non-numeric branch: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit do not raise at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Combine both branches into a single column transformer (numeric branch first).
branches = [(num_pipeline, num_cols), (cat_pipeline, cat_cols)]
preprocessor = make_column_transformer(*branches)

# Implementación

## Elección de modelo

In [None]:
!pip install lazypredict

In [None]:
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier

from sklearn.base import clone

# Split into train and test sets (20% held out, fixed seed for reproducibility).
X = df.drop('Survived', axis=1)
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _to_dense_frame(matrix, feature_names, index):
    """Wrap a transformer output in a DataFrame, densifying it first if it is sparse.

    The column transformer may return either a sparse matrix or a dense array,
    so `.toarray()` is only called when the object provides it.
    """
    dense = matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)
    return pd.DataFrame(dense, columns=feature_names, index=index)


# Fit a private copy of the preprocessor so the shared `preprocessor` object
# is not left in a fitted state by this exploratory cell.
lazy_preprocessor = clone(preprocessor)
train_matrix = lazy_preprocessor.fit_transform(X_train)
test_matrix = lazy_preprocessor.transform(X_test)
lazy_feature_names = lazy_preprocessor.get_feature_names_out()

# Keep the original row labels so the features stay aligned with y_train / y_test.
X_train_lazy = _to_dense_frame(train_matrix, lazy_feature_names, X_train.index)
X_test_lazy = _to_dense_frame(test_matrix, lazy_feature_names, X_test.index)

# Benchmark the classifiers bundled with lazypredict and display the ranking table.
lazyclassifier = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazyclassifier.fit(X_train_lazy, X_test_lazy, y_train, y_test)
models

- Gracias a los resultados observados en los modelos: Regresión Logística, K-Nearest Neighbors (KNN), Árbol de Decisión, XGBoost y LightGBM, he decidido enfocarme en el modelo de LGBM
  
  Evaluaciones: 0.81	- 0.80	- 0.80	- 0.81

## Entrenamiento

In [None]:
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline

# Chain the preprocessing with a default-configured LightGBM classifier.
model_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier()),
]
pipeline = Pipeline(steps=model_steps)

# Fit on the training split, then predict the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Print the classification report, followed by the pipeline's score on the test split.
report = classification_report(y_test, y_pred)
print(report)
accuracy = pipeline.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

Observamos unos resultados buenos, pero que se pueden mejorar bastante:
-  Accuracy: 0.7821229050279329


## Optimización de hiperparámetros

In [None]:
# Primero con randomsearch
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from; the `classifier__`
# prefix routes each parameter to the pipeline's classifier step.
param_grid = {
    'classifier__learning_rate': np.arange(0.01, 0.2, 0.01),
    'classifier__max_depth': np.arange(2, 10, 1),
    'classifier__n_estimators': np.arange(10, 100, 10)
}

# Seed the sampler so the same 10 candidates are drawn on every run; without it
# the reported best parameters and score change between executions.
randomsearch = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='roc_auc',
    random_state=42,
)
randomsearch.fit(X_train, y_train)

best_params = randomsearch.best_params_
best_score = randomsearch.best_score_

print("Mejores hiperparametros encontrados:")
print(best_params)
print("Mejor score obtenido:")
print(best_score)

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over a narrower grid of learning_rate, max_depth and n_estimators.
# The learning rates are listed explicitly: np.arange with a non-integer step
# can give an unstable number of elements and inexact values, so a literal
# list keeps the grid (and the printed best parameters) unambiguous.
param_grid = {
    'classifier__learning_rate': [0.04, 0.05, 0.06, 0.07, 0.08, 0.09],
    'classifier__max_depth': [3, 4, 5, 6, 7],
    'classifier__n_estimators': [10, 30]
}

gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='roc_auc')
gridsearch.fit(X_train, y_train)

best_params = gridsearch.best_params_
best_score = gridsearch.best_score_

print("Mejores hiperparametros encontrados:")
print(best_params)
print("Mejor score obtenido:")
print(best_score)

# The score above is cross-validated ROC AUC, which cannot be compared with the
# accuracy reported for the untuned model. Score the refitted best pipeline on
# the held-out split as well, so both models are measured the same way.
test_accuracy = gridsearch.best_estimator_.score(X_test, y_test)
print("Accuracy en test del mejor modelo:")
print(test_accuracy)

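Resultados:
- Mejores hiperparámetros encontrados:
{'classifier__learning_rate': 0.05, 'classifier__max_depth': 4, 'classifier__n_estimators': 30}
- Mejor score obtenido (ROC AUC medio en validación cruzada de 5 particiones sobre el conjunto de entrenamiento):
  0.8606697986507716

- Modelo antes de optimizar (accuracy sobre el conjunto de test): 0.7821229050279329

Nota: las dos cifras usan métricas y datos distintos (ROC AUC en validación cruzada frente a accuracy en test), por lo que no son directamente comparables; para cuantificar la mejora hay que evaluar ambos modelos con la misma métrica sobre los mismos datos.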

# Predicciones

In [None]:
from sklearn.base import clone

# Build the final model as an unfitted copy of the best pipeline found by the
# grid search. This keeps the hyperparameters in sync with the search result
# instead of hand-copying them, and gives the final model its own preprocessor
# so that fitting it does not refit the one inside the earlier `pipeline`.
pipeline_opt = clone(gridsearch.best_estimator_)

# Data: every labelled row for training, plus the unlabelled test file.
X = df.drop('Survived', axis=1)
y = df['Survived']
X_test_final = pd.read_csv('test.csv')

# Train on the full labelled dataset.
pipeline_opt.fit(X, y)

# Predict the unlabelled test file.
y_pred_opt = pipeline_opt.predict(X_test_final)

# Show only a short preview rather than the whole prediction array.
y_pred_opt[:10]

# Exportamos resultados

In [39]:
# Pair each test passenger id with its predicted label.
output = X_test_final[['PassengerId']].assign(Survived=y_pred_opt)

# Write the results to CSV without the index column, then confirm on stdout.
RESULTS_CSV = 'resultados.csv'
output.to_csv(RESULTS_CSV, index=False)
print("Archivo 'resultados.csv' creado exitosamente.")

Archivo 'resultados.csv' creado exitosamente.
