# Machine Learning en el dataset del Titanic

In [1]:
import pycaret
from pycaret.regression import *
import pandas as pd
import numpy as np
import os
import sys
import json
import pickle


In [2]:
# Load the Titanic dataset from a path relative to the notebook's working
# directory (the file name suggests it was already cleaned upstream — confirm).
titan = pd.read_csv('data/titanic_limpio.csv')

In [3]:
# Preview the frame as loaded, before any column is dropped or encoded.
titan

Unnamed: 0,PASSENGERID,SURVIVED,PCLASS,NAME,SEX,AGE,SIBSP,PARCH,TICKET,FARE,CABIN,EMBARKED
0,1,Dead,Tercera,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Unknown,Southampton
1,2,Alive,Primera,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,Cherburgo
2,3,Alive,Tercera,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Unknown,Southampton
3,4,Alive,Primera,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,Southampton
4,5,Dead,Tercera,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Unknown,Southampton
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,Dead,Segunda,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Unknown,Southampton
887,888,Alive,Primera,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,Southampton
888,889,Dead,Tercera,"Johnston, Miss. Catherine Helen ""Carrie""",female,24.0,1,2,W./C. 6607,23.4500,Unknown,Southampton
889,890,Alive,Primera,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,Cherburgo


In [4]:
# Remove the NAME and TICKET columns before encoding and modelling.
# Note: this cell is not idempotent — running it a second time raises a
# KeyError because the columns are already gone from `titan`.
titan = titan.drop(columns=['NAME', 'TICKET'])

In [5]:
from sklearn.preprocessing import LabelEncoder
def fritas(df: pd.DataFrame) -> tuple:
    """
    Label-encode every object-dtype column of a DataFrame.

    Each object column is cast to ``str`` and fitted with its own
    ``LabelEncoder``. The input DataFrame is not modified; the encoding is
    applied to a copy.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - df_encoded: pandas DataFrame, a copy of ``df`` in which every object
      column has been replaced by its integer codes
    - encoder_info: list of dicts, one per encoded column, with the keys
      'column' (column name), 'labels' (original labels as ``str``) and
      'codes' (matching codes as ``int``). Only plain Python types are used
      so the mapping can be serialised (e.g. with ``json``) and reused to
      decode predictions later.
    """
    df_encoded = df.copy()  # Work on a copy so the caller's frame stays intact
    object_columns = df_encoded.select_dtypes(include=["object"]).columns
    encoder_info = []

    for column in object_columns:
        le = LabelEncoder()  # One independent encoder per column
        df_encoded[column] = le.fit_transform(df_encoded[column].astype(str))
        encoder_info.append({
            'column': column,
            # Convert NumPy scalars to built-in types: NumPy integers are not
            # JSON-serialisable, which made the mapping awkward to persist.
            'labels': [str(label) for label in le.classes_],
            'codes': [int(code) for code in le.transform(le.classes_)],
        })

    return df_encoded, encoder_info

In [6]:
# The categorical variables must be encoded as integers before modelling.
# Keep the label <-> code mapping as well as the encoded frame: codes are
# assigned in sorted label order, so without the mapping the encoded columns
# cannot be decoded later (e.g. in SURVIVED, 'Alive' becomes 0 and 'Dead'
# becomes 1).
# Note: re-running this cell on the already-encoded frame finds no object
# columns and would overwrite `encoder_info` with an empty list.
titan, encoder_info = fritas(titan)

In [7]:
# Inspect the encoded frame: the former text columns now hold integer codes.
titan

Unnamed: 0,PASSENGERID,SURVIVED,PCLASS,SEX,AGE,SIBSP,PARCH,FARE,CABIN,EMBARKED
0,1,1,2,1,22.0,1,0,7.2500,147,2
1,2,0,0,0,38.0,1,0,71.2833,81,0
2,3,0,2,0,26.0,0,0,7.9250,147,2
3,4,0,0,0,35.0,1,0,53.1000,55,2
4,5,1,2,1,35.0,0,0,8.0500,147,2
...,...,...,...,...,...,...,...,...,...,...
886,887,1,1,1,27.0,0,0,13.0000,147,2
887,888,0,0,0,19.0,0,0,30.0000,30,2
888,889,1,2,0,24.0,1,2,23.4500,147,2
889,890,0,0,1,26.0,0,0,30.0000,60,0


In [8]:
# Initialise the PyCaret experiment on the encoded frame with a fixed
# session id and MLflow experiment logging enabled.
# NOTE(review): SURVIVED is a two-valued label, but `setup` comes from
# `pycaret.regression` (see the import cell), so it is modelled as a
# regression target. Consider `pycaret.classification` — that switch has to be
# made in the import cell because the later cells use the same module's
# functions.
# NOTE(review): PASSENGERID is still in `titan` and is therefore used as a
# feature; confirm that this is intended.
s = setup(
    data=titan,
    target='SURVIVED',
    session_id=123,
    log_experiment=True,
    experiment_name='titanic1',
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,SURVIVED
2,Target type,Regression
3,Original data shape,"(891, 10)"
4,Transformed data shape,"(891, 10)"
5,Transformed train set shape,"(623, 10)"
6,Transformed test set shape,"(268, 10)"
7,Numeric features,9
8,Preprocess,True
9,Imputation type,simple


2024/02/05 20:32:09 INFO mlflow.tracking.fluent: Experiment with name 'titanic1' does not exist. Creating a new experiment.


In [9]:
# Benchmark the candidate regressors of the experiment and keep the model
# that compare_models() returns (Gradient Boosting Regressor in the recorded
# run).
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.2705,0.1408,0.3737,0.3954,0.2589,0.2199,0.011
rf,Random Forest Regressor,0.2594,0.1426,0.3761,0.3878,0.2595,0.2241,0.039
ada,AdaBoost Regressor,0.3152,0.1459,0.3809,0.3718,0.262,0.2992,0.005
catboost,CatBoost Regressor,0.2737,0.1467,0.3816,0.3709,0.2634,0.2242,0.188
et,Extra Trees Regressor,0.2513,0.1471,0.3822,0.3683,0.2642,0.2131,0.022
lr,Linear Regression,0.3041,0.1509,0.3874,0.3532,0.2686,0.2508,0.193
ridge,Ridge Regression,0.3048,0.1508,0.3874,0.3532,0.2687,0.2514,0.003
br,Bayesian Ridge,0.3079,0.151,0.3876,0.3528,0.2692,0.2541,0.003
lar,Least Angle Regression,0.3024,0.1516,0.3883,0.3501,0.269,0.2487,0.004
lightgbm,Light Gradient Boosting Machine,0.2834,0.1539,0.3907,0.3398,0.2655,0.24,0.236


In [10]:
# Open PyCaret's interactive evaluation widget for the selected model.
# The output is an ipywidget, so it only renders in a live kernel.
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [11]:
# Score the selected model and keep the resulting predictions frame.
# No `data` argument is passed — presumably PyCaret then scores the hold-out
# split created by setup(); confirm against the PyCaret documentation.
pred_supervivencia = predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.2521,0.12,0.3465,0.4825,0.24,0.2182


In [13]:
# Persist the selected estimator to 'models/titanic_model.pkl'.
# model_only=True stores just the estimator, not PyCaret's preprocessing
# pipeline, so whatever loads this file must feed it data prepared and
# label-encoded exactly as in this notebook.
# Create the target folder first: a fresh checkout may not contain 'models/',
# and writing the pickle into a missing directory fails.
os.makedirs('models', exist_ok=True)
save_model(best, 'models/titanic_model', model_only=True)

Model Successfully Saved


(GradientBoostingRegressor(random_state=123), 'models/titanic_model.pkl')