In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, RobustScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
CAT_COLUMN_NAMES = ['family_size', 'embarked', 'gender', 'pclass', 'title', 'is_alone']
CONT_COLUMN_NAMES = ['age', 'ticket_price']
QUANTILE_SIZE = 10
TAR_COLUMN_NAMES = ['survived']

In [3]:
x_train, y_train = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Some features considered irrelevant are dropped from the beginning
x_train.drop(['boat', 'body', 'home.dest'], axis=1, inplace=True)
x_train.rename(columns={'fare': 'ticket_price', 'sex': 'gender'}, inplace=True)

# Synthetic features are calculated out of the given ones
x_train['family_size'] = x_train['parch'] + x_train['sibsp']
x_train['is_alone'] = np.where(x_train['family_size'] > 1, 0, 1)

# Title feature can be extracted from the name
x_train['title'] = x_train['name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
x_train.loc[x_train['title'] == 'Miss', 'title'] = 'Mrs'
x_train.loc[x_train['title'] == 'Master', 'title'] = 'Mr'
x_train.loc[(x_train['title'] != 'Mrs') & (x_train['title'] != 'Mr'), 'title'] = 'rare'

# Imputación de nulos: más frecuente -> variables categóricas & 'k-vecinos=5 para las continuas'
cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))])
num_transformer = Pipeline(steps=[('imputer', KNNImputer(n_neighbors=5))])
preprocessor = ColumnTransformer(
    transformers=[('num', num_transformer, CONT_COLUMN_NAMES), ('cat', cat_transformer, CAT_COLUMN_NAMES)])
preprocessor.fit(x_train)
x_train_prepro = preprocessor.transform(x_train)


# Once the features have been preprocessed, the target column is vertically appended
concat_train_prepro = np.concatenate((x_train_prepro, np.expand_dims(y_train.values, axis=1)), axis=1).tolist()

# Build Datafrane with Features + y_true
df_train = pd.DataFrame(concat_train_prepro, columns=CONT_COLUMN_NAMES + CAT_COLUMN_NAMES + TAR_COLUMN_NAMES)
columns_features = df_train.columns.drop('survived').to_list()

# float to int
df_train['pclass'] = df_train['pclass'].astype(int)
df_train['family_size'] = df_train['family_size'].astype(int)

df_train = df_train[['gender', 'title', 'age', 'family_size', 'is_alone',
                     'embarked',  'pclass',  'ticket_price', 'survived']]

# Escritura del Dataset
df_train.to_csv('titanic.csv', index_label='passenger_id', header=True)

df_train.sample(10)

Unnamed: 0,gender,title,age,family_size,is_alone,embarked,pclass,ticket_price,survived
116,female,Mrs,60.0,5,0,S,1,263.0,1
1220,male,Mr,32.0,0,1,S,3,8.05,0
680,male,Mr,21.1,0,1,C,3,7.225,0
345,male,Mr,23.0,0,1,S,2,13.0,0
465,female,Mrs,23.0,0,1,C,2,13.7917,1
752,male,Mr,24.0,2,0,S,3,24.15,0
1216,female,Mrs,21.4,0,1,Q,3,7.7333,1
646,female,Mrs,38.0,6,0,S,3,31.3875,1
939,female,Mrs,36.0,2,0,S,3,12.1833,0
219,male,Mr,49.8,0,1,C,1,25.7417,1


In [4]:
# family_size transform
df_train['family_size'] = df_train['family_size'].apply(lambda x: '=1' if x <= 1
                                                        else ('=2' if (x ==2)
                                                              else ('3-5' if (x >2 and x <= 5)
                                                                    else '>5')))

# Separación de variables categoricas, continuas y target
cat = df_train[CAT_COLUMN_NAMES].values
cont = df_train[CONT_COLUMN_NAMES].values
tar = df_train['survived'].values

# OneHot enconding de las pariables categoricas
enc = OneHotEncoder()
enc.fit(cat)
cat_enc = enc.transform(cat)

# Nombre de las columnas OneHot Encoding
enc_names = CONT_COLUMN_NAMES.copy()
for col_n in CAT_COLUMN_NAMES:
    for v in pd.unique(df_train[col_n]):
        enc_names.append(col_n + '_{}'.format(v) if isinstance(v, str) else col_n + '_{:.1f}'.format(v))
enc_names.append('survived')

# Paso a DataFrame
data = np.concatenate((cont, cat_enc.toarray(), [[i] for i in tar]), axis=1)
df_ohe = pd.DataFrame(data, columns=enc_names)
df_train_features = df_ohe.drop(['survived'], axis=1)
df_train_target = df_ohe['survived'].astype('int')

# Entrenamiento y prediccion del modelo
model = RandomForestClassifier()
model.fit(df_train_features.values, df_train_target.values)
pre_prediction = model.predict(df_train_features.values)
print('INFO: Train accuracy: {:.4f}'.format(accuracy_score(df_train_target.values, pre_prediction)))

# Target column is replaced in the ORIGINAL dataset (not the one with the dummy variables) by the prediction column
df_predict = df_train
df_predict['y_true'] = df_predict['survived']
df_predict['y_predict'] = pre_prediction
df_predict_targets = pd.get_dummies(df_predict['y_predict'], prefix='survived')
df_predict = pd.concat([df_predict.drop(['survived'], axis=1), df_predict_targets], axis=1)

# replace predictions targets names
df_predict.rename(columns={'survived_0': 'NO_SURVIVED', 'survived_1': 'SURVIVED'}, inplace=True)
df_predict.sample(10)

INFO: Train accuracy: 0.9679


Unnamed: 0,gender,title,age,family_size,is_alone,embarked,pclass,ticket_price,y_true,y_predict,NO_SURVIVED,SURVIVED
652,male,Mr,23.0,=1,1,S,3,7.8542,0,0,1,0
1187,female,Mrs,1.0,=2,0,S,3,16.7,1,1,0,1
1081,female,Mrs,34.5,=1,1,Q,3,7.8292,1,1,0,1
829,female,Mrs,16.0,>5,0,S,3,46.9,0,0,1,0
919,male,Mr,18.5,=1,1,C,3,7.2292,0,0,1,0
942,male,Mr,21.1,=1,1,C,3,7.225,0,0,1,0
1204,male,Mr,21.0,=1,1,S,3,7.925,0,0,1,0
1241,male,Mr,30.1,=1,1,C,3,6.4375,0,0,1,0
1002,female,Mrs,21.0,=2,0,Q,3,23.25,1,1,0,1
1123,female,Mrs,24.2,=2,0,C,3,22.3583,1,1,0,1


In [5]:
# Discretizamos la edad
df_predict['age'] = df_predict['age'].apply(lambda x: '<12_years' if x <= 12
                                            else ('12_18_years' if (x >12 and x <= 18)
                                                  else ('18_30_years' if (x >18 and x <= 30)
                                                        else ('30_60_years' if (x >30 and x <= 60)
                                                              else '>60_years'))))

# Discretizamos el precio del billete (high, mid, low)
kbd = KBinsDiscretizer(n_bins=3, encode='ordinal')
df_predict['ticket_price'] = kbd.fit_transform(df_predict[['ticket_price']].values)
df_predict['ticket_price'] = df_predict['ticket_price'].apply(lambda x: 'Low' if x == 0
                                                              else ('Mid' if x == 1
                                                                    else 'High'))
df_predict.sample(10)

Unnamed: 0,gender,title,age,family_size,is_alone,embarked,pclass,ticket_price,y_true,y_predict,NO_SURVIVED,SURVIVED
1260,female,Mrs,12_18_years,=1,1,S,3,Mid,1,1,0,1
443,male,Mr,30_60_years,=2,0,S,2,High,0,0,1,0
275,female,Mrs,30_60_years,=2,0,C,1,High,1,1,0,1
908,female,Mrs,18_30_years,=1,1,S,3,Mid,0,0,1,0
150,male,Mr,30_60_years,=1,1,S,1,Low,0,0,1,0
1130,female,Mrs,12_18_years,=1,1,S,3,Low,0,1,0,1
532,male,rare,30_60_years,=1,1,S,2,Mid,0,0,1,0
26,male,Mr,18_30_years,=1,1,C,1,High,1,1,0,1
1142,male,Mr,<12_years,3-5,0,Q,3,High,0,0,1,0
527,male,Mr,18_30_years,=1,1,S,2,Mid,0,0,1,0


In [6]:
#sns.histplot(data=df_predict, x="is_alone")

In [7]:
df_predict = df_predict[['gender', 'title', 'age', 'family_size', 'is_alone',
                         'embarked',  'pclass',  'ticket_price',
                         'NO_SURVIVED', 'SURVIVED',
                         'y_true', 'y_predict']]
# Escritura del Dataset
df_predict.to_csv('titanic_discretized.csv', index_label='passenger_id', header=True)
df_predict.sample(10)

Unnamed: 0,gender,title,age,family_size,is_alone,embarked,pclass,ticket_price,NO_SURVIVED,SURVIVED,y_true,y_predict
1260,female,Mrs,12_18_years,=1,1,S,3,Mid,0,1,1,1
888,male,Mr,30_60_years,=1,1,S,3,Low,1,0,0,0
428,female,Mrs,18_30_years,=2,0,S,2,Mid,0,1,1,1
631,male,Mr,18_30_years,=1,1,S,3,Low,1,0,0,0
1187,female,Mrs,<12_years,=2,0,S,3,Mid,0,1,1,1
335,male,rare,30_60_years,=1,1,S,2,Mid,1,0,0,0
617,male,Mr,18_30_years,=1,1,S,3,Low,1,0,0,0
1289,male,Mr,18_30_years,=1,1,S,3,Low,1,0,0,0
293,female,Mrs,30_60_years,=1,1,S,1,High,0,1,1,1
1037,female,Mrs,18_30_years,=1,1,C,3,Low,0,1,1,1
