In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer, RobustScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
CAT_COLUMN_NAMES = ['family_size', 'embarked', 'sex', 'pclass', 'title', 'is_alone']
CONT_COLUMN_NAMES = ['age', 'ticket_price']
QUANTILE_SIZE = 10
TAR_COLUMN_NAMES = ['survived']

In [3]:
x_train, y_train = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Some features considered irrelevant are dropped from the beginning
x_train.drop(['boat', 'body', 'home.dest'], axis=1, inplace=True)
x_train.rename(columns={'fare': 'ticket_price'}, inplace=True)

# Synthetic features are calculated out of the given ones
x_train['family_size'] = x_train['parch'] + x_train['sibsp']
x_train['is_alone'] = np.where(x_train['family_size'] > 1, 0, 1)

# Title feature can be extracted from the name
x_train['title'] = x_train['name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
x_train.loc[x_train['title'] == 'Miss', 'title'] = 'Mrs'
x_train.loc[x_train['title'] == 'Master', 'title'] = 'Mr'
x_train.loc[(x_train['title'] != 'Mrs') & (x_train['title'] != 'Mr'), 'title'] = 'rare'

# Imputación de nulos: más frecuente -> variables categóricas & 'k-vecinos=5 para las continuas'
cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))])
num_transformer = Pipeline(steps=[('imputer', KNNImputer(n_neighbors=5))])
preprocessor = ColumnTransformer(
    transformers=[('num', num_transformer, CONT_COLUMN_NAMES), ('cat', cat_transformer, CAT_COLUMN_NAMES)])
preprocessor.fit(x_train)
x_train_prepro = preprocessor.transform(x_train)


# Once the features have been preprocessed, the target column is vertically appended
concat_train_prepro = np.concatenate((x_train_prepro, np.expand_dims(y_train.values, axis=1)), axis=1).tolist()

# Build Datafrane with Features + y_true
df_train = pd.DataFrame(concat_train_prepro, columns=CONT_COLUMN_NAMES + CAT_COLUMN_NAMES + TAR_COLUMN_NAMES)
columns_features = df_train.columns.drop('survived').to_list()

# family_size transform
df_train['family_size'] = df_train['family_size'].apply(lambda x: 'Family_1' if x <= 1
                                                        else ('Family_2' if (x ==2)
                                                              else ('Family_3_5' if (x >2 and x <= 5)
                                                                    else 'Family_upper_5')))

df_train.head(15)


Unnamed: 0,age,ticket_price,family_size,embarked,sex,pclass,title,is_alone,survived
0,29.0,211.3375,Family_1,S,female,1.0,Mrs,1,1
1,0.9167,151.55,Family_3_5,S,male,1.0,Mr,0,1
2,2.0,151.55,Family_3_5,S,female,1.0,Mrs,0,0
3,30.0,151.55,Family_3_5,S,male,1.0,Mr,0,0
4,25.0,151.55,Family_3_5,S,female,1.0,Mrs,0,0
5,48.0,26.55,Family_1,S,male,1.0,Mr,1,1
6,63.0,77.9583,Family_1,S,female,1.0,Mrs,1,1
7,39.0,0.0,Family_1,S,male,1.0,Mr,1,0
8,53.0,51.4792,Family_2,S,female,1.0,Mrs,0,1
9,71.0,49.5042,Family_1,C,male,1.0,Mr,1,0


In [4]:
# Separación de variables categoricas, continuas y target
cat = df_train[CAT_COLUMN_NAMES].values
cont = df_train[CONT_COLUMN_NAMES].values
tar = df_train['survived'].values

# OneHot enconding de las pariables categoricas
enc = OneHotEncoder()
enc.fit(cat)
cat_enc = enc.transform(cat)

# Nombre de las columnas OneHot Encoding
enc_names = CONT_COLUMN_NAMES.copy()
for col_n in CAT_COLUMN_NAMES:
    for v in pd.unique(df_train[col_n]):
        enc_names.append(col_n + '_{}'.format(v) if isinstance(v, str) else col_n + '_{:.1f}'.format(v))
enc_names.append('survived')

# Paso a DataFrame
data = np.concatenate((cont, cat_enc.toarray(), [[i] for i in tar]), axis=1)
df_ohe = pd.DataFrame(data, columns=enc_names)
df_train_features = df_ohe.drop(['survived'], axis=1)
df_train_target = df_ohe['survived'].astype('int')

# Entrenamiento y prediccion del modelo
model = RandomForestClassifier()
model.fit(df_train_features.values, df_train_target.values)
pre_prediction = model.predict(df_train_features.values)
print('INFO: Train accuracy: {:.4f}'.format(accuracy_score(df_train_target.values, pre_prediction)))

# Target column is replaced in the ORIGINAL dataset (not the one with the dummy variables) by the prediction column
df_predict = df_train
df_predict['y_true'] = df_predict['survived']
df_predict['y_predict'] = pre_prediction
df_predict_targets = pd.get_dummies(df_predict['y_predict'], prefix='survived')
df_predict = pd.concat([df_predict.drop(['survived'], axis=1), df_predict_targets], axis=1)

# replace predictions targets names
df_predict.rename(columns={'survived_0': 'NO_SURVIVED', 'survived_1': 'SURVIVED'}, inplace=True)
df_predict.sample(10)

INFO: Train accuracy: 0.9679


Unnamed: 0,age,ticket_price,family_size,embarked,sex,pclass,title,is_alone,y_true,y_predict,NO_SURVIVED,SURVIVED
171,46.0,26.0,Family_1,S,male,1.0,Mr,1,0,0,1,0
184,35.3,27.7208,Family_1,C,male,1.0,Mr,1,0,0,1,0
575,27.0,21.0,Family_1,S,female,2.0,Mrs,1,0,0,1,0
1160,50.0,8.05,Family_1,S,male,3.0,Mr,1,0,0,1,0
441,55.0,16.0,Family_1,S,female,2.0,Mrs,1,1,1,0,1
98,48.0,106.425,Family_1,C,female,1.0,Mrs,1,1,1,0,1
458,17.0,10.5,Family_1,S,female,2.0,Mrs,1,1,1,0,1
460,24.0,27.0,Family_3_5,S,female,2.0,Mrs,0,1,1,0,1
802,35.6,6.95,Family_1,Q,male,3.0,Mr,1,0,0,1,0
1258,29.0,15.2458,Family_2,C,female,3.0,Mrs,0,1,1,0,1


In [14]:
#Discretización con "criterio" de las variables age y fare
# import seaborn as sns
# sns.histplot(data=df_predict, x="age")

# Discretizamos la edad
# df_predict['age'] = df_predict['age'].apply(lambda x: 'Kids-10' if x <= 10
#                                             else ('Teenager_10_18' if (x >10 and x <= 18)
#                                                   else ('Young_18_25' if (x >18 and x <= 25)
#                                                         else ('Adult_25_65' if (x >18 and x <= 25)
#                                                               else 'Old+65'))))
# Discretizamos 
qt = QuantileTransformer(n_quantiles=5, random_state=0)
df_predict['ticket_price'] = qt.fit_transform(df_predict[['ticket_price']].values)



# df_predict.sample(10)

In [39]:
qt.references_

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [38]:
import numpy as np
from sklearn.preprocessing import quantile_transform
rng = np.random.RandomState(0)
X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
quantile_transform(X, n_quantiles=10, random_state=0, copy=True)


array([[0.        ],
       [0.09871873],
       [0.10643612],
       [0.11754671],
       [0.21017437],
       [0.21945445],
       [0.23498666],
       [0.32443642],
       [0.33333333],
       [0.41360794],
       [0.42339464],
       [0.46257841],
       [0.47112236],
       [0.49834237],
       [0.59986536],
       [0.63390302],
       [0.66666667],
       [0.68873101],
       [0.69611125],
       [0.81280699],
       [0.82160354],
       [0.88126439],
       [0.90516028],
       [0.99319435],
       [1.        ]])

In [33]:
df_predict.sample(10)

Unnamed: 0,age,ticket_price,family_size,embarked,sex,pclass,title,is_alone,y_true,y_predict,NO_SURVIVED,SURVIVED
710,37.0,0.245384,Family_1,Q,female,3.0,Mrs,1,0,0,1,0
1095,23.0,0.241559,Family_1,Q,female,3.0,Mrs,1,0,0,1,0
1065,21.0,0.246967,Family_1,S,male,3.0,Mr,1,0,0,1,0
861,23.0,0.251113,Family_1,S,female,3.0,Mrs,1,0,0,1,0
832,43.0,0.75812,Family_upper_5,S,female,3.0,Mrs,0,0,0,1,0
1051,33.0,0.255878,Family_1,S,male,3.0,Mr,1,0,0,1,0
973,28.6,0.239051,Family_1,S,male,3.0,Mr,1,0,0,1,0
713,21.0,0.244855,Family_1,Q,male,3.0,Mr,1,0,0,1,0
527,28.0,0.34927,Family_1,S,male,2.0,Mr,1,0,0,1,0
71,27.0,0.80483,Family_1,C,male,1.0,Mr,1,0,0,1,0


In [37]:
from sklearn.preprocessing import KBinsDiscretizer
kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
df_predict['ticket_price5'] = qt.fit_transform(df_predict[['ticket_price']].values)
df_predict.sample(10)

Unnamed: 0,age,ticket_price,family_size,embarked,sex,pclass,title,is_alone,y_true,y_predict,NO_SURVIVED,SURVIVED,ticket_price2,ticket_price3,ticket_price4,ticket_price5
19,36.0,0.772849,Family_1,C,male,1.0,Mr,1,0,0,1,0,0.772849,0.772849,0.772849,0.772849
202,36.0,0.675873,Family_1,S,male,1.0,Mr,1,1,1,0,1,0.675873,0.675873,0.675873,0.675873
141,45.0,0.766673,Family_1,C,female,1.0,Mrs,1,1,1,0,1,0.766673,0.766673,0.766673,0.766673
381,30.0,0.444567,Family_1,S,female,2.0,Mrs,1,0,1,0,1,0.444567,0.444567,0.444567,0.444567
182,30.0,0.789055,Family_1,C,female,1.0,Mrs,1,1,1,0,1,0.789055,0.789055,0.789055,0.789055
809,18.0,0.751611,Family_3_5,S,male,3.0,Mr,0,0,0,1,0,0.751611,0.751611,0.751611,0.751611
1096,2.0,0.598402,Family_3_5,S,male,3.0,Mr,0,0,0,1,0,0.598402,0.598402,0.598402,0.598402
527,28.0,0.34927,Family_1,S,male,2.0,Mr,1,0,0,1,0,0.34927,0.34927,0.34927,0.34927
526,29.0,0.477285,Family_1,C,male,2.0,Mr,1,1,1,0,1,0.477285,0.477285,0.477285,0.477285
1068,61.0,0.197494,Family_1,S,male,3.0,Mr,1,0,0,1,0,0.197494,0.197494,0.197494,0.197494


In [32]:
rng = np.random.RandomState(0)
X = rng.normal(loc=0.5, scale=0.25, size=(25, 1))
# X
qt = QuantileTransformer(n_quantiles=5, random_state=0)
qt.fit_transform(X)
qt.references_

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [6]:
df_predict.sample(100)

Unnamed: 0,age,ticket_price,family_size,embarked,sex,pclass,title,is_alone,y_true,y_predict,NO_SURVIVED,SURVIVED
319,Old+65,134.5,Family_1,C,female,1.0,Mrs,1,1,1,0,1
1188,Young_18_25,16.7,Family_2,S,female,3.0,Mrs,0,1,1,0,1
317,Young_18_25,61.3792,Family_1,C,male,1.0,Mr,1,1,1,0,1
528,Old+65,0.0,Family_1,S,male,2.0,Mr,1,0,0,1,0
1046,Young_18_25,7.8958,Family_1,S,male,3.0,Mr,1,0,0,1,0
603,Old+65,20.25,Family_2,S,female,3.0,Mrs,0,1,1,0,1
114,Young_18_25,263.0,Family_3_5,S,male,1.0,Mr,0,0,0,1,0
62,Old+65,61.175,Family_1,S,male,1.0,Mr,1,0,0,1,0
723,Old+65,7.7333,Family_1,Q,male,3.0,Mr,1,0,0,1,0
1240,Kids-10,8.5167,Family_1,C,male,3.0,Mr,1,1,1,0,1
