# Transformação para rede neural

Baseada em exemplo do Kaggle

## Preparando o ambiente

In [134]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
import tensorflow as tf

In [135]:
SEED = 1002
tf.random.set_seed(1002)
np.random.seed(1002)

## Carregando os dados

In [87]:
train = pd.read_csv("https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/original/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/original/test.csv")
gt = pd.read_csv('https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/original/ground_truth.csv')

Adicionando coluna para identificar os registros como treino e teste e juntando os dados para fazer o merge junto.

> Realizar o tratamento de dados com o dataset único pode ser problemático para alguns datasets pois dados de teste podem "vazar" para os dados de treino.

In [88]:
train['Type'] = 'train'
test['Type'] = 'test'

data = train.append(test)

  data = train.append(test)


In [89]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Type
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train


## Tratamento dos dados faltantes e _feature engineering_

### `Title`

In [90]:
data['Title'] = data['Name']

# Cleaning name and extracting Title using regex
for name_string in data['Name']:
    data['Title'] = data['Name'].str.extract('([A-Za-z]+)\.', expand=True)

Trocando títulos mais raros por títulos mais comuns: isso exige um conhecimento maior da própria lingua inglesa e seus respectivos pronomes de tratamento. Provavelmente aqui teve um dos maiores ganhos dessa transformação em relação às feitas por mim.

In [91]:
mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs', 'Major': 'Other', 
           'Col': 'Other', 'Dr' : 'Other', 'Rev' : 'Other', 'Capt': 'Other', 
           'Jonkheer': 'Royal', 'Sir': 'Royal', 'Lady': 'Royal', 
           'Don': 'Royal', 'Countess': 'Royal', 'Dona': 'Royal'}
           
data.replace({'Title': mapping}, inplace=True)
titles = ['Miss', 'Mr', 'Mrs', 'Royal', 'Other', 'Master']

### `Age`

Idades faltantes são substituídas pela mediana do título - semelhante ao que eu fiz nas primeiras análises.

In [92]:
for title in titles:
    age_to_impute = data.groupby('Title')['Age'].median()[titles.index(title)]
    data.loc[(data['Age'].isnull()) & (data['Title'] == title), 'Age'] = age_to_impute

Abaixo também a informação para considerar como criança ou não, o que como foi visto é relevante. Talvez o relevante seja isso e não toda a faixa etária como eu fiz.

In [93]:
data.loc[:,'Child'] = 1
data.loc[(data['Age'] >= 18),'Child'] =0

### Informações da família (`SibSp` e `Parch`)

Número de pessoas na família e a informação se estava sozinho ou não foi corretamente entendida como relevante nas minhas análises, mas a melhora aqui está em criar uma classificação entre famílias maiores e menores. **Existe alguma diferença nos dados com relação a isso?**

#### `Family_Size`

In [94]:
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1
data.loc[:,'FsizeD'] = 'Alone'
data.loc[(data['Family_Size'] > 1),'FsizeD'] = 'Small'
data.loc[(data['Family_Size'] > 4),'FsizeD'] = 'Big'

#### `Last_Name` e `Family_Survival`

Muito interessante considerar a sobrevivência pelo sobrenome. Isso pode ser encontrado também [neste notebook](https://www.kaggle.com/konstantinmasich/titanic-0-82-0-83).

In [95]:
data['Last_Name'] = data['Name'].apply(lambda x: str.split(x, ",")[0])

In [96]:
DEFAULT_SURVIVAL_VALUE = 0.5
data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
for grp, grp_df in data[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
                               
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            # For each member of the family, we get if someone has survided or not (except him)
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                # If someone has survived, so the value is one
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin == 0.0):
                # If anyone has survived, so the value is zero
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
            # If the lastname is only found at the test data, we impute the value 0.5

Fazendo a mesma coisa mas agrupando por ticket nos casos de sobrenomes diferentes. Isso é realmente uma ótima ideia.

In [97]:
for _, grp_df in data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin == 0.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0

A principal preocupação com relação a esse tratamento é que ele pode "vazar" dados do dataset de treino para o de teste, visto que a sobrevivênvia do dataset de treino pode ser considerado no dataset do de teste.

### `Fare`

Esse é o atributo mais complicado de trabalhar desse dataset depois do ticket. Os dados faltantes foram preenchidos com a mediana da classe, o que faz muito sentido e eu não tinha pensado em fazer.

In [98]:
fa = data[data["Pclass"] == 3]
data['Fare'].fillna(fa['Fare'].median(), inplace = True)

## Encoding e Pre-modeling

Remoção de features desnecesárias e encodificação das features, ou seja, dummies, numericals e outros.

### Removendo features desnecessárias

Interessante que a Cabine, local de embarque, ticket e até mesmo o tamanho da família foram descartados. Será que nenhum deles mesmo é relevante para a sobrevivência?

In [99]:
data = data.drop(columns = ['Age','Cabin','Embarked','Name','Last_Name',
                            'Parch', 'SibSp','Ticket', 'Family_Size'])

### Encoding das features

Muito interessante essa separação pelo número de valores únicos nas features.

In [100]:
target_col = ["Survived"]
id_dataset = ["Type"]

Separando as variáveis categóricas.

In [101]:
cat_cols   = data.nunique()[data.nunique() < 12].keys().tolist()
cat_cols   = [x for x in cat_cols ]
cat_cols

['Survived',
 'Pclass',
 'Sex',
 'Type',
 'Title',
 'Child',
 'FsizeD',
 'Family_Survival']

Colunas numéricas

In [102]:
num_cols = [x for x in data.columns if x not in cat_cols + target_col + id_dataset]

Colunas binárias

In [103]:
bin_cols = data.nunique()[data.nunique() == 2].keys().tolist()

Colunas com mais de dois valores

In [104]:
multi_cols = [i for i in cat_cols if i not in bin_cols]

Encoding colunas binárias

In [105]:
le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])

data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Fare,Type,Title,Child,FsizeD,Family_Survival
0,1,0,3,1,7.25,1,Mr,0,Small,0.5
1,2,1,1,0,71.2833,1,Mrs,0,Small,0.5


Dummy values

In [106]:
data = pd.get_dummies(data = data,columns = multi_cols )

In [107]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Sex,Fare,Type,Child,Pclass_1,Pclass_2,Pclass_3,Title_Master,...,Title_Mr,Title_Mrs,Title_Other,Title_Royal,FsizeD_Alone,FsizeD_Big,FsizeD_Small,Family_Survival_0.0,Family_Survival_0.5,Family_Survival_1.0
0,1,0,1,7.25,1,0,0,0,1,0,...,1,0,0,0,0,0,1,0,1,0
1,2,1,0,71.2833,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0


Normalizando colunas numéricas

In [108]:
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns = num_cols)

df_data_og = data.copy()
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index = True,right_index = True,how = "left")
data = data.drop(columns = ['PassengerId'],axis = 1)

data.head(2)

Unnamed: 0,Survived,Sex,Type,Child,Pclass_1,Pclass_2,Pclass_3,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Title_Royal,FsizeD_Alone,FsizeD_Big,FsizeD_Small,Family_Survival_0.0,Family_Survival_0.5,Family_Survival_1.0,Fare
0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,-0.503176
0,2,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,-0.503176


Separando o target, ou seja, o resultado esperado.

In [109]:
cols = data.columns.tolist()
cols.insert(0, cols.pop(cols.index('Survived')))
data = data.reindex(columns= cols)
data.head(2)

Unnamed: 0,Survived,Sex,Type,Child,Pclass_1,Pclass_2,Pclass_3,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Title_Royal,FsizeD_Alone,FsizeD_Big,FsizeD_Small,Family_Survival_0.0,Family_Survival_0.5,Family_Survival_1.0,Fare
0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,-0.503176
0,2,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,-0.503176


## Separando treino e teste

In [129]:
train = data[data['Type'] == 1].drop(columns = ['Type'])
test = data[data['Type'] == 0].drop(columns = ['Type'])

X_train = train.iloc[:, 1:20].values
y_train = train.iloc[:,0].values

## Treinando a rede neural

In [126]:
def create_baseline():
    model = Sequential()
    model.add(Dense(13, input_dim = 18, activation = 'relu'))
    model.add(Dropout(0.2))
    model.add(Dense(8, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [127]:
estimator = KerasClassifier(build_fn = create_baseline, epochs = 20, batch_size = 10, verbose = 1)
kfold = StratifiedKFold(n_splits = 5, shuffle = False)
results = cross_val_score(estimator, X_train, y_train, cv = kfold)

  estimator = KerasClassifier(build_fn = create_baseline, epochs = 20, batch_size = 10, verbose = 1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

In [128]:
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 84.96% (1.64%)


In [130]:
X_test = test.iloc[:, 1:20].values
estimator.fit(X_train, y_train, epochs = 20, batch_size = 10)
prediction = estimator.predict(X_test).tolist()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Criando predição dos dados de teste

In [131]:
se = pd.Series(prediction)
# Creating new column of predictions in data_check dataframe
test['check'] = se
test['check'] = test['check'].str.get(0)

series = []
for val in test.check:
    if val >= 0.5:
        series.append(1)
    else:
        series.append(0)
test['final'] = series

In [132]:
match = 0
nomatch = 0
for val in test.values:
    if val[1] == val[3]:
        match = match +1
    else:
        nomatch = nomatch +1

Validando com os dados de teste baseado no ground truth

In [133]:
test['Survived'] = test['final']
predictions = test['final'].tolist()

print(classification_report(gt['Survived'], predictions))
print(accuracy_score(gt['Survived'], predictions))
print(confusion_matrix(gt['Survived'], predictions))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       260
           1       0.78      0.68      0.73       158

    accuracy                           0.81       418
   macro avg       0.80      0.78      0.79       418
weighted avg       0.80      0.81      0.80       418

0.80622009569378
[[229  31]
 [ 50 108]]


Salvando predições em CSV no formato esperado

In [119]:
validation = pd.read_csv("https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/original/test.csv")
validation['Survived'] = test['Survived']
temp = validation[['PassengerId', 'Survived']]

temp.to_csv("../../data/submissions/keras_ex_nn.csv", index = False)

## Salvando os dados processados

In [120]:
train.head()

Unnamed: 0,Survived,Sex,Child,Pclass_1,Pclass_2,Pclass_3,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Title_Royal,FsizeD_Alone,FsizeD_Big,FsizeD_Small,Family_Survival_0.0,Family_Survival_0.5,Family_Survival_1.0,Fare
0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,-0.503176
1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0.734809
2,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,-0.490126
3,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0.383263
4,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,-0.487709


In [121]:
test.head()

Unnamed: 0,Survived,Sex,Child,Pclass_1,Pclass_2,Pclass_3,Title_Master,Title_Miss,Title_Mr,Title_Mrs,...,Title_Royal,FsizeD_Alone,FsizeD_Big,FsizeD_Small,Family_Survival_0.0,Family_Survival_0.5,Family_Survival_1.0,Fare,check,final
0,0,1,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,-0.503176,0,0
1,1,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,1,0,0.734809,1,1
2,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,0,1,0,-0.490126,0,0
3,0,1,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0.383263,0,0
4,1,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,0,1,-0.487709,1,1


In [124]:
test_data = test.drop(columns=['Survived', 'check', 'final'])
test_data['PassengerId'] = validation['PassengerId']

In [125]:
train.to_csv("../../data/processed_v6/train.csv", index = False)
test_data.to_csv("../../data/processed_v6/test.csv", index = False)

## Validando o treino com o RandomForest

In [136]:
rfParams = {"criterion":["gini","entropy"],
             "n_estimators":[10, 20, 50, 100, 180, 200],
             "min_samples_leaf":[1, 2, 3, 4],
             "min_samples_split":np.arange(3,8), 
             "max_features":["sqrt", "auto", "log2"],
             "n_jobs": [-1],
             "random_state":[SEED]}

In [137]:
def tune_model_params(model, params, x, y):
    gridsearch = GridSearchCV(model, params, scoring='accuracy', n_jobs=-1, cv=10, verbose=4, return_train_score=True)
    gridsearch.fit(x, y)
    best_params, best_score = gridsearch.best_params_, round(gridsearch.best_score_*100, 2)
    return best_params, best_score

In [138]:
best_params, best_score = tune_model_params(RandomForestClassifier(), rfParams, X_train, y_train)

print("Best params: ", best_params)
print("Best score: ", best_score)

Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Best params:  {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 10, 'n_jobs': -1, 'random_state': 1002}
Best score:  85.18


In [139]:
rf = RandomForestClassifier(**best_params)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

print("RANDOM FOREST \n")
print(classification_report(gt['Survived'], predictions))
print(accuracy_score(gt['Survived'], predictions))
print(confusion_matrix(gt['Survived'], predictions))

RANDOM FOREST 

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       260
           1       0.74      0.66      0.70       158

    accuracy                           0.78       418
   macro avg       0.77      0.76      0.77       418
weighted avg       0.78      0.78      0.78       418

0.784688995215311
[[224  36]
 [ 54 104]]


O resultado com esse dataset foi ligeramente inferior ao resultado com o dataset que eu tinha feito para a RandomForest. O caso aqui foi uma questão de boa transformação de dados com um modelo mais robusto.