# Transformação para rede neural

A ideia é transformar os dados em valores binários ou em valores no intervalo de 0 a 1, com base na transformação v4, para prepará-los para uso em uma rede neural.

## Preparando o ambiente

In [21]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display
import numpy as np

## Carregando os dados

In [22]:
# Read the four CSVs in a fixed order: the raw Kaggle train/test files first,
# then the processed_v4 train/test files this notebook transforms.
original_train, original_test, train, test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/original/train.csv',
        'https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/original/test.csv',
        'https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/processed_v4/train.csv',
        'https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/processed_v4/test.csv',
    )
)

# Cell output: (rows, columns) of the processed train and test frames.
train.shape, test.shape

((891, 30), (418, 29))

## Analisando os dados

In [23]:
# Preview the first rows of the processed_v4 training frame before transforming it.
train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,FamilySize,Singleton,...,Cabin_F,Cabin_G,Cabin_T,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Survived
0,1,3,2,22.0,1,0,139,7.25,2,0,...,0,0,1,0,0,1,0,0,0,0
1,2,1,1,38.0,1,0,154,71.2833,2,0,...,0,0,0,0,0,0,1,0,0,1
2,3,3,1,26.0,0,0,165,7.925,1,1,...,0,0,1,0,1,0,0,0,0,1
3,4,1,1,35.0,1,0,3,53.1,2,0,...,0,0,0,0,0,0,1,0,0,1
4,5,3,2,35.0,0,0,108,8.05,1,1,...,0,0,1,0,0,1,0,0,0,0


## Pclass

Vamos começar pelo Pclass, transformando-o em variáveis dummy.

In [24]:
# Replace the Pclass column with one indicator column per class value
# (Pclass_<value>), independently for the train and the test frame.
train = (
    train
    .join(pd.get_dummies(train['Pclass'], prefix='Pclass'))
    .drop(columns=['Pclass'])
)
test = (
    test
    .join(pd.get_dummies(test['Pclass'], prefix='Pclass'))
    .drop(columns=['Pclass'])
)

train.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,FamilySize,Singleton,SmallFamily,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Survived,Pclass_1,Pclass_2,Pclass_3
0,1,2,22.0,1,0,139,7.25,2,0,1,...,0,0,1,0,0,0,0,0,0,1
1,2,1,38.0,1,0,154,71.2833,2,0,1,...,0,0,0,1,0,0,1,1,0,0
2,3,1,26.0,0,0,165,7.925,1,1,0,...,0,1,0,0,0,0,1,0,0,1
3,4,1,35.0,1,0,3,53.1,2,0,1,...,0,0,0,1,0,0,1,1,0,0
4,5,2,35.0,0,0,108,8.05,1,1,0,...,0,0,1,0,0,0,0,0,0,1


## Sex

Transformar os valores 1 e 2 de Sex em 0 e 1, respectivamente.

In [25]:
# Re-code Sex from {1, 2} to {0, 1}. Series.map turns any value that is not a
# key of sex_map into NaN.
sex_map = {1: 0, 2: 1}

for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_map)

train.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,FamilySize,Singleton,SmallFamily,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Survived,Pclass_1,Pclass_2,Pclass_3
0,1,1,22.0,1,0,139,7.25,2,0,1,...,0,0,1,0,0,0,0,0,0,1
1,2,0,38.0,1,0,154,71.2833,2,0,1,...,0,0,0,1,0,0,1,1,0,0
2,3,0,26.0,0,0,165,7.925,1,1,0,...,0,1,0,0,0,0,1,0,0,1
3,4,0,35.0,1,0,3,53.1,2,0,1,...,0,0,0,1,0,0,1,1,0,0
4,5,1,35.0,0,0,108,8.05,1,1,0,...,0,0,1,0,0,0,0,0,0,1


## Age

A idade será normalizada para o intervalo de 0 a 1, dividindo-a pelo maior valor encontrado nos dados de treino e de teste.

In [26]:
# Scale Age by the largest age found in either split, so train and test share
# the same scale and the largest value becomes 1.
# Series.max() replaces the builtin max(): the builtin compares elements one by
# one and its result depends on where a NaN sits in the column, whereas
# Series.max() skips missing values.
age_max = pd.concat([train['Age'], test['Age']]).max()
train['Age'] = train['Age'] / age_max
test['Age'] = test['Age'] / age_max

In [27]:
# Preview the first rows of the training frame to check the rescaled Age column.
train.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,FamilySize,Singleton,SmallFamily,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Survived,Pclass_1,Pclass_2,Pclass_3
0,1,1,0.275,1,0,139,7.25,2,0,1,...,0,0,1,0,0,0,0,0,0,1
1,2,0,0.475,1,0,154,71.2833,2,0,1,...,0,0,0,1,0,0,1,1,0,0
2,3,0,0.325,0,0,165,7.925,1,1,0,...,0,1,0,0,0,0,1,0,0,1
3,4,0,0.4375,1,0,3,53.1,2,0,1,...,0,0,0,1,0,0,1,1,0,0
4,5,1,0.4375,0,0,108,8.05,1,1,0,...,0,0,1,0,0,0,0,0,0,1


## Fare

Normalizada da mesma forma que o Age

In [28]:
# Scale Fare by the largest fare found in either split, so train and test share
# the same scale and the largest value becomes 1.
# Series.max() replaces the builtin max(): the builtin's result depends on
# where a NaN sits in the column, whereas Series.max() skips missing values.
fare_max = pd.concat([train['Fare'], test['Fare']]).max()
train['Fare'] = train['Fare'] / fare_max
test['Fare'] = test['Fare'] / fare_max

train.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,FamilySize,Singleton,SmallFamily,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Survived,Pclass_1,Pclass_2,Pclass_3
0,1,1,0.275,1,0,139,0.014151,2,0,1,...,0,0,1,0,0,0,0,0,0,1
1,2,0,0.475,1,0,154,0.139136,2,0,1,...,0,0,0,1,0,0,1,1,0,0
2,3,0,0.325,0,0,165,0.015469,1,1,0,...,0,1,0,0,0,0,1,0,0,1
3,4,0,0.4375,1,0,3,0.103644,2,0,1,...,0,0,0,1,0,0,1,1,0,0
4,5,1,0.4375,0,0,108,0.015713,1,1,0,...,0,0,1,0,0,0,0,0,0,1


## Variáveis de presença de família

Não sei se o melhor seria normalizar ou separar em dummies. Como não é possível alguém estar acompanhado por "meia" pessoa, ou seja, diferentemente de Fare e Age, não se trata de um dado contínuo, provavelmente dummies seriam a melhor opção. No entanto, isso talvez crie colunas demais.

In [29]:
# List the distinct SibSp values present in the training frame.
train['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8], dtype=int64)

In [30]:
# Scale SibSp by the largest count found in either split, so train and test
# share the same scale and the largest value becomes 1.
# Series.max() replaces the builtin max(): the builtin's result depends on
# where a NaN sits in the column, whereas Series.max() skips missing values.
ss_max = pd.concat([train['SibSp'], test['SibSp']]).max()
train['SibSp'] = train['SibSp'] / ss_max
test['SibSp'] = test['SibSp'] / ss_max

train.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,FamilySize,Singleton,SmallFamily,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Survived,Pclass_1,Pclass_2,Pclass_3
0,1,1,0.275,0.125,0,139,0.014151,2,0,1,...,0,0,1,0,0,0,0,0,0,1
1,2,0,0.475,0.125,0,154,0.139136,2,0,1,...,0,0,0,1,0,0,1,1,0,0
2,3,0,0.325,0.0,0,165,0.015469,1,1,0,...,0,1,0,0,0,0,1,0,0,1
3,4,0,0.4375,0.125,0,3,0.103644,2,0,1,...,0,0,0,1,0,0,1,1,0,0
4,5,1,0.4375,0.0,0,108,0.015713,1,1,0,...,0,0,1,0,0,0,0,0,0,1


In [31]:
# Scale Parch by the largest count found in either split, so train and test
# share the same scale and the largest value becomes 1.
# Series.max() replaces the builtin max(): the builtin's result depends on
# where a NaN sits in the column, whereas Series.max() skips missing values.
parch_max = pd.concat([train['Parch'], test['Parch']]).max()
train['Parch'] = train['Parch'] / parch_max
test['Parch'] = test['Parch'] / parch_max

train.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,FamilySize,Singleton,SmallFamily,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Survived,Pclass_1,Pclass_2,Pclass_3
0,1,1,0.275,0.125,0.0,139,0.014151,2,0,1,...,0,0,1,0,0,0,0,0,0,1
1,2,0,0.475,0.125,0.0,154,0.139136,2,0,1,...,0,0,0,1,0,0,1,1,0,0
2,3,0,0.325,0.0,0.0,165,0.015469,1,1,0,...,0,1,0,0,0,0,1,0,0,1
3,4,0,0.4375,0.125,0.0,3,0.103644,2,0,1,...,0,0,0,1,0,0,1,1,0,0
4,5,1,0.4375,0.0,0.0,108,0.015713,1,1,0,...,0,0,1,0,0,0,0,0,0,1


In [32]:
# List the distinct FamilySize values present in the training frame.
train['FamilySize'].unique()

array([ 2,  1,  5,  3,  7,  6,  4,  8, 11], dtype=int64)

In [33]:
# Scale FamilySize by the largest value found in either split, so train and
# test share the same scale and the largest value becomes 1.
# Series.max() replaces the builtin max(): the builtin's result depends on
# where a NaN sits in the column, whereas Series.max() skips missing values.
fs_max = pd.concat([train['FamilySize'], test['FamilySize']]).max()
train['FamilySize'] = train['FamilySize'] / fs_max
test['FamilySize'] = test['FamilySize'] / fs_max

train.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,FamilySize,Singleton,SmallFamily,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Survived,Pclass_1,Pclass_2,Pclass_3
0,1,1,0.275,0.125,0.0,139,0.014151,0.181818,0,1,...,0,0,1,0,0,0,0,0,0,1
1,2,0,0.475,0.125,0.0,154,0.139136,0.181818,0,1,...,0,0,0,1,0,0,1,1,0,0
2,3,0,0.325,0.0,0.0,165,0.015469,0.090909,1,0,...,0,1,0,0,0,0,1,0,0,1
3,4,0,0.4375,0.125,0.0,3,0.103644,0.181818,0,1,...,0,0,0,1,0,0,1,1,0,0
4,5,1,0.4375,0.0,0.0,108,0.015713,0.090909,1,0,...,0,0,1,0,0,0,0,0,0,1


## Ticket

O Ticket é um pouco mais complicado, visto que não possui um padrão muito claro. Por isso, vou carregar os Tickets dos dados originais e usar apenas o primeiro caractere de cada um.

In [34]:
# One-hot encode the first character of the raw ticket string, read from the
# original CSVs, and use it to replace the Ticket column of the processed
# frames. The join is by row index.
# NOTE(review): assumes the processed_v4 frames keep the row order of the
# original files; confirm.
train_ticket_dummies = pd.get_dummies(original_train['Ticket'].str[0:1], prefix='Ticket')
train = train.join(train_ticket_dummies).drop(columns=['Ticket'])

# Align the test dummies to the training dummy columns instead of hard-coding
# the missing prefixes: a prefix absent from test becomes an all-zero column,
# a prefix that exists only in test is dropped, and each column takes the
# dtype of its training counterpart.
test_ticket_dummies = (
    pd.get_dummies(original_test['Ticket'].str[0:1], prefix='Ticket')
    .reindex(columns=train_ticket_dummies.columns, fill_value=0)
    .astype(train_ticket_dummies.dtypes.to_dict())
)
test = test.join(test_ticket_dummies).drop(columns=['Ticket'])

print(train.shape, test.shape)
display(train.head())


(891, 47) (418, 46)


Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,FamilySize,Singleton,SmallFamily,LargeFamily,...,Ticket_7,Ticket_8,Ticket_9,Ticket_A,Ticket_C,Ticket_F,Ticket_L,Ticket_P,Ticket_S,Ticket_W
0,1,1,0.275,0.125,0.0,0.014151,0.181818,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,2,0,0.475,0.125,0.0,0.139136,0.181818,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,3,0,0.325,0.0,0.0,0.015469,0.090909,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,0,0.4375,0.125,0.0,0.103644,0.181818,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,0.4375,0.0,0.0,0.015713,0.090909,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# List the column labels currently in the training frame.
train.columns

Index(['PassengerId', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize',
       'Singleton', 'SmallFamily', 'LargeFamily', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
       'Cabin_F', 'Cabin_G', 'Cabin_T', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty', 'Survived',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Ticket_1', 'Ticket_2', 'Ticket_3',
       'Ticket_4', 'Ticket_5', 'Ticket_6', 'Ticket_7', 'Ticket_8', 'Ticket_9',
       'Ticket_A', 'Ticket_C', 'Ticket_F', 'Ticket_L', 'Ticket_P', 'Ticket_S',
       'Ticket_W'],
      dtype='object')

## Feature Importance

Analisar a importância das features a partir do atual best_model.

In [36]:
# Keyword arguments unpacked into RandomForestClassifier when the model is built.
best_params = dict(
    criterion='gini',
    max_depth=35,
    min_samples_split=4,
    min_samples_leaf=4,
    n_estimators=25,
    random_state=1,
)

In [37]:
# Fit the forest on every column except the passenger identifier and the
# target; the target is the Survived column.
feature_matrix = train.drop(columns=['PassengerId', 'Survived'])
modelo = RandomForestClassifier(**best_params)
modelo.fit(feature_matrix, train['Survived'])

RandomForestClassifier(max_depth=35, min_samples_leaf=4, min_samples_split=4,
                       n_estimators=25, random_state=1)

In [38]:
# Rank the fitted forest's features by importance (descending) and keep only
# those with an importance above zero. reset_index() moves the feature names
# into a column called 'index'.
feature_columns = train.drop(columns=['PassengerId', 'Survived']).columns
features = (
    pd.DataFrame(modelo.feature_importances_, index=feature_columns, columns=['Importance'])
    .sort_values(by='Importance', ascending=False)
    .query('Importance > 0')
    .reset_index()
)

# Test keeps the selected features plus PassengerId; train additionally keeps
# the Survived target.
to_keep = features['index'].tolist() + ['PassengerId']
test = test[to_keep]
to_keep = to_keep + ['Survived']
train = train[to_keep]

train.head()


Unnamed: 0,Title_Mr,Sex,Age,Title_Mrs,Fare,Pclass_3,Cabin_T,SmallFamily,Title_Miss,Pclass_1,...,Ticket_C,Cabin_A,Cabin_F,Title_Royalty,Ticket_7,Ticket_F,Ticket_6,Ticket_4,PassengerId,Survived
0,1,1,0.275,0,0.014151,1,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0.475,1,0.139136,0,0,1,0,1,...,0,0,0,0,0,0,0,0,2,1
2,0,0,0.325,0,0.015469,1,1,0,1,0,...,0,0,0,0,0,0,0,0,3,1
3,0,0,0.4375,1,0.103644,0,0,1,0,1,...,0,0,0,0,0,0,0,0,4,1
4,1,1,0.4375,0,0.015713,1,1,0,0,0,...,0,0,0,0,0,0,0,0,5,0


In [40]:
# Write both transformed frames to the processed_v5_1 data folder, without the
# row index.
for frame, destination in (
    (train, '../../data/processed_v5_1/train.csv'),
    (test, '../../data/processed_v5_1/test.csv'),
):
    frame.to_csv(destination, index=False)