<a href="https://colab.research.google.com/github/RenanCostaNascimento/mestrado-reconhecimento-padroes/blob/main/Titanic_Melhorado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

O objetivo deste notebook é melhorar o [resultado obtido anteriormente](https://colab.research.google.com/drive/1MGozqv8xQcfvaqDxbev43TKwdBZueQe9) usando como base o [código do professor](https://github.com/fboldt/aulasml/blob/titanic/titanic.ipynb). A seguir está o código do professor; minhas alterações estão marcadas com comentários.

In [39]:
import pandas as pd
import numpy as np

# Read both CSV splits from the working directory and index each frame by
# its PassengerId column.
train, test = (
    pd.read_csv(csv_name).set_index('PassengerId')
    for csv_name in ('train.csv', 'test.csv')
)

In [40]:
# Instead of using the existing features as-is, derive a few new ones.
# Train and test are stacked so that surname groups span both splits; rows that
# come from `test` have no Survived column and therefore carry NaN there.
df = pd.concat([train, test], axis=0, sort=False)

# Title: the part of Name after the first comma, up to the first period.
df['Title'] = df.Name.str.split(',').str[1].str.split('.').str[0].str.strip()
df['IsWomanOrChild'] = (df.Title == 'Master') | (df.Sex == 'female')
# LastName: everything before the first comma of Name; used as the family key.
df['LastName'] = df.Name.str.split(',').str[0]

is_wc = df['IsWomanOrChild']
# Missing labels are counted as 0 in the survivor totals.
survived_or_zero = df['Survived'].fillna(0)
# A passenger's own contribution to the family totals (0 unless woman/child).
own_count = is_wc.astype(int)
own_survived = survived_or_zero.where(is_wc, 0)

# Per-surname totals over woman/child rows only, broadcast back to every row
# of the group.
wc_in_family = own_count.groupby(df['LastName']).transform('sum')
wc_survivors_in_family = own_survived.groupby(df['LastName']).transform('sum')

# Remove each passenger's own row from the totals. This is done with plain
# Series arithmetic: assigning the result of `df.mask(...)` (a whole frame) to
# a single column raises ValueError in current pandas.
df['FamilyTotalCount'] = wc_in_family - own_count
df['FamilySurvivedCount'] = wc_survivors_in_family - own_survived

# Zero totals become NaN so the division yields NaN instead of inf/ZeroDivision.
df['FamilySurvivalRate'] = (
    df['FamilySurvivedCount'] / df['FamilyTotalCount'].replace(0, np.nan)
)
df['IsSingleTraveler'] = df['FamilyTotalCount'] == 0

# NOTE(review): FamilySurvivalRate is built from Survived labels of the whole
# training set before any cross-validation split happens, so CV scores computed
# on these features are likely optimistic -- confirm against a held-out score.
x = pd.concat([
    df['FamilySurvivalRate'].fillna(0),
    df['IsSingleTraveler'],
    # `map` gives an explicit numeric encoding without relying on `replace`
    # downcasting an object column.
    df['Sex'].map({'male': 0, 'female': 1}),
], axis=1)

# Rather than X, y and test, use train_x, test_x and train_y.
train_x, test_x = x.loc[train.index], x.loc[test.index]
train_y = df.Survived.loc[train.index]

In [41]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
from sklearn.pipeline import Pipeline

# Fixed seed so the forest and the outer CV shuffling give the same scores on
# every Restart & Run All.
SEED = 42

# Single-step pipeline; the step name is the prefix of the grid keys below.
pipe = Pipeline([
    ('classificador', RandomForestClassifier(random_state=SEED))
])

parametros = {
    # added a few more depth values
    'classificador__max_depth': [2, 3, 4, 5],
    # added two split criteria
    'classificador__criterion': ['gini', 'entropy']
}
modelo = GridSearchCV(pipe, param_grid=parametros)

# Nested evaluation: cross_validate refits the whole grid search on each
# outer split produced by RepeatedKFold.
scores = cross_validate(
    modelo, train_x, train_y, cv=RepeatedKFold(random_state=SEED)
)
# Cell output: per-split scores, their mean and their standard deviation.
scores['test_score'], np.mean(scores['test_score']), np.std(scores['test_score'])

(array([0.94413408, 0.91573034, 0.92696629, 0.90449438, 0.91573034,
        0.89385475, 0.92134831, 0.92696629, 0.92696629, 0.93820225,
        0.92178771, 0.92134831, 0.91573034, 0.92696629, 0.92134831,
        0.9273743 , 0.91011236, 0.91011236, 0.91011236, 0.91011236,
        0.93854749, 0.91011236, 0.93820225, 0.88764045, 0.93258427,
        0.9273743 , 0.91011236, 0.91573034, 0.91573034, 0.93820225,
        0.93296089, 0.92134831, 0.93258427, 0.92134831, 0.8988764 ,
        0.93854749, 0.92134831, 0.91011236, 0.89325843, 0.88202247,
        0.93854749, 0.90449438, 0.92696629, 0.90449438, 0.92696629,
        0.92178771, 0.92696629, 0.91011236, 0.92696629, 0.92134831]),
 0.9192938296403238,
 0.01379362816512509)

In [49]:
# Refit the grid search on the full training features, predict the test rows,
# and write the predictions to survived.csv. reset_index() turns the test index
# into the first CSV column; index=False drops the new positional index.
modelo.fit(train_x, train_y)
y_pred = modelo.predict(test_x)
submission = pd.DataFrame({'Survived': y_pred.astype(int)}, index=test.index)
submission.reset_index().to_csv('survived.csv', index=False)