# 10 - DecisionTree + Dummies

Executar o algoritmo de `DecisionTree` utilizando o novo dataset.

## Preparando o ambiente

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, GroupKFold, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, Normalizer, RobustScaler

In [2]:
SEED = 42 # Vida, universo e tudo mais
np.random.seed(SEED)

## Carregando os dados

In [4]:
test = pd.read_csv('https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/processed/test_dummies.csv')
del test['Unnamed: 0']
test.head(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Floor_A,Floor_B,Floor_C,Floor_D,Floor_E,Floor_F,Floor_G,Floor_SC,Floor_T,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
1,1,1,38,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0


In [4]:
treino = pd.read_csv('https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/processed/train_dummies.csv')
del treino['Unnamed: 0']
treino.head(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Floor_A,Floor_B,Floor_C,Floor_D,Floor_E,Floor_F,Floor_G,Floor_SC,Floor_T,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
1,1,1,38,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0


## Separando treino e teste

In [6]:
X = treino.drop(columns=['Survived'])
y = treino['Survived']

In [7]:
treino_x, teste_x, treino_y, teste_y = train_test_split(X, y, test_size = 0.2, stratify = y)
print("Treinaremos com %d elementos e testaremos com %d elementos" % (len(treino_x), len(teste_x)))

Treinaremos com 712 elementos e testaremos com 179 elementos


## Executando a `DecisionTree`

In [8]:
def dt_grid_search(treino_x, treino_y, teste_x, teste_y):
    criterion = ["gini", "entropy"]
    max_depth = [2, 3, 5, 7, 9, 11, 13]
    min_samples_split = [2, 8, 16, 32]
    min_samples_leaf = [1, 2, 8, 16, 32]
    resultados = []

    for c in criterion:
        for md in max_depth:
            for ms in min_samples_split:
                for ml in min_samples_leaf:
                    modelo = DecisionTreeClassifier(criterion = c, max_depth = md, min_samples_split = ms, min_samples_leaf = ml)
                    modelo.fit(treino_x, treino_y)
                    score = modelo.score(teste_x, teste_y)
                    resultados.append({
                        "criterion": c,
                        "max_depth" : md,
                        "min_samples_split": ms,
                        "min_samples_leaf": ml,
                        "score": score
                    })

    return pd.DataFrame(resultados)

In [9]:
resultados = dt_grid_search(treino_x, treino_y, teste_x, teste_y).sort_values(by='score', ascending=False)
resultados

Unnamed: 0,criterion,max_depth,min_samples_split,min_samples_leaf,score
279,entropy,13,32,32,0.782123
187,entropy,5,8,8,0.782123
174,entropy,3,16,32,0.782123
175,entropy,3,32,1,0.782123
74,gini,7,16,32,0.782123
...,...,...,...,...,...
110,gini,11,16,1,0.715084
90,gini,9,16,1,0.715084
70,gini,7,16,1,0.715084
61,gini,7,2,2,0.709497


In [22]:
def dt_grid_search_cross(X, y):
    criterion = ["gini", "entropy"]
    max_depth = [2, 3, 5, 7, 9, 11, 13]
    min_samples_split = [2, 8, 16, 32]
    min_samples_leaf = [1, 2, 8, 16, 32]
    n_splits = [2, 3, 5, 10, 15]
    resultados = []
    
    for c in criterion:
        for md in max_depth:
            for ms in min_samples_split:
                for ml in min_samples_leaf:
                    for sp in n_splits:
                        modelo = DecisionTreeClassifier(criterion = c, max_depth = md, min_samples_split = ms, min_samples_leaf = ml)
                        results = cross_validate(modelo, X, y, cv = GroupKFold(n_splits = sp), groups = X.Age, return_train_score=True)
                        resultados.append({
                            "criterion": c,
                            "max_depth" : md,
                            "min_samples_split": ms,
                            "min_samples_leaf": ml,
                            "test_score": (results['test_score'][0] + results['test_score'][1]) / 2,
                            "train_score": (results['train_score'][0] + results['train_score'][1]) / 2,
                            "n_splits": sp
                        })

    return pd.DataFrame(resultados)

In [23]:
resultados = dt_grid_search_cross(X, y).sort_values(by=['test_score', 'train_score'], ascending=[False, False])
resultados

Unnamed: 0,criterion,max_depth,min_samples_split,min_samples_leaf,test_score,train_score,n_splits
1158,entropy,9,16,2,0.854494,0.864628,10
103,gini,3,2,1,0.849001,0.822827,10
108,gini,3,2,2,0.849001,0.822827,10
128,gini,3,8,1,0.849001,0.822827,10
133,gini,3,8,2,0.849001,0.822827,10
...,...,...,...,...,...,...,...
94,gini,2,32,16,0.711864,0.796875,15
24,gini,2,2,32,0.711864,0.795072,15
49,gini,2,8,32,0.711864,0.795072,15
74,gini,2,16,32,0.711864,0.795072,15


## Treinando o melhor modelo

In [25]:
best_params = resultados.sort_values(by=['test_score'], ascending=[False]).head(1).drop(columns=['test_score', 'train_score', 'n_splits']).to_dict(orient='records')[0]
best_params

{'criterion': 'entropy',
 'max_depth': 9,
 'min_samples_split': 16,
 'min_samples_leaf': 2}

In [27]:
cv = GroupKFold(n_splits = 10)
modelo = DecisionTreeClassifier().set_params(**best_params)
results = cross_validate(modelo, X, y, cv = cv, groups = X.Age, return_train_score=True)
pd.DataFrame(results)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.005405,0.001697,0.808989,0.866584
1,0.008139,0.0,0.9,0.862672
2,0.00797,0.0,0.788889,0.870162
3,0.0,0.007994,0.865169,0.865337
4,0.008017,0.0,0.831461,0.86409
5,0.0,0.0,0.775281,0.862843
6,0.0,0.0,0.786517,0.870324
7,0.015617,0.0,0.730337,0.870324
8,0.015644,0.0,0.842697,0.860349
9,0.0,0.01554,0.818182,0.86675


In [None]:
modelo.predict()