<a href="https://colab.research.google.com/github/TNK443/RecPadroes/blob/main/05_ArvoreDeDecisao_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy  as np

In [2]:
# CRIANDO UM PIPELINE COMPLETO PARA PREPROCESSAMENTO.
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

X = train.drop('Survived', axis=1)
y = train['Survived']

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# ----------------------------------------------------------------------------------------------------
class AtributosDesejados(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['PassengerId','Name','Ticket','Cabin']
        return self
    
    def transform(self, X, y=None):
        return X.drop(self.colunasIndesejadas, axis=1)
# ----------------------------------------------------------------------------------------------------
class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    
    def transform(self, X, y=None):
        return X[self.colunasNumericas]
# ----------------------------------------------------------------------------------------------------
class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    
    def transform(self, X, y=None):
        return X[self.colunasCategoricas]
# ----------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------
# PIPELINES #

pipenum = Pipeline([
                    ('atributos_numericos', AtributosNumericos()),
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                   ])

pipecat = Pipeline([
                    ('atributos_categoricos', AtributosCategoricos()),
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('encoder', OneHotEncoder())
                   ])
# ----------------------------------------------------------------------------------------------------
#FeatureUnion# 
uneCaracteristicas = FeatureUnion([
                                   ('pipenum', pipenum),
                                   ('pipecat', pipecat)
                                  ])
# ----------------------------------------------------------------------------------------------------
preproc = Pipeline([
                    ('atributos_desejados',AtributosDesejados()),
                    ('unecaracteristicas', uneCaracteristicas)
                   ])

pipetotal = Pipeline([
                      ('preproc', preproc),
                      ('arvore', DecisionTreeClassifier())
                    ])
# ----------------------------------------------------------------------------------------------------

In [4]:
# ----------------------------------------------------------------------------------------------------
parametros = {
    'arvore__max_depth': [5],         #list(range(1,20,2)),
    'arvore__criterion': ['entropy'], #['gini','entropy'],
    'arvore__min_samples_split': [7], #list(range(2,10,1)),
    'arvore__min_samples_leaf': [1],  #list(range(1,10,1)),
    'arvore__splitter': ['random']    #['best', 'random']
}
# TESTEI DIVERSOS NOVOS PARAMETROS, PARA CONSEGUIR UM SCORE MELHOR.
modelo = GridSearchCV(pipetotal, param_grid=parametros)
# ----------------------------------------------------------------------------------------------------


In [70]:
modelo.best_params_

{'arvore__criterion': 'entropy',
 'arvore__max_depth': 5,
 'arvore__min_samples_leaf': 1,
 'arvore__min_samples_split': 7,
 'arvore__splitter': 'random'}

# **GO!!!**

In [68]:
# ----------------------------------------------------------------------------------------------------
modelo.fit(X, y)
y_pred = modelo.predict(test)

scores   = cross_validate(modelo, X, y)

print("--------------------------------------------------")
print()
print("Scores:",scores)
print()
print("np.mean:", np.mean(scores['test_score']))
print()
print("--------------------------------------------------")
# ----------------------------------------------------------------------------------------------------

--------------------------------------------------

Scores: {'fit_time': array([0.13632178, 0.13650608, 0.13715315, 0.13843346, 0.12632465]), 'score_time': array([0.00852394, 0.00573492, 0.00572705, 0.00579906, 0.00547409]), 'test_score': array([0.76536313, 0.80337079, 0.83707865, 0.79775281, 0.83707865])}

np.mean: 0.8081288054736049

--------------------------------------------------


In [72]:
modelo.cv_results_

{'mean_fit_time': array([0.01524286]),
 'mean_score_time': array([0.00588303]),
 'mean_test_score': array([0.81375306]),
 'param_arvore__criterion': masked_array(data=['entropy'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_arvore__max_depth': masked_array(data=[5],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_arvore__min_samples_leaf': masked_array(data=[1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_arvore__min_samples_split': masked_array(data=[7],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_arvore__splitter': masked_array(data=['random'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'arvore__criterion': 'entropy',
   'arvore__max_depth': 5,
   'arvore__min_samples_leaf': 1,
   'arvore__min_samples_split': 7,
   'arvore__splitter': 'random'}],
 'rank_test_score':

# SUBMISSION

In [None]:
# ----------------------------------------------------------------------------------------------------

In [71]:
result = test[['PassengerId']]
result['Survived'] = y_pred
result.to_csv('submission.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# ----------------------------------------------------------------------------------------------------