# Ajuste de Características

In [1]:
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X = train[list(test.columns)]
y = train[train.columns[~train.columns.isin(test.columns)]]

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

lista_pronomes = ['Mr', 'Miss', 'Mrs', 'Master']

def extraiPronome(nome):
    return nome.split(',')[1].split('.')[0].strip()

def substituiPronome(nome):
    if nome in lista_pronomes:
        return 'outros'
    else:
        return nome

def tamanho(item):
    return len(item)

def tamanhoFamilia(soma):
    if soma == 0:
        return 'sozinho'
    elif soma <= 3:
        return 'pequena'
    else:
        return 'grande'

def idadeNula(idade):
    if idade:
        return 1
    else:
        return 0

def primeiraLetra(ticket):
    return str(ticket)[0]

def ticketSobrevivencia(ticket):
    if ticket in ['1', '2', '3', 'S', 'P', 'C', '9']:
        return 'ticket_alto'
    else:
        return 'ticket_baixo'


class AtributosDesejados(BaseEstimator, TransformerMixin):
    def __init__(self, excluirName=True, excluirTicket=True, excluirCabin=True):
        self.excluirName = excluirName
        self.excluirTicket = excluirTicket
        self.excluirCabin = excluirCabin
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['PassengerId']
        if self.excluirName:
            self.colunasIndesejadas.append('Name')
        if self.excluirTicket:
            self.colunasIndesejadas.append('Ticket')  
        if self.excluirCabin:
            self.colunasIndesejadas.append('Cabin')    
        return self
    def transform(self, X, y=None):
        Xdrop = X.drop(self.colunasIndesejadas,axis=1)
        if 'Name' not in self.colunasIndesejadas:
            Xdrop['Name'] = Xdrop['Name'].apply(extraiPronome)
            lista_pronomes = ['Mr', 'Miss', 'Mrs', 'Master']
            Xdrop['Name'] = Xdrop['Name'].apply(substituiPronome)
            Xdrop['NameTamanho'] = Xdrop['Name'].apply(tamanho) #tamanho do nome
        if 'Ticket' not in self.colunasIndesejadas:
            Xdrop['tamanhoTicket'] = Xdrop['Ticket'].apply(tamanho) #tamanho do ticket
            Xdrop['Ticket'] = Xdrop['Ticket'].apply(primeiraLetra) 
            Xdrop['Ticket'] = Xdrop['Ticket'].apply(ticketSobrevivencia) 
        if 'Cabin' not in self.colunasIndesejadas:
            Xdrop['Cabin'] = Xdrop['Cabin'].apply(primeiraLetra) 
        Xdrop['idadeVazia'] = Xdrop['Age'].apply(idadeNula)
        Xdrop['tamanhoFamilia'] = Xdrop['SibSp'] + Xdrop['Parch']
        Xdrop['tamanhoFamilia'] = Xdrop['tamanhoFamilia'].apply(tamanhoFamilia)
        return Xdrop

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasNumericas].to_numpy()

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasCategoricas].to_numpy()

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

trataAtributos = Pipeline([
    ('unecaracteristicas', FeatureUnion([
        ('pipenum', Pipeline([
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])),
        ('pipecat', Pipeline([
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
])

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np

pipetotal = Pipeline([
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('classificador', RandomForestClassifier())
])

parametros = {
    'atributosDesejados__excluirName': [True, False],
    'atributosDesejados__excluirTicket': [True, False],
    'atributosDesejados__excluirCabin': [True, False],
    'classificador__max_depth': [5],
    'classificador__criterion': ['gini', 'entropy'], 
    #'classificador__min_samples_leaf': [1,5,10],
    #'classificador__min_samples_split': [2, 10, 16],
    #'classificador__n_estimators': [50, 100, 400]
}
modelo = GridSearchCV(pipetotal, param_grid=parametros)

scores = cross_validate(modelo, X, y, cv=RepeatedKFold(random_state=101))
scores['test_score'], np.mean(scores['test_score']), np.std(scores['test_score'])

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

(array([0.82681564, 0.83707865, 0.83707865, 0.79213483, 0.83146067,
        0.8547486 , 0.7752809 , 0.85955056, 0.79775281, 0.84831461,
        0.82122905, 0.82022472, 0.83707865, 0.8258427 , 0.83707865,
        0.83240223, 0.7752809 , 0.80898876, 0.86516854, 0.85955056,
        0.81564246, 0.85393258, 0.86516854, 0.76966292, 0.8258427 ,
        0.87709497, 0.76966292, 0.83707865, 0.88202247, 0.80337079,
        0.82681564, 0.85393258, 0.85393258, 0.80337079, 0.78089888,
        0.84916201, 0.80898876, 0.80898876, 0.80898876, 0.84269663,
        0.81005587, 0.8258427 , 0.81460674, 0.86516854, 0.80337079,
        0.81564246, 0.84831461, 0.84269663, 0.8258427 , 0.83146067]),
 0.8272663360743207,
 0.027318704995761268)

In [7]:
modelo.fit(X,y)
y_pred = modelo.predict(test)
result = test[['PassengerId']]
result['Survived'] = y_pred
result.to_csv('submissao6.csv',index=False)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste