### **Módulos**

In [3]:
import pandas as pd
from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeClassifier

### **Datasets**

In [4]:
# Importa datasets
treino = pd.read_csv('./datasets/train.csv')
teste = pd.read_csv('./datasets/test.csv')

# Separa "features" e "target"
X_treino = treino.drop('Survived', axis=1).copy()
y_treino = treino['Survived'].copy()
X_teste = teste.copy()

### **Análise e filtragem dos dados**

In [5]:
X_treino.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# A feature "PassengerId" é única por registro e não contém nenhuma informação relevante. Vamos descartá-la.
X_treino.drop('PassengerId', axis=1, inplace=True)
X_teste.drop('PassengerId', axis=1, inplace=True)

In [6]:
# Vamos analisar se alguma feature tem pouca amostragem e merece ser descartada.
na = pd.DataFrame()
na['treino'] = X_treino.isna().sum() / X_treino.shape[0] * 100
na['teste'] = X_teste.isna().sum() / X_teste.shape[0] * 100
na.sort_values(['treino','teste'], ascending=False)

Unnamed: 0,treino,teste
Cabin,77.104377,78.229665
Age,19.86532,20.574163
Embarked,0.224467,0.0
Fare,0.0,0.239234
PassengerId,0.0,0.0
Pclass,0.0,0.0
Name,0.0,0.0
Sex,0.0,0.0
SibSp,0.0,0.0
Parch,0.0,0.0


In [7]:
# "Cabin" tem mais de 77% de valores faltantes em todo o dataset. Vamos descartá-la.
X_treino.drop('Cabin', axis=1, inplace=True)
X_teste.drop('Cabin', axis=1, inplace=True)

In [8]:
# Para um primeiro estudo, a fim de simplificar, descartaremos as features "Name" e "Ticket"
X_treino.drop(['Name','Ticket'], axis=1, inplace=True)
X_teste.drop(['Name','Ticket'], axis=1, inplace=True)

In [9]:
# Então temos o seguinte conjunto de dados
X_treino.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.25,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.925,S
3,4,1,female,35.0,1,0,53.1,S
4,5,3,male,35.0,0,0,8.05,S


In [10]:
# Features a serem tratadas pelo pipeline

# Númericas que tem dados faltantes
num_cols = ['Age','Fare']

# Categóricas, com dados faltantes ou não
cat_cols = ['Sex','Embarked']

### **Pipeline**

In [11]:
# Pré-processamento de dados numéricos
num_transformer = SimpleImputer(strategy='mean')  #Completa missing values com a média dos demais

# Pré-processamento de dados categóricos
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  #Completa missing values com o caso mais frequente
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combina as ações de pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

# Modelo
model = DecisionTreeClassifier(max_depth=3, random_state=42)

# Pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

### **Treinando o modelo e avaliando o resultado**

In [12]:
pipe.fit(X_treino, y_treino)
pipe.score(X_treino, y_treino)

0.7968574635241302

In [13]:
feat_imp = pipe.named_steps['model'].feature_importances_
feat = pd.DataFrame(list(zip(X_treino,feat_imp)))
feat.columns = ['Feature','Importance']
feat.set_index('Feature', inplace=True)
feat.sort_values('Importance', ascending=False)

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
Age,0.76365
Pclass,0.155131
PassengerId,0.081219
Sex,0.0
SibSp,0.0
Parch,0.0
Fare,0.0


### **Predição com o dataset de teste**

In [14]:
y_teste = pipe.predict(X_teste)
y_teste

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### **Resultado para submissão**

In [None]:
resultado = pd.DataFrame()
resultado['PassengerId'] = teste['PassengerId']
resultado['Survived'] = y_teste
resultado.set_index('PassengerId', inplace=True)
resultado.head()

In [None]:
t = datetime.now().strftime('%Y%m%d_%H%M')
resultado.to_csv(f'./submissions/titanic_{t}.csv')