# Pipeline+
- Pipeline para modelos preditivos: de classificação ou regressão.  
Pipelines do scikit-learn são espécies de "contêineres" que podem ter objetos do tipo:  
- Transformer (não é NLP, é de pré-processamento mesmo)
- Estimator (nome que o sklearn dá para algoritmos de classificação, regressão e clustering)
- Pipeline (é possível utilizar pipelines um dentro do outro)
- Feature Union (ajuda a juntar pipelines diferentes)

In [94]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

In [121]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE: this also hides deprecation and convergence warnings.
warnings.simplefilter('ignore')

In [95]:
# Load the Titanic training and test sets (training file first).
treino, teste = map(
    pd.read_csv,
    ('../input/titanic/train.csv', '../input/titanic/test.csv'),
)

In [96]:
# Preview the first rows of the test set (it has no 'Survived' column).
teste.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [97]:
# Preview the first rows of the training set, which includes the 'Survived' target.
treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Criando um pipeline "na mão"

In [98]:
# Hand-built pipeline: one-hot encode every column, scale the encoded matrix,
# then train a random forest on the result. Each step is a (name, estimator)
# pair, and the name is what identifies the step inside the pipeline.
pipeline_mao = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    # The step name matches the transformer it holds. with_mean=False skips
    # centering, which StandardScaler cannot do on the sparse matrix that
    # OneHotEncoder emits by default.
    ('standard_scaler', StandardScaler(with_mean=False)),
    ('classificador', RandomForestClassifier()),
])
pipeline_mao

In [99]:
# Inspect the first (name, estimator) pair stored in the pipeline.
pipeline_mao.steps[0]

('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))

## Usando o make_pipeline  
Ele cria o pipeline "automaticamente"


## Treino e Validação

In [100]:
# make_pipeline receives the steps as separate positional arguments (not as a
# single list) and names each step after its lowercased class name.
make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    StandardScaler(with_mean=False),
    RandomForestClassifier(),
)

In [101]:
# Separate the features from the 'Survived' target.
X = treino.drop('Survived', axis=1)
y = treino['Survived']

# Hold out a validation set (default test_size, i.e. 25% of the rows).
# The seed is fixed to 2023, the same value used for the models and KFold in
# this notebook, so the split and the validation scores are reproducible.
X_treino, X_valid, y_treino, y_valid = train_test_split(X, y, random_state=2023)
X_treino.shape, X_valid.shape, y_treino.shape, y_valid.shape

((668, 11), (223, 11), (668,), (223,))

In [102]:
# Fit every step of the hand-built pipeline on the training split.
pipeline_mao.fit(X_treino, y_treino)

In [103]:
# Predict the 0/1 'Survived' label for each row of the validation split.
pipeline_mao.predict(X_valid)

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1])

In [104]:
# Score the fitted pipeline against the validation labels.
pipeline_mao.score(X_valid,y_valid)

0.8340807174887892

## Separando as variáveis em categóricas e numéricas 

In [105]:
# List each column's dtype; this is what the categorical/numeric split below relies on.
X_treino.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [106]:
# Categorical columns: those stored with the 'object' dtype, in column order.
cat_var = [
    nome
    for nome, tipo in X_treino.dtypes.items()
    if tipo.name == 'object'
]
cat_var

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [107]:
# Numeric columns: every column whose dtype is not 'object', in column order.
num_var = [
    nome
    for nome, tipo in X_treino.dtypes.items()
    if tipo.name != 'object'
]
num_var

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

## Tratamento das colunas categóricas (imputer = tratar dados faltantes)

In [108]:
# Categorical branch: replace missing entries with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored at transform.
pipeline_categoricas = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # `sparse_output` is the current name of the former `sparse` argument
    # (renamed in scikit-learn 1.2, old name removed in 1.4). False makes the
    # encoder return a dense array.
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
   

## Tratamento para as colunas numéricas

In [109]:
# Numeric branch: fill missing values with the column median, then rescale
# with MinMaxScaler.
pipeline_numericas = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler()),
    ]
)

In [110]:
# Route each group of columns to its own sub-pipeline: the categorical
# columns go to pipeline_categoricas and the numeric ones to pipeline_numericas.
pre_processamento = ColumnTransformer(
    transformers=[
        ('cat', pipeline_categoricas, cat_var),
        ('num', pipeline_numericas, num_var),
    ]
)

## Random Forest e Regressão Logística

In [111]:
# Build one pipeline per classifier, both seeded with 2023.
# The same pre_processamento object is passed to both pipelines.
pipeline_random_forest, pipeline_log_reg = (
    make_pipeline(pre_processamento, modelo)
    for modelo in (
        RandomForestClassifier(random_state=2023),
        LogisticRegression(random_state=2023),
    )
)

In [112]:
# Display the assembled random-forest pipeline.
pipeline_random_forest

In [113]:
# Train the random-forest pipeline on the training split and score it on the
# validation split; fit returns the pipeline itself, so the calls chain.
pipeline_random_forest.fit(X_treino, y_treino).score(X_valid, y_valid)



0.8161434977578476

In [114]:
# Train the logistic-regression pipeline on the training split and score it on
# the validation split; fit returns the pipeline itself, so the calls chain.
pipeline_log_reg.fit(X_treino, y_treino).score(X_valid, y_valid)



0.8116591928251121

## Cross Validation

In [117]:
# 10-fold splitter, shuffled with a fixed seed, shared by all the
# cross-validation runs below.
validacao_cruzada = KFold(
    n_splits=10,
    shuffle=True,
    random_state=2023,
)
validacao_cruzada

KFold(n_splits=10, random_state=2023, shuffle=True)

In [123]:
# Score the random-forest pipeline on the full data set, one score per KFold split.
cross_val_score(pipeline_random_forest, X,y, cv=validacao_cruzada)

array([0.86666667, 0.82022472, 0.79775281, 0.7752809 , 0.84269663,
       0.82022472, 0.79775281, 0.80898876, 0.87640449, 0.78651685])

In [124]:
# Average of the per-fold scores for the random-forest pipeline.
acuracia_media_rf = np.mean(
    cross_val_score(pipeline_random_forest, X, y, cv=validacao_cruzada)
)
acuracia_media_rf

0.819250936329588

In [126]:
# Average of the per-fold scores for the logistic-regression pipeline.
acuracia_media_log = np.mean(
    cross_val_score(pipeline_log_reg, X, y, cv=validacao_cruzada)
)
acuracia_media_log

0.8204244694132334