## **Laboratorio #4 - Machine Learning Pipeline**

Stefanie M. Alvarez Pérez, 20002045

In [236]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [237]:
import my_preprocessors as mypp #nuestra librería

In [238]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [239]:
#Cast de Variable Pclass
data['Pclass'] = data['Pclass'].astype('O')

In [240]:
X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['PassengerId', 'Ticket', 'Name', 'Cabin', 'Fare'], axis=1),
        data['Survived'],
        test_size=0.3,
        random_state=2022)

X_train.shape, X_test.shape

((623, 7), (268, 7))

In [241]:
X_test.shape

(268, 7)

In [242]:
type(X_test)

pandas.core.frame.DataFrame

In [243]:
### Transformación al Target
#y_train = np.log(y_train)
#y_test = np.log(y_test)

In [244]:
#X_test = pd.read_csv("test.csv")
X_test = X_test[FEATURES]
#X_test['Pclass'] = X_test['Pclass'].astype('O')

In [245]:
X_test.shape

(268, 5)

## Configuración del Machine Learning Pipeline

In [246]:
#Variables categoricas con NA
CATEGORICAL_VARS_WITH_NA_FREQUENT = ['Embarked']

#Variable categoricas con NA pero indicador de Missing
CATEGORICAL_VARS_WITH_NA_MISSING = []


#Variables numéricas con NA
NUMERICAL_VARS_WITH_NA = ['Age']


#Variables de temporalidad
#TEMPORAL_VARS = ['']
#REF_VAR = ""

#Varaibles que vamos a tirar
DROP_FEATURES = []

#Varibles para transformación logarítmica
NUMERICALS_LOG_VARS = [] #"Fare"

#Variables para binarización por sesgo fuerte
BINARIZE_VARS = []

#Variables para hacer mapeo categorico por codificación ordinal
Lista_Sex = ['Sex']
Lista_E = ['Embarked']

#Variables categoricas a codificar sin ordinalidad
#CATEGORICAL_VARS = []

#Mapeos de variables categoricas
Diccio_Sex = {'female':1, 'male':2, '3':3, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_E = {'C':1, 'Q':2, 'S':3, 'Missing':0, 'NA':0, 'NaN':0}

#Variables seleccionadas según análisis de Lasso
FEATURES = [
    'Pclass', 
    'Sex', 
    'Age', 
    'SibSp', 
    'Embarked',
]

In [247]:
#Selección de variables para entrenamiento
X_train = X_train[FEATURES]
X_train.shape

(623, 5)

## Machine Learning PipeLine

In [248]:
Titanic_pipeline = Pipeline([
    
    #============= IMPUTACIONES ===================#
    
    #1. Imputación de varaibles categoricas
    ('missing_imputation', 
         CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
    #2. Imputación de variables categoricas con NA basado en frequiencia.
    ('frequent_imputation', 
         CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)
    ),
    
    #3. Indicamos Faltante en variables numéricas para imputar
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
    
    #4. Imputación de mediana para variables categoricas
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #============= VARIABLES TEMPORALES ==================
    
    #5. Tratamiento de variables temporales
    #('eslapsed_time', mypp.TremporalVariableTransformer(
    #   variables=TEMPORAL_VARS, reference_variable=REF_VAR)
    #),
    
    #6. Drop de variables
    #('drop_features', DropFeatures(features_to_drop=DROP_FEATURES)),
    
    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    #7. Transformación logaritmica
    #('log', LogTransformer(variables=NUMERICALS_LOG_VARS)),
    
    #8. Binarización de Variables con Sesgo Fuerte
    ('binarizer', SklearnTransformerWrapper(
        transformer=Binarizer(threshold=0), variables=BINARIZE_VARS)
    ),
    
    #=============== CODIFICACION DE VARIABLES CATEGORICAS ORDINALES ==============
    ('mapper_sex', mypp.Mapper(variables=Lista_Sex, mappings=Diccio_Sex)),
    
    ('mapper_Embarked', mypp.Mapper(
        variables=Lista_E, mappings=Diccio_E)),
    
    
    #============ CODIFICACION DE VARIABLES CATEGORICAS NOMINALES ============
    
    #('rare_label_encoder', RareLabelEncoder(
        #tol=0.01, n_categories=1, variables=CATEGORICAL_VARS)),
    
    #('categorical_encoder', OrdinalEncoder(
        #encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('Lasso', Lasso(alpha=0.01, random_state=2022)),
]) 

In [249]:
Titanic_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation', CategoricalImputer(variables=[])),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['Embarked'])),
                ('missing_indicator', AddMissingIndicator(variables=['Age'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Age'])),
                ('binarizer',
                 SklearnTransformerWrapper(transformer=Binarizer(threshold=0),
                                           variables=[])),
                ('mapper_sex',
                 Mapper(mappings={'3': 3, 'Missing': 0, 'NA': 0, 'NaN': 0,
                                  'female': 1, 'male': 2},
                        variables=['Sex'])),
                ('mapper_Embarked',
                 Mapper(mappings={'C': 1, 'Missing': 0, 'NA': 0, 'NaN': 0,
                                  'Q'

In [250]:
Titanic_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation', CategoricalImputer(variables=[])),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['Embarked'])),
                ('missing_indicator', AddMissingIndicator(variables=['Age'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Age'])),
                ('binarizer',
                 SklearnTransformerWrapper(transformer=Binarizer(threshold=0),
                                           variables=[])),
                ('mapper_sex',
                 Mapper(mappings={'3': 3, 'Missing': 0, 'NA': 0, 'NaN': 0,
                                  'female': 1, 'male': 2},
                        variables=['Sex'])),
                ('mapper_Embarked',
                 Mapper(mappings={'C': 1, 'Missing': 0, 'NA': 0, 'NaN': 0,
                                  'Q'

In [251]:
preds = Titanic_pipeline.predict(X_test)

In [252]:
preds

array([0.21489429, 0.21489429, 0.71227322, 0.17896185, 0.17896185,
       0.74078496, 0.21489429, 0.74078496, 0.17896185, 0.21489429,
       0.21489429, 0.72652909, 0.74078496, 0.21489429, 0.21489429,
       0.75504083, 0.21489429, 0.21489429, 0.21489429, 0.71227322,
       0.24340603, 0.21489429, 0.24340603, 0.71227322, 0.70485252,
       0.69059665, 0.20747359, 0.21489429, 0.17896185, 0.74078496,
       0.19321772, 0.19321772, 0.24340603, 0.22915016, 0.21489429,
       0.71227322, 0.21489429, 0.71227322, 0.74078496, 0.71227322,
       0.67634078, 0.21489429, 0.67634078, 0.21489429, 0.21489429,
       0.21489429, 0.24340603, 0.21489429, 0.21489429, 0.74078496,
       0.21489429, 0.72652909, 0.67634078, 0.21489429, 0.21489429,
       0.21489429, 0.21489429, 0.21489429, 0.71227322, 0.21489429,
       0.71227322, 0.21489429, 0.71227322, 0.21489429, 0.21489429,
       0.21489429, 0.21489429, 0.69059665, 0.21489429, 0.71227322,
       0.21489429, 0.71227322, 0.17896185, 0.69059665, 0.71227

In [253]:
from sklearn.metrics import mean_squared_error

In [254]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Embarked
615,2,female,24.0,1,S
598,3,male,,0,C
161,2,female,40.0,0,S
854,2,female,44.0,1,S
216,3,female,27.0,0,S


In [255]:
mean_squared_error(np.exp(y_test), np.exp(preds), squared=False)

0.6954817126212205

In [256]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Embarked
770,3,male,24.0,0,S
178,2,male,30.0,0,S
786,3,female,18.0,0,S
159,3,male,,8,S
656,3,male,,0,S
...,...,...,...,...,...
693,3,male,25.0,0,C
79,3,female,30.0,0,S
71,3,female,16.0,5,S
503,3,female,37.0,0,S


In [257]:
import joblib

In [258]:
#Guardamos pipeline
joblib.dump(Titanic_pipeline, 'Titanic_pipeline.pkl')

['Titanic_pipeline.pkl']

In [259]:
type(Titanic_pipeline)

sklearn.pipeline.Pipeline