## **Proyecto #1 - Machine Learning Pipeline**

Stefanie M. Alvarez Pérez, 20002045

In [152]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [153]:
import my_preprocessors as mypp #nuestra libraria

In [154]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [155]:
#Cast de Variables 'is_canceled' y 'is_repeated_guest'
data['is_canceled'] = data['is_canceled'].astype('O')
data['is_repeated_guest'] = data['is_repeated_guest'].astype('O')

In [156]:
X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['agent', 'company','reservation_status_date', 'country','arrival_date_week_number', 'arrival_date_day_of_month'], axis=1),
        data['adr'],
        test_size=0.3,
        random_state=2022)

X_train.shape, X_test.shape

((83573, 26), (35817, 26))

In [157]:
np.unique(y_train)

array([-6.38e+00,  0.00e+00,  2.60e-01, ...,  5.08e+02,  5.10e+02,
        5.40e+03])

In [158]:
type(X_test)

pandas.core.frame.DataFrame

In [159]:
### Transformación al Target
#y_train = np.log(y_train)
#y_test = np.log(y_test)

In [160]:
X_test.shape

(35817, 26)

## Configuración del Machine Learning Pipeline

In [161]:
#Variables categoricas con NA
CATEGORICAL_VARS_WITH_NA_FREQUENT = []

#Variable categoricas con NA pero indicador de Missing
CATEGORICAL_VARS_WITH_NA_MISSING = []


#Variables numéricas con NA
NUMERICAL_VARS_WITH_NA = ['children']

#Variables que vamos a tirar
DROP_FEATURES = []

#Variables para binarización por sesgo fuerte
BINARIZE_VARS = []

#Variables categoricas a codificar
ART = ['assigned_room_type']
ADM = ['arrival_date_month']
MS = ['market_segment']
DC = ['distribution_channel']
Meal = ['meal']
CT = ['customer_type']
RS = ['reservation_status']
Hotel = ['hotel']
DT = ['deposit_type']
RRT = ['reserved_room_type']

#Mapeos de variables categoricas
Diccio_ART = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'H':8, 'I':9,'K':10, 'L':11, 'P':12, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_ADM = {'April':1, 'August':2, 'December':3, 'February':4, 'January':5, 'July':6, 'June':7, 'March':8, 'May':9,'November':10, 'September':11, 'October':12, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_MS = {'Aviation':1, 'Complementary':2, 'Corporate':3, 'Direct':4, 'Groups':5, 'Offline TA/TO':6, 'Online TA':7, 'Undefined':8, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_DC = {'Corporate':1, 'Direct':2, 'GDS':3, 'TA/TO':4, 'Undefined':5, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_Meal = {'BB':1, 'HB':2, 'Undefined':3, 'SC':4, 'FB':5, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_CT = {'Contract':1, 'Group':2, 'Transient':3, 'Transient-Party':4, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_RS = {'Canceled':1, 'Check-Out':2, 'No-Show':3, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_Hotel = {'City Hotel':1, 'Resort Hotel':2, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_DT = {'No Deposit':1, 'Non Refund':2, 'Refundable':3, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_RRT = {'A':1, 'D':2, 'E':3, 'G':4, 'C':5, 'F':6, 'B':7, 'H':8, 'L':9, 'P':10, 'Missing':0, 'NA':0, 'NaN':0}

#Variables seleccionadas según análisis de Lasso
FEATURES = ['hotel', 'is_canceled', 'lead_time', 'arrival_date_month','stays_in_weekend_nights', 
            'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 
            'market_segment', 'distribution_channel', 'is_repeated_guest', 
            'previous_cancellations', 'previous_bookings_not_canceled', 
            'reserved_room_type', 'assigned_room_type', 'booking_changes', 'deposit_type',
            'days_in_waiting_list', 'customer_type', 'required_car_parking_spaces', 
            'total_of_special_requests', 'reservation_status']

In [162]:
#Selección de variables para entrenamiento
X_train = X_train[FEATURES]
X_test = X_test[FEATURES]
X_train.shape

(83573, 24)

In [163]:
X_train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status
42579,City Hotel,0,14,September,1,3,2,0.0,0,BB,...,0,A,A,0,No Deposit,0,Contract,0,1,Check-Out
67602,City Hotel,1,161,May,0,2,2,0.0,0,BB,...,0,A,A,0,Non Refund,0,Transient,0,0,Canceled
55554,City Hotel,1,119,August,0,2,2,1.0,0,BB,...,0,D,D,0,No Deposit,0,Transient,0,0,Canceled
93099,City Hotel,0,4,July,0,1,2,0.0,0,SC,...,0,A,A,0,No Deposit,0,Transient,0,1,Check-Out
118057,City Hotel,0,36,August,1,1,2,0.0,0,BB,...,0,A,A,0,No Deposit,0,Transient,0,1,Check-Out


## Machine Learning PipeLine

In [164]:
Hotel_pipeline = Pipeline([
    
    #============= IMPUTACIONES ===================#
    
    #1. Imputación de variables categoricas
    ('missing_imputation', 
         CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
    #2. Imputación de variables categoricas con NA basado en frequiencia.
    ('frequent_imputation', 
         CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)
    ),
    
    #3. Indicamos Faltante en variables numéricas para imputar
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
    
    #4. Imputación de mediana para variables categoricas
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #============= VARIABLES TEMPORALES ==================
    
    #5. Tratamiento de variables temporales
    #('eslapsed_time', mypp.TremporalVariableTransformer(
        #variables=TEMPORAL_VARS, reference_variable=REF_VAR)),
    
    #6. Drop de variables
    #('drop_features', DropFeatures(features_to_drop=DROP_FEATURES)),
    
    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    #7. Transformación logaritmica
    #('log', LogTransformer(variables=NUMERICALS_LOG_VARS)),
    
    #8. Binarización de Variables con Sesgo Fuerte
    #('binarizer', SklearnTransformerWrapper(
        #transformer=Binarizer(threshold=0), variables=BINARIZE_VARS)),
    
    #=============== CODIFICACION DE VARIABLES CATEGORICAS ORDINALES ==============
    ('mapper_ART', mypp.Mapper(variables=ART, mappings=Diccio_ART)),
    
    ('mapper_ADM', mypp.Mapper(variables=ADM, mappings=Diccio_ADM)),
    
    ('mapper_MS', mypp.Mapper(variables=MS, mappings=Diccio_MS)),
    
    ('mapper_DC', mypp.Mapper(variables=DC, mappings=Diccio_DC)),
    
    ('mapper_Meal', mypp.Mapper(variables=Meal, mappings=Diccio_Meal)),
    
    ('mapper_CT', mypp.Mapper(variables=CT, mappings=Diccio_CT)),
    
    ('mapper_RS', mypp.Mapper(variables=RS, mappings=Diccio_RS)),
    
    ('mapper_Hotel', mypp.Mapper(variables=Hotel, mappings=Diccio_Hotel)),
    
    ('mapper_DT', mypp.Mapper(variables=DT, mappings=Diccio_DT)),
        
    ('mapper_RRT', mypp.Mapper(variables=RRT, mappings=Diccio_RRT)),
    
    #============ CODIFICACION DE VARIABLES CATEGORICAS NOMINALES ============
    
    #('rare_label_encoder', RareLabelEncoder(
        #tol=0.01, n_categories=1, variables=CATEGORICAL_VARS)),
    
    #('categorical_encoder', OrdinalEncoder(
        #encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('Lasso', Lasso(alpha=0.01, random_state=2022)),
]) 

In [165]:
y_train.head()

42579     109.75
67602     120.00
55554     127.80
93099     139.00
118057    155.00
Name: adr, dtype: float64

In [166]:
np.unique(y_train)

array([-6.38e+00,  0.00e+00,  2.60e-01, ...,  5.08e+02,  5.10e+02,
        5.40e+03])

In [167]:
#y_train = np.maximum(y_train, 0.0)
#y_test = np.maximum(y_test, 0.0)

In [168]:
#y_train = np.minimum(y_train, 110)
#y_test = np.minimum(y_test, 110)

In [169]:
np.unique(y_train)

array([-6.38e+00,  0.00e+00,  2.60e-01, ...,  5.08e+02,  5.10e+02,
        5.40e+03])

In [170]:
Hotel_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation', CategoricalImputer(variables=[])),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=[])),
                ('missing_indicator',
                 AddMissingIndicator(variables=['children'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['children'])),
                ('mapper_ART',
                 Mapper(mappings={'A': 1, 'B...
                ('mapper_DT',
                 Mapper(mappings={'Missing': 0, 'NA': 0, 'NaN': 0,
                                  'No Deposit': 1, 'Non Refund': 2,
                                  'Refundable': 3},
                        variables=['deposit_type'])),
                ('mapper_RRT',
                 Mapper(mappings={'A': 1, 'B': 7, 'C': 5, 'D': 2, 'E': 3,
                                  'F': 6, 'G': 4, 'H':

In [171]:
preds = Hotel_pipeline.predict(X_test)

In [172]:
preds

array([ 52.3406585 , 104.41534438,  75.98919042, ...,  86.32503061,
        84.91897085,  97.92531528])

In [173]:
from sklearn.metrics import mean_squared_error 

In [174]:
X_train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status
42579,City Hotel,0,14,September,1,3,2,0.0,0,BB,...,0,A,A,0,No Deposit,0,Contract,0,1,Check-Out
67602,City Hotel,1,161,May,0,2,2,0.0,0,BB,...,0,A,A,0,Non Refund,0,Transient,0,0,Canceled
55554,City Hotel,1,119,August,0,2,2,1.0,0,BB,...,0,D,D,0,No Deposit,0,Transient,0,0,Canceled
93099,City Hotel,0,4,July,0,1,2,0.0,0,SC,...,0,A,A,0,No Deposit,0,Transient,0,1,Check-Out
118057,City Hotel,0,36,August,1,1,2,0.0,0,BB,...,0,A,A,0,No Deposit,0,Transient,0,1,Check-Out


In [175]:
mean_squared_error(np.exp(y_test), np.exp(preds), squared=False)

  output_errors = np.average((y_true - y_pred) ** 2, axis=0,


inf

In [176]:
y_test

84149      81.00
1258      188.71
82013      70.00
52113      93.60
70513     120.00
           ...  
16065     192.00
59236     118.35
53194     140.00
108289    100.00
2046       62.00
Name: adr, Length: 35817, dtype: float64

In [177]:
X_test

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status
84149,City Hotel,0,99,February,0,1,1,0.0,0,BB,...,0,A,A,0,No Deposit,0,Transient-Party,0,0,Check-Out
1258,Resort Hotel,0,65,August,2,5,2,0.0,0,HB,...,0,D,D,1,No Deposit,0,Transient,0,0,Check-Out
82013,City Hotel,1,115,March,2,0,1,0.0,0,BB,...,0,A,A,0,Non Refund,22,Transient,0,0,Canceled
52113,City Hotel,1,44,May,0,2,2,0.0,0,SC,...,0,A,A,0,No Deposit,0,Transient,0,0,Canceled
70513,City Hotel,1,150,June,2,3,1,0.0,0,BB,...,0,A,A,0,Non Refund,0,Transient,0,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16065,Resort Hotel,0,0,August,0,1,2,0.0,0,BB,...,0,A,A,0,No Deposit,0,Transient,0,1,Check-Out
59236,City Hotel,1,87,October,2,2,2,0.0,0,BB,...,0,D,D,0,No Deposit,0,Transient,0,0,Canceled
53194,City Hotel,1,350,June,0,2,2,0.0,0,BB,...,0,A,A,0,Non Refund,0,Transient,0,0,Canceled
108289,City Hotel,0,116,March,0,2,2,0.0,0,BB,...,0,A,A,0,No Deposit,0,Transient-Party,0,1,Check-Out


In [178]:
import joblib

In [179]:
#Guardamos pipeline
joblib.dump(Hotel_pipeline, 'housePrice_pipeline.pkl')

['housePrice_pipeline.pkl']

In [180]:
type(Hotel_pipeline)

sklearn.pipeline.Pipeline