## **Proyecto 1. Parte 2 - Feature Engineering**

Stefanie M. Alvarez Pérez, 20002045

In [3008]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import joblib

In [3009]:
pd.pandas.set_option('display.max_columns', None) #habilitamos despliegue maximo de columnas

In [3010]:
data = pd.read_csv('train.csv')
print(data.shape)


(119390, 32)


In [3011]:
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3012]:
#separamos data para entrenamiento y prueba, previene el overfitting (ajuste a la estructura del dataset completo)
X_train, X_test, y_train, y_test = train_test_split(
                        data.drop(['agent', 'company'], axis=1),
                        data['adr'],
                        test_size=0.15,
                        random_state=2021) #Random State garantiza la reproducilidad del análisis

In [3013]:
X_train.shape, X_test.shape

((101481, 30), (17909, 30))

In [3014]:
X_train = X_train.drop(['adr'], axis=1)
X_test = X_test.drop(['adr'], axis=1)

In [3015]:
X_train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
84964,City Hotel,0,88,2016,March,10,4,1,2,2,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,0,0,A,D,0,No Deposit,0,Transient,0,1,Check-Out,2016-03-07
27111,Resort Hotel,0,13,2016,August,34,16,0,4,2,1.0,0,HB,PRT,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,0,Transient,0,2,Check-Out,2016-08-20
13958,Resort Hotel,0,0,2017,February,8,21,0,1,1,0.0,0,BB,PRT,Corporate,Corporate,0,2,9,A,D,0,No Deposit,0,Transient,1,2,Check-Out,2017-02-22
48359,City Hotel,0,0,2016,March,13,22,0,1,2,0.0,0,BB,PRT,Complementary,Direct,0,0,0,A,A,1,No Deposit,0,Transient,0,1,Check-Out,2016-03-23
105085,City Hotel,0,23,2017,January,4,25,0,4,2,0.0,0,BB,NLD,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient,0,0,Check-Out,2017-01-29


In [3016]:
y_train.head()

84964      60.00
27111     237.90
13958      35.00
48359       0.00
105085     82.88
Name: adr, dtype: float64

### 1. Transformación del Target a Gausssiano

In [3017]:
#Mejorarlo con funciones logarítmicas
#y_train = np.log(y_train)
#y_test = np.log(y_test)

In [3018]:
y_train.head()

84964      60.00
27111     237.90
13958      35.00
48359       0.00
105085     82.88
Name: adr, dtype: float64

### 2. Missing Values

##### 2.1 Missing Values para Variables Categóricas

In [3019]:
data.dtypes #(object (0) = string/categóricas)

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [3020]:
#Agregamos el caso especial de 'is_canceled' y 'is_repeated_guest' ya que por definición, estas variables son categórica.

cat_vars = [var for var in data.columns if data[var].dtype == 'O']
cat_vars = cat_vars + ['is_canceled', 'is_repeated_guest'] #Lista + lista.

In [3021]:
X_test.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
50840,City Hotel,0,126,2016,May,20,9,1,5,2,0.0,0,BB,AUT,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,0,Transient-Party,0,1,Check-Out,2016-05-15
44106,City Hotel,1,27,2015,September,40,30,0,2,2,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,0,Non Refund,21,Transient,0,0,Canceled,2015-09-24
107102,City Hotel,0,137,2017,March,10,6,1,1,2,0.0,0,BB,ISR,Groups,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient-Party,0,1,Check-Out,2017-03-08
31113,Resort Hotel,0,7,2016,December,51,13,0,2,1,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,0,0,A,D,0,No Deposit,0,Transient-Party,0,0,Check-Out,2016-12-15
98401,City Hotel,0,20,2016,September,39,24,2,3,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient,0,1,Check-Out,2016-09-29


In [3022]:
#Repetir tanto para train como para test
X_train[cat_vars] = X_train[cat_vars].astype('O')
X_test[cat_vars] = X_test[cat_vars].astype('O')

In [3023]:
len(cat_vars)

14

##### 2.1.1 Detección de NaN en variables categórias

In [3024]:
cat_vars_with_na = [var for var in cat_vars 
                        if X_train[var].isnull().sum() > 0]

In [3025]:
cat_vars_with_na

['country']

In [3026]:
pd.unique(X_train['country'])

array(['PRT', 'NLD', 'GBR', 'FRA', 'ESP', 'MLT', 'ITA', 'USA', 'ARG',
       'ISR', 'ROU', 'BEL', 'CHE', 'POL', 'JPN', 'IRL', 'DEU', 'HUN',
       'CYP', 'BRA', 'LUX', 'KOR', 'CHN', 'IRN', 'AUT', 'NOR', 'CN',
       'EST', nan, 'AGO', 'AUS', 'RUS', 'ISL', 'ZWE', 'TUR', 'CUB', 'CZE',
       'SWE', 'DNK', 'IDN', 'MAR', 'FIN', 'IND', 'IRQ', 'SUR', 'ARE',
       'PER', 'MEX', 'GAB', 'MOZ', 'CPV', 'EGY', 'ZAF', 'MAC', 'NZL',
       'PHL', 'GIB', 'GRC', 'ECU', 'SVN', 'URY', 'QAT', 'LTU', 'CAF',
       'ATA', 'VNM', 'TWN', 'JOR', 'LVA', 'COL', 'PRI', 'ASM', 'MYS',
       'SRB', 'AZE', 'BGR', 'UKR', 'GNB', 'VEN', 'MYT', 'SVK', 'HRV',
       'CRI', 'HKG', 'KAZ', 'SLV', 'SGP', 'CIV', 'PAN', 'GEO', 'NGA',
       'BIH', 'DZA', 'ETH', 'LBN', 'BFA', 'CHL', 'DOM', 'MDV', 'SAU',
       'BOL', 'BLR', 'LBY', 'TZA', 'PAK', 'TUN', 'ARM', 'THA', 'OMN',
       'NPL', 'PLW', 'BHR', 'LKA', 'PRY', 'CMR', 'STP', 'GHA', 'UGA',
       'SYR', 'TJK', 'MUS', 'MNE', 'BGD', 'BHS', 'SEN', 'MKD', 'AND',
       'NIC', 'C

In [3027]:
X_train[cat_vars_with_na].isnull().mean().sort_values(ascending = False)

country    0.004188
dtype: float64

In [3028]:
#variables tratadas con etiqueta de faltante por cantidad masiva de faltantes.
vars_with_missing_string = [var for var in cat_vars_with_na
                               if X_train[var].isnull().mean() > 0.2]

#variables tratadas con procedimiento por candiad adecuada de faltantes.
vars_freq_category = [var for var in cat_vars_with_na
                               if X_train[var].isnull().mean() <= 0.2]

In [3029]:
vars_with_missing_string

[]

In [3030]:
vars_freq_category

['country']

In [3031]:
#Faltantes con etiqueta missing
X_train[vars_with_missing_string] = X_train[vars_with_missing_string].fillna('Missing')
X_test[vars_with_missing_string] = X_test[vars_with_missing_string].fillna('Missing')

In [3032]:
#Estamos haciendo frecuency coding con la moda (por categoría para que no se repita)
for var in vars_freq_category:
    mode = X_train[var].mode()[0]
    
    X_train[var].fillna(mode, inplace=True)
    X_test[var].fillna(mode, inplace=True)
    
    print(var, "-------", mode)

country ------- PRT


In [3033]:
X_train['country'].mode()[0]

'PRT'

In [3034]:
X_train[cat_vars_with_na].isnull().mean().sort_values(ascending = False)
X_test[cat_vars_with_na].isnull().mean().sort_values(ascending = False)

country    0.0
dtype: float64

In [3035]:
#Debe salir vacío
cat_vars_with_na = [var for var in cat_vars 
                        if X_train[var].isnull().sum() > 0]
cat_vars_with_na

[]

In [3036]:
#Ya no debería salir el NA/ NAN
pd.unique(X_train['country'])

array(['PRT', 'NLD', 'GBR', 'FRA', 'ESP', 'MLT', 'ITA', 'USA', 'ARG',
       'ISR', 'ROU', 'BEL', 'CHE', 'POL', 'JPN', 'IRL', 'DEU', 'HUN',
       'CYP', 'BRA', 'LUX', 'KOR', 'CHN', 'IRN', 'AUT', 'NOR', 'CN',
       'EST', 'AGO', 'AUS', 'RUS', 'ISL', 'ZWE', 'TUR', 'CUB', 'CZE',
       'SWE', 'DNK', 'IDN', 'MAR', 'FIN', 'IND', 'IRQ', 'SUR', 'ARE',
       'PER', 'MEX', 'GAB', 'MOZ', 'CPV', 'EGY', 'ZAF', 'MAC', 'NZL',
       'PHL', 'GIB', 'GRC', 'ECU', 'SVN', 'URY', 'QAT', 'LTU', 'CAF',
       'ATA', 'VNM', 'TWN', 'JOR', 'LVA', 'COL', 'PRI', 'ASM', 'MYS',
       'SRB', 'AZE', 'BGR', 'UKR', 'GNB', 'VEN', 'MYT', 'SVK', 'HRV',
       'CRI', 'HKG', 'KAZ', 'SLV', 'SGP', 'CIV', 'PAN', 'GEO', 'NGA',
       'BIH', 'DZA', 'ETH', 'LBN', 'BFA', 'CHL', 'DOM', 'MDV', 'SAU',
       'BOL', 'BLR', 'LBY', 'TZA', 'PAK', 'TUN', 'ARM', 'THA', 'OMN',
       'NPL', 'PLW', 'BHR', 'LKA', 'PRY', 'CMR', 'STP', 'GHA', 'UGA',
       'SYR', 'TJK', 'MUS', 'MNE', 'BGD', 'BHS', 'SEN', 'MKD', 'AND',
       'NIC', 'COM', 

In [3037]:
pd.unique(X_test['country'])

array(['AUT', 'PRT', 'ISR', 'DEU', 'ITA', 'CN', 'NZL', 'GBR', 'DNK',
       'FRA', 'IRL', 'ESP', 'POL', 'NLD', 'ZAF', 'AUS', 'CHN', 'COL',
       'CHE', 'AGO', 'BRA', 'USA', 'NOR', 'SWE', 'BEL', 'TUR', 'ROU',
       'GRC', 'RUS', 'ARG', 'PER', 'ARE', 'IND', 'FIN', 'ISL', 'SVN',
       'HRV', 'MYS', 'CYP', 'TUN', 'MAR', 'JPN', 'ALB', 'BGR', 'LUX',
       'KOR', 'CHL', 'GTM', 'TWN', 'THA', 'LTU', 'HUN', 'CZE', 'GIB',
       'DZA', 'LIE', 'PHL', 'EGY', 'AND', 'MAC', 'LVA', 'IDN', 'EST',
       'ECU', 'HKG', 'SVK', 'ARM', 'SYR', 'SLE', 'LBY', 'BEN', 'MEX',
       'VEN', 'ZWE', 'SGP', 'MDV', 'SRB', 'PAK', 'JOR', 'MUS', 'MOZ',
       'DOM', 'NGA', 'GAB', 'CPV', 'SAU', 'URY', 'LBN', 'BLR', 'MLT',
       'IRN', 'UKR', 'CRI', 'VGB', 'PAN', 'CUB', 'SUR', 'MCO', 'TMP',
       'SLV', 'MKD', 'BIH', 'GEO', 'KAZ', 'IRQ', 'AZE', 'GNB', 'MDG',
       'KEN', 'CMR', 'KWT', 'ATA', 'BOL', 'BGD', 'NCL', 'OMN', 'SYC'],
      dtype=object)

##### 2.2 Missing Values para Variables Numéricas

In [3038]:
num_vars = [var for var in X_train.columns
               if var not in cat_vars and var != 'adr'] #Número de variables numéricas

In [3039]:
len(num_vars)

15

In [3040]:
num_vars

['lead_time',
 'arrival_date_year',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'booking_changes',
 'days_in_waiting_list',
 'required_car_parking_spaces',
 'total_of_special_requests']

In [3041]:
##Numéricas con faltantes
num_vars_with_na = [var for var in num_vars
                       if X_train[var].isnull().sum() > 0] #Número de variables numéricas con faltantes

In [3042]:
X_train[num_vars_with_na].isnull().mean() # % de faltantes

children    0.000039
dtype: float64

In [3043]:
pd.unique(X_train['children'])

array([ 0.,  1.,  2.,  3., nan])

##### Se aplica criterio para tratar data faltante

In [3044]:
for var in num_vars_with_na:
    mean_val = X_train[var].mean()
    
    print(var, mean_val)
    
    X_train[var].fillna(mean_val, inplace=True)
    X_test[var].fillna(mean_val, inplace=True) #Estamos usando la media de train para sustituir, no la de test
    

children 0.10388560954699094


In [3045]:
X_train[num_vars_with_na].isnull().mean() # Ya debería salir 0

children    0.0
dtype: float64

In [3046]:
pd.unique(X_train['children'])

array([0.        , 1.        , 2.        , 3.        , 0.10388561])

### 3. Variables Temporales

In [3047]:
def elapsed_time(df, var, ref):
    df[var] = df[ref] - df[var]
    return df

In [3048]:
year_vars = [var for var in num_vars if (('Yr' in var) or ('date' in var))]
year_vars

['arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month']

In [3049]:
ref = 'arrival_date_year'

for var in year_vars:
    if(var is not ref):
        X_train = elapsed_time(X_train, var, ref)
        X_test = elapsed_time(X_test, var, ref)

In [3050]:
#Eliminamos columnas debido a que guardarán el valor 0
X_train.drop(['arrival_date_year'], axis=1, inplace=True)
X_test.drop(['arrival_date_year'], axis=1, inplace=True)

### 4. Transformación de Variables Numéricas

In [3051]:
#Aplicamos transformación logaritmica
# Para algunas variables da error la fórmula porque tiene valores negativos
for var in ['arrival_date_week_number' , 'arrival_date_day_of_month']:
    X_train[var] = np.log(X_train[var])
    X_test[var] = np.log(X_test[var])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [3052]:
#aplicamos transoformación de Yeo-Jonhson, no en todas las variables es conveniente mejorar la gráfica
#LotArea es a la única que vale la pena hacer una tranformación
X_train['booking_changes'], param = stats.yeojohnson(X_train['booking_changes'])

In [3053]:
X_test['booking_changes'] = stats.yeojohnson(X_test['booking_changes'], lmbda=param) #lmbda = Correr Yeo Johnson pero con un valor ya conocido

In [3054]:
print(param)

-7.551898638985714


#### 4.1 Binarización de Variables con Sesgo fuerte

In [3055]:
sesgadas = ['stays_in_week_nights', 'previous_bookings_not_canceled', 'booking_changes']

In [3056]:
#Codificar con 0 y 1
for var in sesgadas:
    X_train[var] = np.where(X_train[var] == 0, 0, 1)
    X_test[var] = np.where(X_test[var] == 0, 0, 1)

In [3057]:
X_train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
84964,City Hotel,0,88,March,,,1,1,2,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,0,0,A,D,0,No Deposit,0,Transient,0,1,Check-Out,2016-03-07
27111,Resort Hotel,0,13,August,,,0,1,2,1.0,0,HB,PRT,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,0,Transient,0,2,Check-Out,2016-08-20
13958,Resort Hotel,0,0,February,,,0,1,1,0.0,0,BB,PRT,Corporate,Corporate,0,2,1,A,D,0,No Deposit,0,Transient,1,2,Check-Out,2017-02-22
48359,City Hotel,0,0,March,,,0,1,2,0.0,0,BB,PRT,Complementary,Direct,0,0,0,A,A,1,No Deposit,0,Transient,0,1,Check-Out,2016-03-23
105085,City Hotel,0,23,January,,,0,1,2,0.0,0,BB,NLD,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient,0,0,Check-Out,2017-01-29


In [3058]:
X_train = X_train.drop(['arrival_date_day_of_month','arrival_date_week_number', 'country', 'reservation_status_date'], axis=1)
X_test = X_test.drop(['arrival_date_day_of_month','arrival_date_week_number', 'country', 'reservation_status_date'], axis=1)

In [3059]:
X_train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status
84964,City Hotel,0,88,March,1,1,2,0.0,0,BB,Offline TA/TO,TA/TO,0,0,0,A,D,0,No Deposit,0,Transient,0,1,Check-Out
27111,Resort Hotel,0,13,August,0,1,2,1.0,0,HB,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,0,Transient,0,2,Check-Out
13958,Resort Hotel,0,0,February,0,1,1,0.0,0,BB,Corporate,Corporate,0,2,1,A,D,0,No Deposit,0,Transient,1,2,Check-Out
48359,City Hotel,0,0,March,0,1,2,0.0,0,BB,Complementary,Direct,0,0,0,A,A,1,No Deposit,0,Transient,0,1,Check-Out
105085,City Hotel,0,23,January,0,1,2,0.0,0,BB,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient,0,0,Check-Out


In [3060]:
X_test.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status
50840,City Hotel,0,126,May,1,1,2,0.0,0,BB,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,0,Transient-Party,0,1,Check-Out
44106,City Hotel,1,27,September,0,1,2,0.0,0,BB,Offline TA/TO,TA/TO,0,0,0,A,A,0,Non Refund,21,Transient,0,0,Canceled
107102,City Hotel,0,137,March,1,1,2,0.0,0,BB,Groups,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient-Party,0,1,Check-Out
31113,Resort Hotel,0,7,December,0,1,1,0.0,0,BB,Offline TA/TO,TA/TO,0,0,0,A,D,0,No Deposit,0,Transient-Party,0,0,Check-Out
98401,City Hotel,0,20,September,2,1,2,0.0,0,BB,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient,0,1,Check-Out


### 5. Codificación de Variables Categóricas

Valores únicos de las variables numéricas

In [3061]:
#pd.unique(X_train['is_canceled'])
#pd.unique(X_train['lead_time'])
#pd.unique(X_train['stays_in_weekend_nights'])
#pd.unique(X_train['stays_in_week_nights'])
#pd.unique(X_train['adults'])
#pd.unique(X_train['children'])
#pd.unique(X_train['babies'])
#pd.unique(X_train['is_repeated_guest'])
#pd.unique(X_train['required_car_parking_spaces'])
#pd.unique(X_train['total_of_special_requests'])
#pd.unique(X_train['previous_cancellations'])
#pd.unique(X_train['previous_bookings_not_canceled'])
#pd.unique(X_train['booking_changes'])
#pd.unique(X_train['days_in_waiting_list'])

Valores únicos de las variables categóricas

In [3062]:
pd.unique(X_train['hotel'])

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


array(['City Hotel', 'Resort Hotel'], dtype=object)

In [3063]:
pd.unique(X_train['arrival_date_month'])

array(['March', 'August', 'February', 'January', 'November', 'May',
       'July', 'December', 'June', 'October', 'April', 'September'],
      dtype=object)

In [3064]:
pd.unique(X_train['meal'])

array(['BB', 'HB', 'Undefined', 'SC', 'FB'], dtype=object)

In [3065]:
pd.unique(X_train['market_segment'])

array(['Offline TA/TO', 'Online TA', 'Corporate', 'Complementary',
       'Groups', 'Direct', 'Aviation', 'Undefined'], dtype=object)

In [3066]:
pd.unique(X_train['distribution_channel'])

array(['TA/TO', 'Corporate', 'Direct', 'GDS', 'Undefined'], dtype=object)

In [3067]:
pd.unique(X_train['reserved_room_type'])

array(['A', 'D', 'E', 'G', 'C', 'F', 'B', 'H', 'L', 'P'], dtype=object)

In [3068]:
pd.unique(X_train['deposit_type'])

array(['No Deposit', 'Non Refund', 'Refundable'], dtype=object)

In [3069]:
pd.unique(X_train['customer_type'])

array(['Transient', 'Contract', 'Transient-Party', 'Group'], dtype=object)

In [3070]:
pd.unique(X_train['reservation_status'])

array(['Check-Out', 'Canceled', 'No-Show'], dtype=object)

In [3071]:
pd.unique(X_train['assigned_room_type'])

array(['D', 'A', 'E', 'G', 'C', 'F', 'B', 'K', 'I', 'H', 'P', 'L'],
      dtype=object)

In [3072]:
X_train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status
84964,City Hotel,0,88,March,1,1,2,0.0,0,BB,Offline TA/TO,TA/TO,0,0,0,A,D,0,No Deposit,0,Transient,0,1,Check-Out
27111,Resort Hotel,0,13,August,0,1,2,1.0,0,HB,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,0,Transient,0,2,Check-Out
13958,Resort Hotel,0,0,February,0,1,1,0.0,0,BB,Corporate,Corporate,0,2,1,A,D,0,No Deposit,0,Transient,1,2,Check-Out
48359,City Hotel,0,0,March,0,1,2,0.0,0,BB,Complementary,Direct,0,0,0,A,A,1,No Deposit,0,Transient,0,1,Check-Out
105085,City Hotel,0,23,January,0,1,2,0.0,0,BB,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient,0,0,Check-Out


In [3073]:
#Hacer una codificación ordinal con los valores (un diccionario)
Diccio_ART = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'H':8, 'I':9,'K':10, 'L':11, 'P':12, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_ADM = {'April':1, 'August':2, 'December':3, 'February':4, 'January':5, 'July':6, 'June':7, 'March':8, 'May':9,'November':10, 'October':11, 'September':12, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_MS = {'Aviation':1, 'Complementary':2, 'Corporate':3, 'Direct':4, 'Groups':5, 'Offline TA/TO':6, 'Online TA':7, 'Undefined':8, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_DC = {'Corporate':1, 'Direct':2, 'GDS':3, 'TA/TO':4, 'Undefined':5, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_Meal = {'BB':1, 'HB':2, 'Undefined':3, 'SC':4, 'FB':5, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_CT = {'Contract':1, 'Group':2, 'Transient':3, 'Transient-Party':4, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_RS = {'Canceled':1, 'Check-Out':2, 'No-Show':3, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_Hotel = {'City Hotel':1, 'Resort Hotel':2, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_DT = {'No Deposit':1, 'Non Refund':2, 'Refundable':3, 'Missing':0, 'NA':0, 'NaN':0}
Diccio_RRT = {'A':1, 'D':2, 'E':3, 'G':4, 'C':5, 'F':6, 'B':7, 'H':8, 'L':9, 'P':10, 'Missing':0, 'NA':0, 'NaN':0}

In [3074]:
X_train['assigned_room_type'] = X_train['assigned_room_type'].map(Diccio_ART)
X_train['arrival_date_month'] = X_train['arrival_date_month'].map(Diccio_ADM)
X_train['market_segment'] = X_train['market_segment'].map(Diccio_MS)
X_train['distribution_channel'] = X_train['distribution_channel'].map(Diccio_DC)
X_train['meal'] = X_train['meal'].map(Diccio_Meal)
X_train['customer_type'] = X_train['customer_type'].map(Diccio_CT)
X_train['reservation_status'] = X_train['reservation_status'].map(Diccio_RS)
X_train['hotel'] = X_train['hotel'].map(Diccio_Hotel)
X_train['deposit_type'] = X_train['deposit_type'].map(Diccio_DT)
X_train['reserved_room_type'] = X_train['reserved_room_type'].map(Diccio_RRT)

In [3075]:
X_test['assigned_room_type'] = X_test['assigned_room_type'].map(Diccio_ART)
X_test['arrival_date_month'] = X_test['arrival_date_month'].map(Diccio_ADM)
X_test['market_segment'] = X_test['market_segment'].map(Diccio_MS)
X_test['distribution_channel'] = X_test['distribution_channel'].map(Diccio_DC)
X_test['meal'] = X_test['meal'].map(Diccio_Meal)
X_test['customer_type'] = X_test['customer_type'].map(Diccio_CT)
X_test['reservation_status'] = X_test['reservation_status'].map(Diccio_RS)
X_test['hotel'] = X_test['hotel'].map(Diccio_Hotel)
X_test['deposit_type'] = X_test['deposit_type'].map(Diccio_DT)
X_test['reserved_room_type'] = X_test['reserved_room_type'].map(Diccio_RRT)

In [3076]:
X_train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status
84964,1,0,88,8,1,1,2,0.0,0,1,6,4,0,0,0,1,4,0,1,0,3,0,1,2
27111,2,0,13,2,0,1,2,1.0,0,2,7,4,0,0,0,2,4,0,1,0,3,0,2,2
13958,2,0,0,4,0,1,1,0.0,0,1,3,1,0,2,1,1,4,0,1,0,3,1,2,2
48359,1,0,0,8,0,1,2,0.0,0,1,2,2,0,0,0,1,1,1,1,0,3,0,1,2
105085,1,0,23,5,0,1,2,0.0,0,1,7,4,0,0,0,1,1,0,1,0,3,0,0,2


In [3077]:
X_test.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status
50840,1,0,126,9,1,1,2,0.0,0,1,7,4,0,0,0,2,4,0,1,0,4,0,1,2
44106,1,1,27,12,0,1,2,0.0,0,1,6,4,0,0,0,1,1,0,2,21,3,0,0,1
107102,1,0,137,8,1,1,2,0.0,0,1,5,4,0,0,0,1,1,0,1,0,4,0,1,2
31113,2,0,7,3,0,1,1,0.0,0,1,6,4,0,0,0,1,4,0,1,0,4,0,0,2
98401,1,0,20,12,2,1,2,0.0,0,1,7,4,0,0,0,1,1,0,1,0,3,0,1,2


### 6. Feature Scaling

In [3078]:
scaler = MinMaxScaler()

scaler.fit(X_train)

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns)


X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns)

In [3079]:
X_train.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status
0,0.0,0.0,0.119403,0.636364,0.052632,1.0,0.036364,0.0,0.0,0.0,0.714286,0.75,0.0,0.0,0.0,0.0,0.272727,0.0,0.0,0.0,0.666667,0.0,0.2,0.5
1,1.0,0.0,0.017639,0.090909,0.0,1.0,0.036364,0.333333,0.0,0.25,0.857143,0.75,0.0,0.0,0.0,0.111111,0.272727,0.0,0.0,0.0,0.666667,0.0,0.4,0.5
2,1.0,0.0,0.0,0.272727,0.0,1.0,0.018182,0.0,0.0,0.0,0.285714,0.0,0.0,0.076923,1.0,0.0,0.272727,0.0,0.0,0.0,0.666667,0.125,0.4,0.5
3,0.0,0.0,0.0,0.636364,0.0,1.0,0.036364,0.0,0.0,0.0,0.142857,0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.666667,0.0,0.2,0.5
4,0.0,0.0,0.031208,0.363636,0.0,1.0,0.036364,0.0,0.0,0.0,0.857143,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.5


In [3080]:
#Guardamos dataset con data preparada para entrenamiento.

X_train.to_csv('prep_Xtrain.csv', index=False)
X_test.to_csv('prep_Xtest.csv', index=False)

y_train.to_csv('prep_ytrain.csv', index=False)
y_test.to_csv('prep_ytest.csv', index=False)

In [3081]:
joblib.dump(scaler, 'minmax_scaler.joblib')

['minmax_scaler.joblib']

In [3082]:
np.sum(X_train[X_train == 'Unf'].sum(axis=0))

0.0