#### Cargando data y mostrando las columnas

In [6]:
import pandas as pd
import matplotlib.pyplot as plt

#loading data
data = pd.read_csv('./data/booking.csv' )
df = data.copy()
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


#### Convirtiendo data categorica en numerica

In [7]:
# Convirtiendo datos categoricos a numeros

def category(df,col):
    df[col] = pd.Categorical(data[col])
    return df[col].cat.codes

df['hotel'] = category(data,'hotel')
df['arrival_date_month'] = category(data,'arrival_date_month')
df['deposit_type'] = category(data,'deposit_type')
df['country'] = category(data,'country')
df['market_segment'] = category(data,'market_segment')
df['distribution_channel'] = category(data,'distribution_channel')
df['deposit_type'] = category(data,'deposit_type')
df['customer_type'] = category(data,'customer_type')
df['reservation_status'] = category(data,'reservation_status')
df['meal'] = category(data,'meal')
df['reserved_room_type'] = category(data,'reserved_room_type')
df['assigned_room_type'] = category(data,'assigned_room_type')

# agent es un id asi que asignamos -1 a los valores null
df['agent'] = data['agent'].fillna(-1)

# el 94% de los valores de company son NaN, removemos la columna
print("company null values:",data['company'].isna().sum() / len(data['company']),'%')
del df['company']

company null values: 0.943068933746545 %


#### Procesando datos de fechas para ser numericos

In [8]:
# Convirtiendo datos fechas a UNIX timestamp

def to_unix(df,col):
    date = pd.to_datetime(df[col])
    return date.astype('int64') // 10**9
    
df['reservation_status_date'] = to_unix(data,'reservation_status_date')

#### Observando la data numerica para determinar en cuales columnas existen sobresalientes


In [9]:
# observando el comportamiento de columnas con valores numericos

df.describe()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,...,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0
mean,0.335539,0.370416,104.011416,2016.156554,5.490544,27.165173,15.798241,0.927599,2.500302,1.856403,...,0.221124,0.124893,74.691457,2.321149,2.137323,101.831122,0.062518,0.571363,0.649803,1469838000.0
std,0.472181,0.482918,106.863097,0.707476,3.535075,13.605138,8.780829,0.998613,1.908286,0.579261,...,0.652306,0.334678,107.238047,17.594721,0.57704,50.53579,0.245291,0.792798,0.497776,19806200.0
min,0.0,0.0,0.0,2015.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,-6.38,0.0,0.0,0.0,1413504000.0
25%,0.0,0.0,18.0,2016.0,2.0,16.0,8.0,0.0,1.0,2.0,...,0.0,0.0,7.0,0.0,2.0,69.29,0.0,0.0,0.0,1454285000.0
50%,0.0,0.0,69.0,2016.0,6.0,28.0,16.0,1.0,2.0,2.0,...,0.0,0.0,9.0,0.0,2.0,94.575,0.0,0.0,1.0,1470528000.0
75%,1.0,1.0,160.0,2017.0,8.0,38.0,23.0,2.0,3.0,2.0,...,0.0,0.0,152.0,0.0,2.0,126.0,0.0,1.0,1.0,1486512000.0
max,1.0,1.0,737.0,2017.0,11.0,53.0,31.0,19.0,50.0,55.0,...,21.0,2.0,535.0,391.0,3.0,5400.0,8.0,5.0,2.0,1505347000.0


In [10]:
# Standarizar valores numericos para evitar que el modelo de prioridad a valores grandes. 
# Llenamos los valores nulos. Para variables con outliers usamos la media
def fill_nulls(data, df, col, is_mean = False):
    if is_mean:
        df[col] = data[col].fillna(data[col].mean())
    else:
        df[col] = data[col].fillna(data[col].median())

def standarize(data, df,col, is_mean = False):
    fill_nulls(data, df,col, is_mean)
    df[col] = (data[col] - data[col].mean()) / data[col].std()

# usando la media porque hay outliers
standarize(data, df,'lead_time')
standarize(data,df,'stays_in_weekend_nights')
standarize(data,df,'stays_in_week_nights')
standarize(data,df,'adults')
standarize(data,df,'children')
standarize(data,df,'babies')
standarize(data,df,'previous_cancellations')
standarize(data,df,'previous_bookings_not_canceled')
standarize(data,df,'booking_changes')
standarize(data,df,'days_in_waiting_list')
standarize(data,df,'adr')
standarize(data,df,'required_car_parking_spaces')
standarize(data,df,'total_of_special_requests')
standarize(df,df,'reservation_status_date')


In [11]:
# check final para ver que los tipos de datos no nos den problemas luego en el modelo
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  int8   
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  float64
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  int8   
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  float64
 8   stays_in_week_nights            119390 non-null  float64
 9   adults                          119390 non-null  float64
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  float64
 12  meal            

In [12]:
# Guardando la data preprocesada para usarla en el entrenamiento del modelo
df.to_csv('./data/booking_preprocessed.csv', index=False)
