In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Toma de datos

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


df = pd.read_csv('/content/drive/MyDrive/Datasets/Hotel_Reservations.csv')
df

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.00,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.00,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.00,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.50,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,INN36271,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,167.80,1,Not_Canceled
36271,INN36272,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,90.95,2,Canceled
36272,INN36273,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,98.39,2,Not_Canceled
36273,INN36274,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,94.50,0,Canceled


In [3]:
# No hay valores nulos
df.isnull().sum()

Booking_ID                              0
no_of_adults                            0
no_of_children                          0
no_of_weekend_nights                    0
no_of_week_nights                       0
type_of_meal_plan                       0
required_car_parking_space              0
room_type_reserved                      0
lead_time                               0
arrival_year                            0
arrival_month                           0
arrival_date                            0
market_segment_type                     0
repeated_guest                          0
no_of_previous_cancellations            0
no_of_previous_bookings_not_canceled    0
avg_price_per_room                      0
no_of_special_requests                  0
booking_status                          0
dtype: int64

## Analisis de datos y normalización

In [4]:
# eliminar la columna, Booking_ID, no nos sirve de nada
df.drop('Booking_ID', axis=1, inplace=True)

# Teniendo en cuenta que la variable objetivo es "booking status" nos interesa que sea binaria
# par su funcionamiento
df['booking_status'] = df['booking_status'].replace({'Canceled': 0, 'Not_Canceled': 1})
df

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.00,0,1
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,1
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.00,0,0
3,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.00,0,0
4,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.50,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,167.80,1,1
36271,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,90.95,2,0
36272,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,98.39,2,1
36273,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,94.50,0,0


In [22]:
# Depende del plan de comida tienes una mejor o peor calidad por lo que es (ORDINAL)
type_of_meal_plan = df['type_of_meal_plan'].unique()
type_of_meal_plan

array(['Meal Plan 1', 'Not Selected', 'Meal Plan 2', 'Meal Plan 3'],
      dtype=object)

In [24]:
# El tipo de habitación sugiere un orden en especifico (ORDINAL)
room_type_reserved = df['room_type_reserved'].unique()
room_type_reserved

array(['Room_Type 1', 'Room_Type 4', 'Room_Type 2', 'Room_Type 6',
       'Room_Type 5', 'Room_Type 7', 'Room_Type 3'], dtype=object)

In [25]:
# Ordinal ya que no se puede clasificar en un orden en expecífico (NOMINAL)
market_segment_type = df['market_segment_type'].unique()
market_segment_type

array(['Offline', 'Online', 'Corporate', 'Aviation', 'Complementary'],
      dtype=object)

In [8]:
df.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,1
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,1
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,0
3,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,0
4,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,0


In [26]:
# Crear una instancia de OneHotEncoder para las variables categóricas ordinales
oe = OrdinalEncoder(categories=[type_of_meal_plan,room_type_reserved])

# Crear una instancia de OneHotEncoder para las variables categóricas nominales
ohe = OneHotEncoder()

# tranformar los datos
preprocessor = make_column_transformer(
    (oe, ['type_of_meal_plan', 'room_type_reserved']),
    (ohe, ['market_segment_type']),
    remainder='passthrough'
)

In [27]:
# Variable objetivo estado de la reserva

X = df.drop("booking_status", axis=1)
y = df["booking_status"]

In [28]:
# Dividision de los datos en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [29]:
model = LinearRegression()

In [30]:
# Utilizar el preprocessor para transformar los datos antes del entrenamiento
# tanto en el entrenamiento como en el test
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

X_train_transformed

array([[  0.  ,   0.  ,   0.  , ...,   0.  , 123.3 ,   1.  ],
       [  0.  ,   0.  ,   0.  , ...,   0.  ,  68.47,   0.  ],
       [  0.  ,   1.  ,   0.  , ...,   0.  , 168.3 ,   0.  ],
       ...,
       [  1.  ,   0.  ,   0.  , ...,   0.  , 108.  ,   1.  ],
       [  0.  ,   0.  ,   0.  , ...,   0.  , 113.46,   2.  ],
       [  1.  ,   0.  ,   0.  , ...,   0.  ,  91.5 ,   1.  ]])

In [31]:
# Entrenar el modelo utilizando los datos transformados
model.fit(X_train_transformed, y_train)

LinearRegression()

In [32]:
y_pred = model.predict(X_test_transformed)

In [33]:
y_pred

array([1.090884  , 0.90716754, 0.75955877, ..., 0.63720547, 0.90158444,
       0.05307668])

In [34]:
model.score(X_test_transformed, y_test)

0.3453380764671885