### Importamos todos los paquetes a usar y cargamos los datos preprocesados

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, confusion_matrix, classification_report

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Read the bookings CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='./data/booking_preprocessed.csv')

# Inspect the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  int64  
 1   is_canceled                     119390 non-null  float64
 2   lead_time                       119390 non-null  float64
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  int64  
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  float64
 8   stays_in_week_nights            119390 non-null  float64
 9   adults                          119390 non-null  float64
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  float64
 12  meal            

### Cargamos solo las columnas que usaremos en la clasificación

In [4]:
# Goal: predict is_canceled. This cell selects the predictor columns and the
# target; the train/test partition used to measure accuracy comes afterwards.

X = df.loc[:, [
    'hotel',
    'lead_time',
    'arrival_date_month',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations',
    'reserved_room_type',
    'booking_changes',
    'deposit_type',
    'days_in_waiting_list',
]].copy()
y = df.loc[:, 'is_canceled'].astype(int)

# Fill missing values with the mean of the column they belong to.
X = X.fillna(X.mean())
X.head()

Unnamed: 0,hotel,lead_time,arrival_date_month,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,distribution_channel,is_repeated_guest,previous_cancellations,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list
0,1,2.227042,5,1,-0.928887,-1.310234,0.247896,-0.260662,-0.081579,1,0,-0.103179,2,4.260083,0,-0.131923
1,1,5.92336,5,1,-0.928887,-1.310234,0.247896,-0.260662,-0.081579,1,0,-0.103179,2,5.793107,0,-0.131923
2,1,-0.90781,5,1,-0.928887,-0.786204,-1.478441,-0.260662,-0.081579,1,0,-0.103179,0,-0.338988,0,-0.131923
3,1,-0.851664,5,1,-0.928887,-0.786204,-1.478441,-0.260662,-0.081579,0,0,-0.103179,0,-0.338988,0,-0.131923
4,1,-0.842306,5,1,-0.928887,-0.262173,0.247896,-0.260662,-0.081579,3,0,-0.103179,0,-0.338988,0,-0.131923


### Separamos la data en 20% para testing y 80% para entrenamiento

In [5]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Logistic-regression classifier, allowed up to 2000 solver iterations.
model = LogisticRegression(max_iter=2000)

# Fit on the training partition only.
model.fit(X=X_train, y=y_train)


### Hacemos predicciones con el set de testing

Con esto podemos obtener la exactitud (accuracy) del modelo. Además, podemos obtener un reporte completo de la calidad de la clasificación (precisión, recall y F1 por clase).

Con este reporte podemos cambiar parámetros en el algoritmo y modificar las variables usadas para mejorar su eficacia.

In [7]:
# Predict labels for the held-out test rows with the fitted classifier.
predictions = model.predict(X_test)

# Compare predictions with the true labels: overall accuracy and the
# per-class text report.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
report = classification_report(y_true=y_test, y_pred=predictions)

# One print call, newline-separated: accuracy, heading, report.
print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")


Accuracy: 0.7643018678281263
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.98      0.84     14907
           1       0.92      0.41      0.56      8971

    accuracy                           0.76     23878
   macro avg       0.83      0.69      0.70     23878
weighted avg       0.80      0.76      0.74     23878



### Ahora hagamos regresión también. Vamos a predecir la variable adr

In [9]:
# Regression task: predict `adr` from the columns selected below.
X_r = df[[
    'hotel',
    'is_canceled',
    'lead_time',
    'arrival_date_month',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'days_in_waiting_list',
    'customer_type',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'reservation_status',
]].copy()

# Keep the target as float: casting to int would truncate the fractional part
# of the very values the regression is supposed to learn.
y_r = df['adr'].astype(float)

# Impute with X_r's own column means. Using the means of the classification
# frame `X` would leave every column that exists only in X_r unfilled and
# would make this cell depend on a variable built for a different task.
X_r = X_r.fillna(X_r.mean())

# Same 80/20 partition and seed as the classification split.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_r, y_r, test_size=0.2, random_state=42
)

# Preview the regression features; kept as the last expression so the
# notebook actually renders it.
X_r.head()

In [10]:
# Inicializamos el modelo
model = LinearRegression()

# Entrenamos el modelo
model.fit(X_train, y_train)


In [11]:

# Making predictions
predictions = model.predict(X_test)

# Calculating metrics
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

# Creating a DataFrame to display the metrics as a table
metrics_df = pd.DataFrame({
    "Mean absolute error": [mae],
    "Mean squared error": [mse],
    "RMSE": [rmse],
    "R²": [r2]
})

# Displaying the table
print(metrics_df)

   Mean absolute error  Mean squared error      RMSE        R²
0             0.347753            0.170694  0.413151  0.272248
