🧱 **PHASE 1**: *Setup and Data Collection*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import logging
import os

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def clean_data(df):
    """Clean hotel bookings data.

    Fills missing values in ``children`` (column median), ``country``
    (``'Unknown'``), ``agent`` and ``company`` (``0``), and drops the
    ``reservation_status`` column when it is present.

    Args:
        df: Bookings DataFrame containing at least the ``children``,
            ``country``, ``agent`` and ``company`` columns.

    Returns:
        A new, cleaned DataFrame. The frame passed in is left untouched so
        the raw data stays available to other notebook cells and re-running
        the cell gives the same result.

    Raises:
        Exception: Any error raised while cleaning (e.g. ``KeyError`` for a
            missing column) is logged and re-raised unchanged.
    """
    try:
        logging.info("Cleaning data...")
        # Work on a copy: assigning into `df` directly would mutate the
        # caller's frame as a hidden side effect.
        cleaned = df.copy()
        cleaned['children'] = cleaned['children'].fillna(cleaned['children'].median())
        cleaned['country'] = cleaned['country'].fillna('Unknown')
        cleaned['agent'] = cleaned['agent'].fillna(0)
        cleaned['company'] = cleaned['company'].fillna(0)
        # errors='ignore' makes the drop a no-op when the column is absent.
        cleaned = cleaned.drop(columns=['reservation_status'], errors='ignore')
        logging.info("Data cleaned successfully.")
        return cleaned
    except Exception as e:
        logging.error(f"Error in clean_data: {e}")
        raise

def engineer_features(df, long_lead_threshold=90):
    """Add derived features.

    Adds three columns:

    * ``total_stay``: weekend nights plus week nights.
    * ``total_guests``: adults plus children plus babies.
    * ``is_long_lead``: 1 when ``lead_time`` is strictly greater than
      ``long_lead_threshold``, otherwise 0.

    Args:
        df: Bookings DataFrame with the ``stays_in_weekend_nights``,
            ``stays_in_week_nights``, ``adults``, ``children``, ``babies``
            and ``lead_time`` columns.
        long_lead_threshold: Lead-time cutoff for ``is_long_lead``.
            Defaults to 90, the value previously hard-coded.

    Returns:
        A new DataFrame with the derived columns added; ``df`` itself is
        not modified.

    Raises:
        Exception: Any error raised while deriving the features (e.g.
            ``KeyError`` for a missing column) is logged and re-raised.
    """
    try:
        logging.info("Engineering features...")
        # assign() returns a new frame, so the caller's data is not mutated.
        featured = df.assign(
            total_stay=df['stays_in_weekend_nights'] + df['stays_in_week_nights'],
            total_guests=df['adults'] + df['children'] + df['babies'],
            is_long_lead=(df['lead_time'] > long_lead_threshold).astype(int),
        )
        logging.info("Features engineered successfully.")
        return featured
    except Exception as e:
        logging.error(f"Error in engineer_features: {e}")
        raise

# Load the raw bookings, run the cleaning and feature steps, and persist the result
data = pd.read_csv('data/raw/hotel_bookings.csv')
data_clean = clean_data(data)
data_clean = engineer_features(data_clean)

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
os.makedirs('data/processed', exist_ok=True)
data_clean.to_csv('data/processed/hotel_bookings_clean.csv', index=False)
print("Cleaned data saved to data/processed/hotel_bookings_clean.csv")

In [None]:
import pandas as pd
from src.vizualize import plot_cancellation_trends, plot_lead_time_analysis, plot_correlation_matrix

# Reload the processed bookings from disk
data_clean = pd.read_csv('data/processed/hotel_bookings_clean.csv')

# EDA: summary statistics first, then the per-column count of missing values
summary_stats = data_clean.describe()
print(summary_stats)
missing_counts = data_clean.isnull().sum()
print(missing_counts)

# Render each chart into the shared visuals folder
output_path = 'outputs/visuals'
for render_plot in (plot_cancellation_trends, plot_lead_time_analysis, plot_correlation_matrix):
    render_plot(data_clean, output_path)

print("Visualizations saved to outputs/visuals")


In [None]:
import pandas as pd
from src.model import train_model

# Reload the processed bookings from disk
data_clean = pd.read_csv('data/processed/hotel_bookings_clean.csv')

# Fit the model; train_model hands back the metrics and the saved model's path
output_path = 'outputs/models'
training_result = train_model(
    data_clean,
    target_column='is_canceled',
    output_path=output_path,
)
metrics, model_path = training_result

print("Model trained successfully.")
print("Metrics:", metrics)
print(f"Model saved to {model_path}")

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [5]:
# Plain-text summary statistics (count, mean, std, quartiles, min/max) for `data`
print(data.describe())

         is_canceled      lead_time  arrival_date_year  \
count  119390.000000  119390.000000      119390.000000   
mean        0.370416     104.011416        2016.156554   
std         0.482918     106.863097           0.707476   
min         0.000000       0.000000        2015.000000   
25%         0.000000      18.000000        2016.000000   
50%         0.000000      69.000000        2016.000000   
75%         1.000000     160.000000        2017.000000   
max         1.000000     737.000000        2017.000000   

       arrival_date_week_number  arrival_date_day_of_month  \
count             119390.000000              119390.000000   
mean                  27.165173                  15.798241   
std                   13.605138                   8.780829   
min                    1.000000                   1.000000   
25%                   16.000000                   8.000000   
50%                   28.000000                  16.000000   
75%                   38.000000            

In [6]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [7]:
# Number of missing values in each column of `data`
data.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [8]:
# dtype of every column in `data`
data.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [9]:
# Names of the columns in `data` whose dtype is object
data.select_dtypes(include='object').columns

Index(['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment',
       'distribution_channel', 'reserved_room_type', 'assigned_room_type',
       'deposit_type', 'customer_type', 'reservation_status',
       'reservation_status_date'],
      dtype='object')

In [None]:
def clean_data(df):
    """Clean the hotel bookings dataset.

    Fills missing ``children`` values with the column median and missing
    ``country`` values with ``'Unknown'``. When the ``agent`` / ``company``
    columns are present, their missing values are filled with ``0``, matching
    the earlier definition of this function so the saved CSV does not depend
    on which definition ran last. Finally drops ``reservation_status`` and
    ``reservation_status_date`` if they exist.

    Args:
        df: Bookings DataFrame containing at least the ``children`` and
            ``country`` columns.

    Returns:
        A new, cleaned DataFrame; the frame passed in is not modified.

    Raises:
        Exception: Any error raised while cleaning (e.g. ``KeyError`` for a
            missing required column) is logged and re-raised unchanged.
    """
    try:
        # Work on a copy so the caller's frame is not mutated as a side effect.
        cleaned = df.copy()

        # Impute missing values
        cleaned['children'] = cleaned['children'].fillna(cleaned['children'].median())
        cleaned['country'] = cleaned['country'].fillna('Unknown')
        for id_col in ('agent', 'company'):
            # Guarded so frames without these columns keep working as before.
            if id_col in cleaned.columns:
                cleaned[id_col] = cleaned[id_col].fillna(0)

        # Drop redundant columns; errors='ignore' already skips absent ones,
        # so no manual membership filter is needed.
        redundant_cols = ['reservation_status', 'reservation_status_date']
        cleaned = cleaned.drop(columns=redundant_cols, errors='ignore')

        logging.info("Data cleaned successfully")
        return cleaned
    except Exception as e:
        logging.error(f"Error in clean_data: {e}")
        raise

def engineer_features(df, long_lead_threshold=90):
    """Create derived features.

    Adds ``total_stay`` (weekend nights + week nights), ``total_guests``
    (adults + children + babies) and ``is_long_lead`` (1 when ``lead_time``
    is strictly greater than ``long_lead_threshold``, else 0).

    Args:
        df: Bookings DataFrame with the ``stays_in_weekend_nights``,
            ``stays_in_week_nights``, ``adults``, ``children``, ``babies``
            and ``lead_time`` columns.
        long_lead_threshold: Lead-time cutoff for ``is_long_lead``.
            Defaults to 90, the value previously hard-coded.

    Returns:
        A new DataFrame with the derived columns added; ``df`` itself is
        not modified.

    Raises:
        Exception: Any error raised while deriving the features (e.g.
            ``KeyError`` for a missing column) is logged and re-raised.
    """
    try:
        # Build the result with assign() instead of writing into `df`, so the
        # caller's frame is left as it was.
        nights = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
        guests = df['adults'] + df['children'] + df['babies']
        long_lead = (df['lead_time'] > long_lead_threshold).astype(int)
        featured = df.assign(
            total_stay=nights,
            total_guests=guests,
            is_long_lead=long_lead,
        )
        logging.info("Features engineered successfully")
        return featured
    except Exception as e:
        logging.error(f"Error in engineer_features: {e}")
        raise

# Apply preprocessing.
# The loaded bookings frame is named `data` in this notebook; no cell defines
# `df` beforehand, so the pipeline starts from `data` rather than reading an
# undefined `df` (which raises NameError after Restart & Run All).
df = clean_data(data)
df = engineer_features(df)

# Save cleaned data (create the target folder first if it is missing)
os.makedirs('data/processed', exist_ok=True)
df.to_csv('data/processed/hotel_bookings_clean.csv', index=False)
print("Cleaned data saved to data/processed/hotel_bookings_clean.csv")