Libraries and Display options

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", 1000)

In [None]:
data = pd.read_csv("/home/antonios/Desktop/Practica_de_vara/data-science-internship/data/raw/hotel_booking_cancellation_prediction.csv")
# load data 

Initial Data Exploration

In [18]:
print("=== Sample Data ===")
display(data.sample(30))

print("\n=== Basic info ===")
#display(data.info())

print("\n=== Data Shape ===")
#display(f"Rows: {data.shape[0]}, Columns: {data.shape[1]}")

=== Sample Data ===


Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,stays_in_weeks_nights
5073,5073,City Hotel,1,29,2016,November,47,14,1.0,3,2,0,0,BB,ESP,Direct,Direct,0,0,0,D,D,1,No Deposit,14.0,,0,Transient,100.38,0,2,Canceled,11/13/2016,
3376,3376,City Hotel,1,101,2017,July,29,20,0.0,3,2,0,0,BB,CHN,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,9.0,,0,Transient,136.5,0,1,Canceled,4/24/2017,
8215,8215,City Hotel,1,311,2017,May,21,24,0.0,4,0,2,0,BB,DNK,Online TA,TA/TO,0,0,0,B,B,2,No Deposit,9.0,,0,Transient,99.45,0,0,Canceled,8/14/2016,
8040,8040,City Hotel,1,123,2017,May,18,1,1.0,4,1,0,0,BB,PRT,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,8.0,,0,Transient,,0,1,Canceled,4/5/2017,
5497,5497,City Hotel,1,277,2016,November,46,7,1.0,2,2,0,0,BB,PRT,Groups,TA/TO,0,0,0,A,A,0,Non Refund,,,0,Transient,100.0,0,0,Canceled,4/4/2016,
1883,1883,Resort Hotel,0,1,2016,August,35,21,1.0,0,2,0,0,BB,PRT,Direct,Direct,0,0,0,A,A,1,No Deposit,250.0,,0,Transient,200.0,0,1,Check-Out,8/22/2016,
3073,3073,Resort Hotel,1,302,2017,May,19,9,0.0,4,2,0,0,HB,PRT,Groups,Direct,0,0,0,A,A,0,No Deposit,,,0,Transient-Party,81.0,0,0,Canceled,8/25/2016,
7202,7202,Resort Hotel,1,23,2016,July,28,8,1.0,2,2,1,0,HB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,241.0,,0,Transient,162.24,0,0,Canceled,6/22/2016,
2882,2882,City Hotel,0,7,2016,April,17,22,1.0,2,2,0,0,BB,PRT,Direct,Direct,0,0,0,A,E,0,No Deposit,,,0,Transient,75.0,0,1,Check-Out,4/25/2016,
6835,6835,City Hotel,1,79,2017,February,7,13,2.0,5,3,0,0,BB,POL,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,9.0,,0,Transient,130.9,0,1,Canceled,12/5/2016,



=== Basic info ===

=== Data Shape ===


In [None]:
hotel_data = data.copy()
# copy dataset for manipulation

Summary of Numerical/Categorical columns

In [None]:
print("\n=== Numerical Columns Summary===")
display(hotel_data.describe())

print("\n=== Categorical Columns Summary ===")
display(hotel_data.describe(include="object"))

Column names and data types

In [None]:
print("\n=== Column names")
display(hotel_data.columns.to_list())

print("\n=== Data types ===")
display(hotel_data.dtypes)

Missing Values Check

In [None]:
print("\n=== Missing values ===")
display(hotel_data.isnull().sum())

print("\n=== Missing value(procents) ===")
display(hotel_data.isnull().sum()/ len(hotel_data) * 100)

Check duplicates

In [None]:
print("\n=== Check for duplicates===")
display(hotel_data.duplicated().sum())

Target variable analysis

In [None]:
print("\n=== Target variable analysis")
hotel_data["reserved_room_type"].value_counts()
hotel_data["assigned_room_type"].value_counts()

#rint(hotel_data["company"].value_counts().head(5))

Outliers in Numerical / Categorical columns

In [None]:
print("\n=== Identify outliers in numerical columns")
numerical_cols = hotel_data.select_dtypes(include=["int64","float64"]).columns
print(numerical_cols)
for col in numerical_cols:
    Q1 = hotel_data[col].quantile(0.25)
    Q2 = hotel_data[col].quantile(0.75)
    IQR = Q2 - Q1
    outliers = hotel_data[(hotel_data[col] < Q1 - 1.5*IQR) | (hotel_data[col] > Q2 + 1.5*IQR)]
    print(f"{col}: {len(outliers)} outliers")
    print("+" * 50) 

print("\n=== Identify outliers in categorical columns")
categorical_columns = hotel_data.select_dtypes(include="object").columns
for col in categorical_columns:
    display(f"{col}: {hotel_data[col].nunique()} unique elements") 
    display(hotel_data[col].value_counts().head())
    print("-" * 50)


Data consistency checks

Non-numeric values in numeric columns

In [None]:
print("---Non-numeric values in numeric columns---")
for col in numerical_cols:
    non_numeric = hotel_data[col].astype(str).str.contains('[^0-9,-]', regex=True).sum() 
    if non_numeric > 0:
        print(f"{col}: {non_numeric} non-numeric values found")
        examples = hotel_data[hotel_data[col].astype(str).str.contains('[^0-9.-]', regex=True)][col].head()
        print(f"Examples: {examples.to_list()}")


Logical Relationship Checks

In [None]:
# Total guests > 0
zero_guests = hotel_data[(hotel_data["adults"] + hotel_data["children"] + hotel_data["babies"]) == 0]
print(f"Bookings with 0 total guests: {len(zero_guests)}")

# Stay duration > 0
zero_nights = hotel_data[(hotel_data['stays_in_weekend_nights'] + hotel_data['stays_in_week_nights']) == 0]
print(f"Bookings with 0 nights stay: {len(zero_nights)}")

# Lead time >=0
negative_lead = hotel_data[hotel_data["lead_time"] < 0]
print(f"Bookings with negative lead time: {len(negative_lead)}")


invalid_repeated = hotel_data[~hotel_data['is_repeated_guest'].isin([0, 1])]
print(f"Invalid is_repeated_guest values: {len(invalid_repeated)}")


negative_adults = hotel_data[hotel_data['adults'] < 0]
print(f"Bookings with negative adults: {len(negative_adults)}")


negative_children = hotel_data[hotel_data['children'] < 0]
print(f"Bookings with negative children: {len(negative_children)}")


negative_babies = hotel_data[hotel_data['babies'] < 0]
print(f"Bookings with negative babies: {len(negative_babies)}")

Impossible Values

In [None]:
negative_adr = hotel_data[hotel_data['adr'] < 0]
print(f"Bookings with negative ADR: {len(negative_adr)}")


negative_prev_cancel = hotel_data[hotel_data['previous_cancellations'] < 0]
print(f"Bookings with negative previous_cancellations: {len(negative_prev_cancel)}")


negative_prev_bookings = hotel_data[hotel_data['previous_bookings_not_canceled'] < 0]
print(f"Bookings with negative previous_bookings_not_canceled: {len(negative_prev_bookings)}")


negative_changes = hotel_data[hotel_data['booking_changes'] < 0]
print(f"Bookings with negative booking_changes: {len(negative_changes)}")


negative_waiting = hotel_data[hotel_data['days_in_waiting_list'] < 0]
print(f"Bookings with negative days_in_waiting_list: {len(negative_waiting)}")


negative_parking = hotel_data[hotel_data['required_car_parking_spaces'] < 0]
print(f"Bookings with negative required_car_parking_spaces: {len(negative_parking)}")


negative_requests = hotel_data[hotel_data['total_of_special_requests'] < 0]
print(f"Bookings with negative total_of_special_requests: {len(negative_requests)}")