In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# --- Display configuration ---
# Widen every pandas display limit so frames are rendered in full.
pd.options.display.max_columns = None  # never hide columns
pd.options.display.max_rows = None  # never hide rows
pd.options.display.max_colwidth = None  # never truncate cell contents
pd.options.display.width = 1000  # wide console output (pandas default is 80)

# --- Load the raw bookings ---
# NOTE(review): absolute, machine-specific path; this cell only runs where
# that file exists.
data = pd.read_csv(
    "/home/antonios/Desktop/Practica_de_vara/data-science-internship/data/raw/hotel_booking_cancellation_prediction.csv"
)
# All later checks work on a copy so the frame read from disk stays untouched.
hotel_data = data.copy()

# --- Overview section headers ---
# Each header is always printed; the matching display call underneath is
# switched off and can be uncommented to show that section's content.
print("=== Sample Data ===")
# display(data.sample(1000))

print("\n=== Basic info ===")
# display(data.info())

print("\n=== Data Shape ===")
# display(f"Rows: {data.shape[0]}, Columns: {data.shape[1]}")

print("\n=== Numerical Columns Summary===")
# display(hotel_data.describe())

print("\n=== Categorical Columns Summary ===")
# display(hotel_data.describe(include="object"))

print("\n=== Column names")
# display(hotel_data.columns.to_list())

print("\n=== Data types ===")
# display(hotel_data.dtypes)

print("\n=== Missing values ===")
# display(hotel_data.isnull().sum())

print("\n=== Missing value(procents) ===")
# display(hotel_data.isnull().sum() / len(hotel_data) * 100)

print("\n=== Check for duplicates===")
# display(hotel_data.duplicated().sum())  # 0

print("\n=== Target variable analysis")
# hotel_data["is_canceled"].value_counts()  # 0 = 6232 , 1 = 3768

print("\n=== Identify outliers in numerical columns")
# Tukey's rule: a value is an outlier when it lies more than 1.5 * IQR below
# the first quartile (Q1) or above the third quartile (Q3).
numerical_cols = hotel_data.select_dtypes(include=["int64", "float64"]).columns
for col in numerical_cols:
    values = hotel_data[col]
    # One quantile call returns both the 25th and the 75th percentile.
    Q1, Q3 = values.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    # Count straight from the boolean mask instead of slicing the whole frame.
    # NaN compares False on both sides, so missing values are never counted.
    # When IQR is 0 both fences collapse onto the quartile and every value
    # different from it is reported.
    outlier_mask = (values < lower_fence) | (values > upper_fence)
    print(f"{col}: {int(outlier_mask.sum())} outliers")
    print("+" * 50)


print("\n=== Identify outliers in categorical columns")
# For every object-dtype column, show how many distinct values it holds and
# which values occur most often.
categorical_columns = hotel_data.select_dtypes(include="object").columns
for col in categorical_columns:
    column_values = hotel_data[col]
    distinct_count = column_values.nunique()
    top_values = column_values.value_counts().head(5)  # five most common values
    display(f"{col}: {distinct_count} unique elements")
    display(top_values)
    print("-" * 50)
 

print("\n=== Data Consistency Checks ===")


print("---Non-numeric values in numeric columns---")
for col in numerical_cols:
    column_values = hotel_data[col]
    # These columns already carry a numeric dtype, so the only entries that
    # are not real numbers are NaN and +/-inf. A character whitelist on the
    # string form is avoided on purpose: str() of a float contains "." (and
    # may contain "e"), which such a scan misclassifies as non-numeric.
    invalid_mask = column_values.isna() | column_values.isin([np.inf, -np.inf])
    non_numeric = int(invalid_mask.sum())
    if non_numeric > 0:
        print(f"{col}: {non_numeric} non-numeric values found")
        # The same mask drives the count and the examples, so they always agree.
        examples = column_values[invalid_mask].head()
        print(f"Examples: {examples.to_list()}")


print("\n--- Logical relationship checks ---")

# A booking should have at least one guest. The three head counts are added
# with "+", so a NaN in any of them makes the total NaN and the row is not
# counted as a zero-guest booking.
guests_per_booking = hotel_data["adults"] + hotel_data["children"] + hotel_data["babies"]
zero_guests = hotel_data.loc[guests_per_booking.eq(0)]
print(f"Bookings with 0 total guests: {zero_guests.shape[0]}")


# A stay should last at least one night (weekend nights plus week nights).
nights_per_booking = hotel_data["stays_in_weekend_nights"] + hotel_data["stays_in_week_nights"]
zero_nights = hotel_data.loc[nights_per_booking.eq(0)]
print(f"Bookings with 0 nights stay: {zero_nights.shape[0]}")

# Lead time must not be negative.
negative_lead = hotel_data.loc[hotel_data["lead_time"].lt(0)]
print(f"Bookings with negative lead time: {negative_lead.shape[0]}")

# Values that must never be negative: each check keeps the offending rows in
# its own frame and prints how many there are.
print("\n--- Impossible value checks ---")

# ADR (Average Daily Rate) should be >= 0
negative_adr = hotel_data.loc[hotel_data["adr"].lt(0)]
print(f"Bookings with negative ADR: {negative_adr.shape[0]}")

# previous_cancellations should be >= 0
negative_prev_cancel = hotel_data.loc[hotel_data["previous_cancellations"].lt(0)]
print(f"Bookings with negative previous_cancellations: {negative_prev_cancel.shape[0]}")

# previous_bookings_not_canceled should be >= 0
negative_prev_bookings = hotel_data.loc[hotel_data["previous_bookings_not_canceled"].lt(0)]
print(f"Bookings with negative previous_bookings_not_canceled: {negative_prev_bookings.shape[0]}")

# booking_changes should be >= 0
negative_changes = hotel_data.loc[hotel_data["booking_changes"].lt(0)]
print(f"Bookings with negative booking_changes: {negative_changes.shape[0]}")

# days_in_waiting_list should be >= 0
negative_waiting = hotel_data.loc[hotel_data["days_in_waiting_list"].lt(0)]
print(f"Bookings with negative days_in_waiting_list: {negative_waiting.shape[0]}")

# required_car_parking_spaces should be >= 0
negative_parking = hotel_data.loc[hotel_data["required_car_parking_spaces"].lt(0)]
print(f"Bookings with negative required_car_parking_spaces: {negative_parking.shape[0]}")

# total_of_special_requests should be >= 0
negative_requests = hotel_data.loc[hotel_data["total_of_special_requests"].lt(0)]
print(f"Bookings with negative total_of_special_requests: {negative_requests.shape[0]}")

# Remaining constraints: a binary flag and non-negative head counts.
print("\n--- Additional logical checks ---")

# is_repeated_guest may only hold 0 or 1; anything else (NaN included) is invalid.
repeated_flag_ok = hotel_data["is_repeated_guest"].isin([0, 1])
invalid_repeated = hotel_data.loc[~repeated_flag_ok]
print(f"Invalid is_repeated_guest values: {invalid_repeated.shape[0]}")

# adults should be >= 0
negative_adults = hotel_data.loc[hotel_data["adults"].lt(0)]
print(f"Bookings with negative adults: {negative_adults.shape[0]}")

# children should be >= 0
negative_children = hotel_data.loc[hotel_data["children"].lt(0)]
print(f"Bookings with negative children: {negative_children.shape[0]}")

# babies should be >= 0
negative_babies = hotel_data.loc[hotel_data["babies"].lt(0)]
print(f"Bookings with negative babies: {negative_babies.shape[0]}")

# Closing banner for the whole overview.
banner = "=" * 60
print("\n" + banner)
print("DATA OVERVIEW COMPLETE")
print(banner)

=== Sample Data ===

=== Basic info ===

=== Data Shape ===

=== Numerical Columns Summary===

=== Categorical Columns Summary ===

=== Column names

=== Data types ===

=== Missing values ===

=== Missing value(procents) ===

=== Check for duplicates===

=== Target variable analysis

=== Identify outliers in numerical columns
Unnamed: 0: 0 outliers
++++++++++++++++++++++++++++++++++++++++++++++++++
is_canceled: 0 outliers
++++++++++++++++++++++++++++++++++++++++++++++++++
lead_time: 237 outliers
++++++++++++++++++++++++++++++++++++++++++++++++++
arrival_date_year: 0 outliers
++++++++++++++++++++++++++++++++++++++++++++++++++
arrival_date_week_number: 0 outliers
++++++++++++++++++++++++++++++++++++++++++++++++++
arrival_date_day_of_month: 0 outliers
++++++++++++++++++++++++++++++++++++++++++++++++++
stays_in_weekend_nights: 20 outliers
++++++++++++++++++++++++++++++++++++++++++++++++++
stays_in_week_nights: 270 outliers
++++++++++++++++++++++++++++++++++++++++++++++++++
adults: 2363 ou

'hotel: 2 unique elements'

hotel
City Hotel      6612
Resort Hotel    3388
Name: count, dtype: int64

--------------------------------------------------


'arrival_date_month: 12 unique elements'

arrival_date_month
August     1147
July       1035
May         996
June        946
October     931
Name: count, dtype: int64

--------------------------------------------------


'meal: 5 unique elements'

meal
BB           7718
HB           1200
SC            895
Undefined     116
FB             71
Name: count, dtype: int64

--------------------------------------------------


'country: 109 unique elements'

country
PRT    4069
GBR    1033
FRA     882
ESP     694
DEU     573
Name: count, dtype: int64

--------------------------------------------------


'market_segment: 7 unique elements'

market_segment
Online TA        4648
Offline TA/TO    2074
Groups           1730
Direct           1058
Corporate         413
Name: count, dtype: int64

--------------------------------------------------


'distribution_channel: 4 unique elements'

distribution_channel
TA/TO        8176
Direct       1262
Corporate     544
GDS            18
Name: count, dtype: int64

--------------------------------------------------


'reserved_room_type: 8 unique elements'

reserved_room_type
A    7033
D    1492
E     523
F     236
G     159
Name: count, dtype: int64

--------------------------------------------------


'assigned_room_type: 10 unique elements'

assigned_room_type
A    6227
D    2064
E     678
F     320
G     204
Name: count, dtype: int64

--------------------------------------------------


'deposit_type: 3 unique elements'

deposit_type
No Deposit    8696
Non Refund    1289
Refundable      15
Name: count, dtype: int64

--------------------------------------------------


'customer_type: 4 unique elements'

customer_type
Transient          7521
Transient-Party    2077
Contract            359
Group                43
Name: count, dtype: int64

--------------------------------------------------


'reservation_status: 3 unique elements'

reservation_status
Check-Out    6232
Canceled     3674
No-Show        94
Name: count, dtype: int64

--------------------------------------------------


'reservation_status_date: 853 unique elements'

reservation_status_date
10/21/2015    130
11/25/2016     78
1/1/2015       66
7/6/2015       60
1/18/2016      60
Name: count, dtype: int64

--------------------------------------------------

=== Data Consistency Checks ===
---Non-numeric values in numeric columns---
stays_in_weekend_nights: 10000 non-numeric values found
Examples: [nan, nan, nan, nan, nan]
agent: 10000 non-numeric values found
Examples: [nan, nan, nan, nan, nan]
company: 10000 non-numeric values found
Examples: [nan, nan, nan, nan, nan]
adr: 10000 non-numeric values found
Examples: [nan, nan, nan, nan, nan]
stays_in_weeks_nights: 10000 non-numeric values found
Examples: [nan, nan, nan, nan, nan]

--- Logical relationship checks ---
Bookings with 0 total guests: 18
Bookings with 0 nights stay: 57
Bookings with negative lead time: 0

--- Impossible value checks ---
Bookings with negative ADR: 1
Bookings with negative previous_cancellations: 0
Bookings with negative previous_bookings_not_canceled: 0
Bookings with negative booking_changes: 0
Bookings with negative days_in_waiting_list: 0
Bookings with negative required_car_parking_spaces: 0
Bookings with nega