Libraries and Display options

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import display

# Notebook-wide pandas display settings: no limit on the number of columns,
# rows, or column text width that gets rendered, and a 1000-character wide
# console representation.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", None),
    ("display.max_colwidth", None),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the raw CSV plus the cleaned / feature-engineered pickles used below.
# Every input lives under one project data directory, so relocating the
# project only requires editing DATA_DIR.
DATA_DIR = Path("/home/antonios/Desktop/Practica_de_vara/data-science-internship/data")
RAW_DIR = DATA_DIR / "raw"
CLEANED_DIR = DATA_DIR / "cleaned"
RESULTS_DIR = DATA_DIR / "results"

data = pd.read_csv(RAW_DIR / "hotel_booking_cancellation_prediction.csv")

# Intermediate cleaning snapshots; uncomment the one you want to inspect.
# data_missing = pd.read_csv(CLEANED_DIR / "01_missing_values_cleaned.csv")
# data_duplicate = pd.read_csv(CLEANED_DIR / "02_duplicates_cleaned.csv")
# data_statistical = pd.read_csv(CLEANED_DIR / "03_statistical_outliers_cleaned.csv")
# data_date_errors = pd.read_csv(CLEANED_DIR / "04_data_errors_cleaned.csv")

# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that were produced by this project.
data_dtype = pd.read_pickle(CLEANED_DIR / "07_dtypes_cleaned.pkl")
final = pd.read_pickle(CLEANED_DIR / "final_cleaned_data.pkl")
encoded_data = pd.read_pickle(RESULTS_DIR / "feature_engineered_data.pkl")

Initial Data Exploration

In [19]:
# First look at the cleaned data: how many bookings fall in each market segment.
print("=== Sample Data ===")
market_segment_counts = final["market_segment"].value_counts()
display(market_segment_counts)

# NOTE(review): the two headers below are printed with nothing underneath,
# because the display calls that belong to them are currently disabled.
print("\n=== Basic info ===")
# display(data_dtype.dtypes)

print("\n=== Data Shape ===")
# display(f"Rows: {data.shape[0]}, Columns: {data.shape[1]}")

=== Sample Data ===


market_segment
Online TA        4573
Offline TA/TO    2039
Groups           1707
Direct           1036
Corporate         410
Complementary      53
Aviation           23
Name: count, dtype: int64


=== Basic info ===

=== Data Shape ===


In [None]:
# Independent (deep) copy of `data`; the cells below work on this copy.
hotel_data = data.copy(deep=True)

Summary of Numerical/Categorical columns

In [None]:
# Descriptive statistics, reported separately for numeric and object columns.
print("\n=== Numerical Columns Summary===")
numeric_summary = hotel_data.describe()
display(numeric_summary)

print("\n=== Categorical Columns Summary ===")
categorical_summary = hotel_data.describe(include="object")
display(categorical_summary)

Column names and data types

In [None]:
print("\n=== Column names")
# The explicit column-name listing is disabled; the dtypes table printed
# below is indexed by column name, so the names are still visible there.

print("\n=== Data types ===")
column_dtypes = hotel_data.dtypes
display(column_dtypes)

Missing Values Check

In [None]:
# Null counts per column, then the same counts as a percentage of all rows.
print("\n=== Missing values ===")
null_counts = hotel_data.isnull().sum()
display(null_counts)

print("\n=== Missing value(procents) ===")
null_share_pct = null_counts / len(hotel_data) * 100
display(null_share_pct)

Check duplicates

In [None]:
print("\n=== Check for duplicates===")
# duplicated() flags every repeat of an earlier identical row, so the sum is
# the number of rows that would be dropped by de-duplication.
duplicate_row_count = hotel_data.duplicated().sum()
display(duplicate_row_count)

Target variable analysis

In [None]:
print("\n=== Target variable analysis")
# NOTE(review): the heading says "target variable", but the columns inspected
# here are the two room-type columns -- confirm this is the intended analysis.
#
# A notebook cell only renders its *last* bare expression, so each frequency
# table is passed to display() explicitly; otherwise the first one is
# computed and silently thrown away.
display(hotel_data["reserved_room_type"].value_counts())
display(hotel_data["assigned_room_type"].value_counts())

Outliers in Numerical / Categorical columns

In [None]:
print("\n=== Identify outliers in numerical columns")
numerical_cols = hotel_data.select_dtypes(include=["int64","float64"]).columns
print(numerical_cols)

# Tukey's rule: a value is an outlier when it lies more than
# IQR_MULTIPLIER * IQR below the first quartile or above the third quartile.
IQR_MULTIPLIER = 1.5

for col in numerical_cols:
    Q1 = hotel_data[col].quantile(0.25)
    Q3 = hotel_data[col].quantile(0.75)  # third quartile (75th percentile)
    IQR = Q3 - Q1
    lower_fence = Q1 - IQR_MULTIPLIER * IQR
    upper_fence = Q3 + IQR_MULTIPLIER * IQR
    # Comparisons against NaN are False, so missing values are never counted.
    outliers = hotel_data[(hotel_data[col] < lower_fence) | (hotel_data[col] > upper_fence)]
    print(f"{col}: {len(outliers)} outliers")
    print("+" * 50)

print("\n=== Identify outliers in categorical columns")
categorical_columns = hotel_data.select_dtypes(include="object").columns
# For every object column: its cardinality and its most frequent values
# (value_counts() sorts by descending frequency, head() keeps the top five).
for col in categorical_columns:
    # print() rather than display(): display() of a plain string renders its
    # quoted repr, unlike the other text lines in this notebook.
    print(f"{col}: {hotel_data[col].nunique()} unique elements")
    display(hotel_data[col].value_counts().head())
    print("-" * 50)


Data consistency checks

Non-numeric values in numeric columns

In [None]:
print("---Non-numeric values in numeric columns---")

# A value is flagged when its text form contains any character other than a
# digit, a decimal point or a minus sign. The same pattern is used for both
# the count and the examples, so the two can never disagree.
# Note: missing values stringify to "nan" and values rendered in exponent
# notation contain "e", so both are reported by this check.
NON_NUMERIC_PATTERN = r"[^0-9.-]"

for col in numerical_cols:
    is_non_numeric = hotel_data[col].astype(str).str.contains(NON_NUMERIC_PATTERN, regex=True)
    non_numeric = is_non_numeric.sum()
    if non_numeric > 0:
        print(f"{col}: {non_numeric} non-numeric values found")
        examples = hotel_data.loc[is_non_numeric, col].head()
        print(f"Examples: {examples.to_list()}")


Logical Relationship Checks

In [None]:
# Each check selects the rows that violate one rule and reports how many there are.

# Rule: a booking has at least one guest.
# NOTE(review): a missing value in any of the three columns makes the sum NaN,
# and NaN never equals 0, so such rows are not reported here.
total_guests = hotel_data["adults"] + hotel_data["children"] + hotel_data["babies"]
zero_guests = hotel_data.loc[total_guests.eq(0)]
print(f"Bookings with 0 total guests: {zero_guests.shape[0]}")

# Rule: a booking covers at least one night.
total_nights = hotel_data['stays_in_weekend_nights'] + hotel_data['stays_in_week_nights']
zero_nights = hotel_data.loc[total_nights.eq(0)]
print(f"Bookings with 0 nights stay: {zero_nights.shape[0]}")

# Rule: lead time is never negative.
negative_lead = hotel_data.loc[hotel_data["lead_time"].lt(0)]
print(f"Bookings with negative lead time: {negative_lead.shape[0]}")

# Rule: is_repeated_guest is a 0/1 flag.
is_valid_repeated_flag = hotel_data['is_repeated_guest'].isin([0, 1])
invalid_repeated = hotel_data.loc[~is_valid_repeated_flag]
print(f"Invalid is_repeated_guest values: {invalid_repeated.shape[0]}")

# Rule: head counts are never negative.
negative_adults = hotel_data.loc[hotel_data['adults'].lt(0)]
print(f"Bookings with negative adults: {negative_adults.shape[0]}")

negative_children = hotel_data.loc[hotel_data['children'].lt(0)]
print(f"Bookings with negative children: {negative_children.shape[0]}")

negative_babies = hotel_data.loc[hotel_data['babies'].lt(0)]
print(f"Bookings with negative babies: {negative_babies.shape[0]}")

Impossible Values

In [None]:
# Columns checked here should never be below zero; each check keeps the
# offending rows in its own variable and prints how many were found.

negative_adr = hotel_data.loc[hotel_data['adr'].lt(0)]
print(f"Bookings with negative ADR: {negative_adr.shape[0]}")

negative_prev_cancel = hotel_data.loc[hotel_data['previous_cancellations'].lt(0)]
print(f"Bookings with negative previous_cancellations: {negative_prev_cancel.shape[0]}")

negative_prev_bookings = hotel_data.loc[hotel_data['previous_bookings_not_canceled'].lt(0)]
print(f"Bookings with negative previous_bookings_not_canceled: {negative_prev_bookings.shape[0]}")

negative_changes = hotel_data.loc[hotel_data['booking_changes'].lt(0)]
print(f"Bookings with negative booking_changes: {negative_changes.shape[0]}")

negative_waiting = hotel_data.loc[hotel_data['days_in_waiting_list'].lt(0)]
print(f"Bookings with negative days_in_waiting_list: {negative_waiting.shape[0]}")

negative_parking = hotel_data.loc[hotel_data['required_car_parking_spaces'].lt(0)]
print(f"Bookings with negative required_car_parking_spaces: {negative_parking.shape[0]}")

negative_requests = hotel_data.loc[hotel_data['total_of_special_requests'].lt(0)]
print(f"Bookings with negative total_of_special_requests: {negative_requests.shape[0]}")