In [None]:
import pandas as pd 

# Load the bookings CSV into `df`; the cells below all read from this frame.
csv_path = "hotel_bookings.csv"
df = pd.read_csv(csv_path)
# df.head()  # uncomment for a quick preview of the first rows

In [None]:
# Descriptive statistics as produced by DataFrame.describe().
summary_stats = df.describe()
print("Statistical summary: \n", summary_stats)

In [None]:
# Per-column dtypes, to spot columns that were parsed with an unexpected type.
column_dtypes = df.dtypes
print("Column types: \n", column_dtypes)

In [None]:
# Count missing entries per column, then keep only the columns that have any.
null_counts = df.isna().sum()
null_columns = null_counts.loc[null_counts.gt(0)]

# Report the affected columns, or state explicitly that there are none.
if null_columns.empty:
    print("No null values found.")
else:
    print("Null values:\n", null_columns)


In [None]:
# Share of missing values per column, in percent; only affected columns are shown.
null_percentage = df.isnull().mean() * 100
print("Percentage of null values:\n", null_percentage[null_percentage > 0])

# Drop 'agent' and 'company' entirely.
# NOTE(review): presumably because of their high null share - confirm against
# the percentages printed above.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution would otherwise raise KeyError.
df = df.drop(columns=['agent', 'company'], errors='ignore')

# Missing 'children' values are filled with 0.
df['children'] = df['children'].fillna(0)

# Missing 'country' values are filled with the most frequent country.
# mode() returns an empty Series when the column has no non-null values, so
# guard the lookup instead of failing on the first element.
country_mode = df['country'].mode()
if not country_mode.empty:
    df['country'] = df['country'].fillna(country_mode.iloc[0])

# Remaining null counts per column after the fixes above.
print(df.isnull().sum())

In [None]:
# Number of rows that exactly repeat an earlier row.
duplicate_row_count = df.duplicated().sum()
print("Duplicated rows:", duplicate_row_count)

# Keep the first occurrence of each row and discard the repeats.
df = df.drop_duplicates()

In [None]:
# Share of duplicated rows, in percent. df.duplicated().mean() is a single
# number, not a Series, so it is printed directly rather than filtered with a
# boolean mask.
# When this runs after the de-duplication cell the value should be 0, which
# makes it a sanity check that no duplicates remain.
duplicate_percentage = df.duplicated().mean() * 100
print("Percentage of duplicated rows:\n", duplicate_percentage)

In [None]:

# Relative frequency of each is_canceled value (fractions, not counts).
cancellation_share = df["is_canceled"].value_counts(normalize=True)
cancellation_share

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Readable status labels for the 0/1 cancellation flag, stored as a new df column.
status_labels = {0: 'Not Canceled', 1: 'Canceled'}
df['is_canceled_label'] = df['is_canceled'].map(status_labels)

# Count of bookings per status. hue repeats x so the palette is applied per
# bar, and the legend is turned off.
plt.figure(figsize=(6, 4))
ax = sns.countplot(
    data=df,
    x="is_canceled_label",
    hue="is_canceled_label",
    palette="Set2",
    legend=False,
)
ax.set_title("Cancellation Distribution")
ax.set_xlabel("Booking Status")
ax.set_ylabel("Number of Bookings")

plt.tight_layout()
plt.show()


In [None]:
# Mean of is_canceled per deposit_type; `pivot` is plotted in a later cell.
pivot = df.pivot_table(
    index="deposit_type",
    values="is_canceled",
    aggfunc="mean",
)
print(pivot)

In [None]:
# Mean of is_canceled for every observed lead_time value.
pivot_lead = df.pivot_table(
    values="is_canceled",
    index="lead_time",
    aggfunc="mean"
)

# Find the smallest observed lead_time from which EVERY larger observed
# lead_time also has a cancellation rate of exactly 1. Taking the first
# lead_time whose rate equals 1 is not enough: a later lead_time may still
# contain non-canceled bookings, which would make the printed claim false.
lead_rates = pivot_lead["is_canceled"]
partially_kept = lead_rates.index[lead_rates < 1]

if partially_kept.empty:
    # Every observed lead_time is fully canceled (or there is no data at all).
    fully_canceled_tail = lead_rates.index
else:
    # Only lead times beyond the last one with a surviving booking qualify.
    fully_canceled_tail = lead_rates.index[lead_rates.index > partially_kept.max()]

if len(fully_canceled_tail) > 0:
    threshold = fully_canceled_tail.min()
    print(f"All bookings are canceled from lead_time = {threshold} days onward.")
else:
    # Same "missing" value that an empty selection used to produce here.
    threshold = float("nan")
    print("There is no lead_time beyond which all bookings are canceled.")

In [None]:
# Line chart of the per-lead_time cancellation rate held in pivot_lead.
plt.figure(figsize=(10,6))
ax = plt.gca()
ax.plot(
    pivot_lead.index,
    pivot_lead["is_canceled"],
    color='teal',
    linewidth=2,
)
ax.set_title("Cancellation Rate vs Lead Time (Days)")
ax.set_xlabel("Lead Time (Days)")
ax.set_ylabel("Cancellation Rate")
ax.grid(True, linestyle='--', alpha=0.6)
# Hard-coded upper bound for the x-axis.
# NOTE(review): presumably the lead-time threshold printed by the previous
# cell - confirm.
ax.set_xlim(0, 357)

plt.show()

In [None]:
# Bar chart of the mean cancellation rate per deposit type, from `pivot`.
plt.figure(figsize=(7,5))
sns.barplot(
    # deposit_type is the index of `pivot`; reset_index() turns it into a real
    # column so x/hue can refer to it by name.
    data=pivot.reset_index(),
    x="deposit_type",
    y="is_canceled",
    # Passing palette without hue is deprecated in seaborn 0.13; mirror x in
    # hue and hide the legend, as the count plot earlier in the notebook does.
    hue="deposit_type",
    palette="Set2",
    legend=False,
)

plt.title("Cancellation Rate by Deposit Type")
plt.xlabel("Deposit Type")
plt.ylabel("Mean Cancellation Rate")
# A rate is a fraction, so pin the y-axis to the full 0-1 range.
plt.ylim(0, 1)
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.show()


In [None]:
# Percentage of bookings per deposit type.
deposit_share_pct = df['deposit_type'].value_counts(normalize=True).mul(100)
deposit_share_pct

In [None]:

# Mean of is_canceled per arrival month, ordered from highest to lowest rate.
pivot_month = df.pivot_table(
    values="is_canceled",
    index="arrival_date_month",
    aggfunc="mean",
).sort_values("is_canceled", ascending=False)
print(pivot_month)

plt.figure(figsize=(10,5))
sns.barplot(
    # The month is the index of pivot_month; expose it as a column for x/hue.
    data=pivot_month.reset_index(),
    x="arrival_date_month",
    y="is_canceled",
    # Passing palette without hue is deprecated in seaborn 0.13; mirror x in
    # hue and hide the legend to keep the same per-bar colouring.
    hue="arrival_date_month",
    palette="viridis",
    legend=False,
)
plt.title("Cancellation Rate by Arrival Month")
plt.ylabel("Cancellation Rate")
plt.xlabel("Arrival Month")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Mean of is_canceled per market segment, ordered from highest to lowest rate.
pivot_segment = df.pivot_table(
    values="is_canceled",
    index="market_segment",
    aggfunc="mean",
).sort_values("is_canceled", ascending=False)

sns.barplot(
    # The segment is the index of pivot_segment; expose it as a column for x/hue.
    data=pivot_segment.reset_index(),
    x="market_segment",
    y="is_canceled",
    # Passing palette without hue is deprecated in seaborn 0.13; mirror x in
    # hue and hide the legend to keep the same per-bar colouring.
    hue="market_segment",
    palette="coolwarm",
    legend=False,
)
plt.title("Cancellation Rate by Market Segment")
# Explicit axis labels, matching the other rate charts in this notebook.
plt.xlabel("Market Segment")
plt.ylabel("Cancellation Rate")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Two side-by-side box plots comparing a guest's booking history between
# bookings that were not canceled (is_canceled = 0) and canceled (1).
fig, axes = plt.subplots(1, 2, figsize=(12,5))

# Left panel: number of previous cancellations.
sns.boxplot(x="is_canceled", y="previous_cancellations", data=df, ax=axes[0])
axes[0].set_title("Previous Cancellations vs. New Cancellation")

# Right panel: number of previous bookings that were not canceled.
sns.boxplot(x="is_canceled", y="previous_bookings_not_canceled", data=df, ax=axes[1])
axes[1].set_title("Previous Successful Bookings vs. New Cancellation")

# The cell used to end with a bare `f`, an undefined name that raised
# NameError. Finish the figure the same way the other plotting cells do.
fig.tight_layout()
plt.show()

In [None]:
# Columns removed before export:
# - reservation_status / reservation_status_date
#   NOTE(review): presumably dropped because they reveal the booking outcome -
#   confirm.
# - is_canceled_label: helper column created for the count plot; it is a direct
#   mapping of is_canceled, so it must not end up in the cleaned data set.
# errors="ignore" keeps the cell re-runnable and independent of whether the
# plotting cell was executed.
export_excluded_columns = [
    'reservation_status',
    'reservation_status_date',
    'is_canceled_label',
]
df = df.drop(columns=export_excluded_columns, errors="ignore")

# Write the cleaned frame without the positional index.
df.to_csv("cleaned_hotel_data.csv", index=False)