# 📊 Exploratory Data Analysis (EDA)
Analyzing daily store transaction data (date, store number, transaction count): inspecting its structure, cleaning it, and building a complete date × store panel.

In [None]:
import pandas as pd
import numpy as np

# NOTE(review): absolute, machine-specific path — this cell only runs where this
# exact directory layout exists. Point RAW_TRANSACTIONS_PATH at your local copy
# of transactions.csv when running elsewhere.
RAW_TRANSACTIONS_PATH = r"D:\Topic_13_Project\Topic_13_Retail_Store_Sales_Time_Series\data\raw\transactions.csv"

df = pd.read_csv(RAW_TRANSACTIONS_PATH)

# Only the last expression of a cell is rendered automatically, so the
# intermediate summaries are displayed explicitly; as bare expressions in the
# middle of the cell they would be computed and thrown away.
display(df.head(5))
df.info()  # prints its report directly, no display() needed
display(df.describe())
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          83488 non-null  object
 1   store_nbr     83488 non-null  int64 
 2   transactions  83488 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.9+ MB


date            0
store_nbr       0
transactions    0
dtype: int64

## Cleaning steps


In [2]:
def clean_transactions(raw: pd.DataFrame) -> pd.DataFrame:
    """Clean raw store transactions and expand them to a complete daily panel.

    Steps, in order:
      1. Strip surrounding whitespace from column names and verify that
         ``date``, ``store_nbr`` and ``transactions`` are present.
      2. Parse ``date``; unparseable values become NaT and those rows are
         dropped. Dates are truncated to midnight so that all rows of one
         calendar day share a single key.
      3. Coerce ``store_nbr`` and ``transactions`` to numbers; rows where
         either fails to parse, or where ``transactions`` is negative, are
         dropped. Both columns are then cast to ``int`` (fractional values
         are truncated by the cast).
      4. Sum ``transactions`` over duplicate ``(date, store_nbr)`` pairs.
      5. Build every combination of calendar day (first to last observed
         date) and observed store; combinations without data receive
         ``transactions = 0`` and ``is_imputed = 1``.

    Parameters
    ----------
    raw : pd.DataFrame
        Input frame; it is copied, not modified. Columns other than the
        three required ones are not carried into the result.

    Returns
    -------
    pd.DataFrame
        Columns ``date`` (``YYYY-MM-DD`` string), ``store_nbr`` (int),
        ``transactions`` (int) and ``is_imputed`` (0/1), one row per
        (day, store), ordered by day and then store. If no valid row
        survives cleaning, an empty frame with these columns is returned.

    Raises
    ------
    ValueError
        If any required column is missing.
    """
    df = raw.copy()

    # 1. Standardize column names
    df.columns = [c.strip() for c in df.columns]

    required_cols = {"date", "store_nbr", "transactions"}
    missing = required_cols - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # 2. Parse date. normalize() drops any time-of-day component: without it,
    #    same-day rows with different times would not be summed together and
    #    would not line up with the daily grid built below, so they would be
    #    silently lost in the left merge.
    df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.normalize()

    # 3. Coerce numeric types (unparseable values become NaN)
    df["store_nbr"] = pd.to_numeric(df["store_nbr"], errors="coerce")
    df["transactions"] = pd.to_numeric(df["transactions"], errors="coerce")

    # Drop rows with an invalid date, store or transaction count
    df = df.dropna(subset=["date", "store_nbr", "transactions"])

    # 4. Transactions are counts, so negative values are treated as invalid
    df = df[df["transactions"] >= 0]

    # With no valid rows there is no date range to expand (min/max would be
    # NaT and pd.date_range would raise), so return an empty, well-formed frame.
    if df.empty:
        return pd.DataFrame(
            {
                "date": pd.Series(dtype="object"),
                "store_nbr": pd.Series(dtype=int),
                "transactions": pd.Series(dtype=int),
                "is_imputed": pd.Series(dtype=int),
            }
        )

    # astype on the whole frame returns a new object, avoiding assignment
    # into a filtered slice.
    df = df.astype({"store_nbr": int, "transactions": int})

    # 5. Resolve duplicates: sum transactions per (date, store_nbr).
    #    groupby sorts by the keys by default.
    df = df.groupby(["date", "store_nbr"], as_index=False)["transactions"].sum()

    # 6. Build complete panel (all dates x all stores)
    all_dates = pd.date_range(df["date"].min(), df["date"].max(), freq="D")
    all_stores = np.sort(df["store_nbr"].unique())

    panel = (
        pd.MultiIndex.from_product([all_dates, all_stores], names=["date", "store_nbr"])
          .to_frame(index=False)
          .merge(df, on=["date", "store_nbr"], how="left")
    )

    # 7. Flag combinations that had no data, then fill them with 0.
    #    The flag must be computed before fillna erases the NaN markers.
    panel["is_imputed"] = panel["transactions"].isna().astype(int)
    panel["transactions"] = panel["transactions"].fillna(0).astype(int)

    # Make date string for clean CSV (easy to submit/read)
    panel["date"] = panel["date"].dt.strftime("%Y-%m-%d")

    return panel


# Run the cleaning pipeline on the raw frame and preview the first rows of the panel.
df_clean = df.pipe(clean_transactions)
df_clean.head(5)


Unnamed: 0,date,store_nbr,transactions,is_imputed
0,2013-01-01,1,0,1
1,2013-01-01,2,0,1
2,2013-01-01,3,0,1
3,2013-01-01,4,0,1
4,2013-01-01,5,0,1


In [3]:
# ====== QUICK CHECKS ======
# Sanity-check the cleaned panel: dimensions, remaining nulls, uniqueness of the
# (date, store_nbr) key, and how many distinct stores and days it covers.
print(f"Shape: {df_clean.shape}")
print(f"Missing values:\n {df_clean.isna().sum()}")
print(f"Duplicate (date, store_nbr): {df_clean.duplicated(subset=['date', 'store_nbr']).sum()}")
print(f"Unique stores: {df_clean['store_nbr'].nunique()}")
print(f"Unique days: {pd.to_datetime(df_clean['date']).nunique()}")

# Per-column summary, transposed so each column of the panel is one row.
df_clean.describe(include="all").T.head(10)


Shape: (91152, 4)
Missing values:
 date            0
store_nbr       0
transactions    0
is_imputed      0
dtype: int64
Duplicate (date, store_nbr): 0
Unique stores: 54
Unique days: 1688


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
date,91152.0,1688.0,2017-08-15,54.0,,,,,,,
store_nbr,91152.0,,,,27.5,15.58587,1.0,14.0,27.5,41.0,54.0
transactions,91152.0,,,,1552.121127,1034.916203,0.0,926.0,1329.0,1974.0,8359.0
is_imputed,91152.0,,,,0.084079,0.277508,0.0,0.0,0.0,0.0,1.0


In [4]:
# ====== EXPORT CLEANED DATA ======
# Persist the cleaned panel as CSV without the row index. The path is relative,
# so the file lands in the current working directory. The path is echoed last
# so it shows up as the cell output.
output_path = "transactions_cleaned.csv"
df_clean.to_csv(path_or_buf=output_path, index=False)
output_path


'transactions_cleaned.csv'