In [10]:
import pandas as pd
import os
import numpy as np

# Project root is two directory levels above the current working directory;
# the raw CSV files are read from <root>/data/raw.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
RAW_DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw")

# CSV files to load, in the order they are processed and reported.
files = [
    "campaign_desc.csv",
    "campaign_table.csv",
    "coupon.csv",
    "coupon_redempt.csv",
    "causal_data.csv",
    "product.csv",
    "transaction_data.csv",
    "hh_demographic.csv",
]

# Loaded DataFrames, keyed by file name (filled in by the loading loop).
datasets = {}

# Text values that are counted as missing in addition to real NaN entries.
missing_placeholders = [
    "Unknown",
    "None",
    "NONE",
    "None/Unknown",
    "",
    "U",
    "NaN",
]

# Load each raw CSV into `datasets` and print a short report per file:
# its shape, the pandas info summary, and a table of columns that contain
# missing values (real NaN entries plus the text placeholders listed in
# `missing_placeholders`).
for f in files:
    path = os.path.join(RAW_DATA_PATH, f)
    df = pd.read_csv(path)
    datasets[f] = df

    print(f"\n--- {f} ---")
    print(f"Shape: {df.shape}")
    print("\nInfo:")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would append a stray "None" line to the output.
    df.info()

    missing_summary = {}

    for col in df.columns:
        column = df[col]
        count_na = column.isna().sum()

        if pd.api.types.is_numeric_dtype(column):
            # A numeric value can never equal one of the text placeholders,
            # so skip the costly string conversion for these columns.
            count_placeholder = 0
        else:
            # Compare only non-null values so an entry is never counted both
            # as NaN and as a placeholder; surrounding whitespace is ignored.
            non_null_text = column.dropna().astype(str).str.strip()
            count_placeholder = non_null_text.isin(missing_placeholders).sum()

        total_missing = count_na + count_placeholder

        if total_missing > 0:
            missing_summary[col] = {
                "NaN_count": count_na,
                "Unknown_count": count_placeholder,
                "Total_missing": total_missing,
                "Missing_%": total_missing / len(df) * 100,
            }

    if missing_summary:
        print("Columns with missing values:")
        # One row per affected column, one column per counter.
        summary_df = pd.DataFrame.from_dict(missing_summary, orient="index")
        print(summary_df)
    else:
        print("No missing or unknown values detected.")


--- campaign_desc.csv ---
Shape: (30, 4)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 4 columns):
DESCRIPTION    30 non-null object
CAMPAIGN       30 non-null int64
START_DAY      30 non-null int64
END_DAY        30 non-null int64
dtypes: int64(3), object(1)
memory usage: 1.0+ KB
None
No missing or unknown values detected.

--- campaign_table.csv ---
Shape: (7208, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7208 entries, 0 to 7207
Data columns (total 3 columns):
DESCRIPTION      7208 non-null object
household_key    7208 non-null int64
CAMPAIGN         7208 non-null int64
dtypes: int64(2), object(1)
memory usage: 169.0+ KB
None
No missing or unknown values detected.

--- coupon.csv ---
Shape: (124548, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124548 entries, 0 to 124547
Data columns (total 3 columns):
COUPON_UPC    124548 non-null int64
PRODUCT_ID    124548 non-null int64
CAMPAIGN      124548 non-nul