In [0]:
import pandas as pd
from datetime import datetime
import os

In [0]:
# Paths
# Input: the Bronze-layer customers extract. Output: the Silver-layer copy.
SOURCE_PATH = "../Bronze/customers.csv"
TARGET_DIR = "../Silver"
TARGET_FILE = "/".join([TARGET_DIR, "customers.csv"])

# Create the output directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(TARGET_DIR, exist_ok=True)

In [0]:
# Load the customers extract from the configured SOURCE_PATH (rather than a
# repeated path literal) so the input location is defined in exactly one place.
df = pd.read_csv(SOURCE_PATH)

# --- Quick profiling of the raw data before any cleaning is applied ---

print("===== DATASET SHAPE =====")
print(df.shape)

# Rows that repeat an earlier (customer_id, first_name, last_name) combination.
print("\n===== DUPLICATES =====")
print(df.duplicated(subset=["customer_id","first_name","last_name"]).sum())

# Missing values per column, as absolute counts ...
print("\n===== NULL VALUES =====")
print(df.isnull().sum())

# ... and as a percentage of all rows, rounded to two decimals.
print("\n===== NULL PERCENTAGE =====")
print((df.isnull().mean() * 100).round(2))

print("\n===== AGE DISTRIBUTION =====")
print(df["age"].describe())

# dropna=False keeps missing genders visible as their own bucket.
print("\n===== GENDER VALUES =====")
print(df["gender"].value_counts(dropna=False))


In [0]:
# String cleanup
# Names: trim surrounding whitespace and title-case them.
df["first_name"] = df["first_name"].str.strip().str.title()
df["last_name"] = df["last_name"].str.strip().str.title()
# Emails: lower-case and trim surrounding whitespace.
df["email"] = df["email"].str.lower().str.strip()

# Call head() so the first rows are rendered; a bare `df.head` only shows the
# bound-method repr instead of a preview of the cleaned frame.
df.head()


In [0]:
# Normalize gender
# Values are trimmed and lower-cased before lookup. The abbreviated forms
# "m"/"f" are accepted alongside "male"/"female" so that (a) already-abbreviated
# source values are not misclassified and (b) re-running this cell is a no-op
# instead of collapsing the previously written "M"/"F" codes into "Other".
# Anything unmapped, including missing values, becomes "Other".
df["gender"] = df["gender"].str.strip().str.lower().map({
    "male": "M",
    "female": "F",
    "m": "M",
    "f": "F",
}).fillna("Other")


In [0]:
# Preview the first rows; head must be called, otherwise only the bound-method
# repr is displayed.
df.head()

In [0]:
# Type conversions
# Parse both timestamp columns; errors="coerce" turns unparseable values into
# NaT instead of raising.
for ts_col in ("registered", "ingestion_timestamp"):
    df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce")

print(df["registered"], df["ingestion_timestamp"])


In [0]:
# Missing-value count per column, columns with the most nulls first.
null_summary = (
    df.isnull()
    .sum()
    .sort_values(ascending=False)
)
print(null_summary)


In [0]:
# Inspect the age column before applying quality rules. Only the last
# expression of a cell is rendered, so the earlier results are printed
# explicitly instead of being silently discarded.
print(df["age"].describe())

print("\nRows with negative age:")
print(df[df["age"] < 0])

# NOTE(review): this check uses 120 as the upper bound, while the data quality
# rule applied later keeps only ages between 0 and 100 - confirm which limit
# is intended.
df[df["age"] > 120]


In [0]:
# Inspect the spent column. The summary is printed explicitly because only the
# last expression of a cell is rendered.
print(df["spent"].describe())

# Rows with a negative spent amount.
df[df["spent"] < 0]


In [0]:
# Data quality rules
# A row is kept only if it has a customer_id, an age within [0, 100] and a
# non-negative spent amount. Comparisons against NaN evaluate to False, so
# rows with a missing age or spent value are dropped as well.
has_customer_id = df["customer_id"].notna()
age_in_range = df["age"].between(0, 100)
spent_non_negative = df["spent"] >= 0
df = df[has_customer_id & age_in_range & spent_non_negative]

In [0]:
# Flag every row that shares its (customer_id, first_name, last_name) with at
# least one other row; keep=False marks all members of a duplicate group.
dup_key_columns = ["customer_id", "first_name", "last_name"]
is_duplicate_row = df.duplicated(subset=dup_key_columns, keep=False)
duplicates = df[is_duplicate_row]

print(f"Number of duplicate customers: {duplicates.shape[0]}")
# Show a sample of the flagged rows, grouped together by customer_id.
duplicates.sort_values("customer_id").head(10)

In [0]:
# Deduplication
# Keep the first row seen for each customer_id and discard later repeats.
repeated_customer_id = df.duplicated(subset=["customer_id"])
df = df[~repeated_customer_id]


In [0]:
# Post-cleaning distribution checks. The gender counts are printed explicitly
# because only the last expression of a cell is rendered; dropna=False keeps
# missing values visible as their own bucket.
print(df["gender"].value_counts(dropna=False))

df["is_married"].value_counts(dropna=False)


In [0]:
# Silver metadata
# Stamp every row with the processing time. assign() returns a new frame, so
# this does not write into a frame derived from boolean filtering (which can
# raise SettingWithCopyWarning on pandas < 3).
# datetime.now() is called without a tz argument, so the stamp is a naive
# local-time value.
df = df.assign(silver_processed_at=datetime.now())

In [0]:
# Write the cleaned customers frame to the Silver layer file (TARGET_FILE).
# index=False leaves the DataFrame index out of the CSV.
df.to_csv(TARGET_FILE, index=False)

print("Customers Silver layer created successfully.")