In [2]:


import pandas as pd
from pathlib import Path

# -------------------
# Config: file locations (relative paths, resolved against the working directory)
# -------------------
# Raw export that this script reads.
INPUT_PATH: str = "../../data/raw/Total sales by product variant.csv"
# Where the cleaned CSV is written.
OUTPUT_PATH: str = "../../data/clean/sales_by_product_clean.csv"
# Where the JSON schema description for BigQuery is written.
BIGQUERY_SCHEMA_JSON: str = "../../data/clean/sales_by_product_schema.json"

# -------------------
# Load raw data
# -------------------
df = pd.read_csv(INPUT_PATH)

# -------------------
# Standardize column names
# -------------------
# Normalise the headers step by step: trim surrounding whitespace,
# lower-case, then turn spaces and hyphens into underscores (snake_case).
trimmed_headers = df.columns.str.strip()
lowercase_headers = trimmed_headers.str.lower()
underscored_headers = lowercase_headers.str.replace(" ", "_")
df.columns = underscored_headers.str.replace("-", "_")

# -------------------
# Clean and rename for clarity
# -------------------
rename_map = {
    # Columns that receive a shorter name.
    "product_title": "product_name",
    "product_variant_title": "variant_title",
    "product_variant_sku": "sku",
    "net_items_sold": "units_sold",
    # Identity entries: these columns keep their name and are listed so the
    # map names every column the later steps reference.
    "gross_sales": "gross_sales",
    "discounts": "discounts",
    "returns": "returns",
    "net_sales": "net_sales",
    "taxes": "taxes",
    "total_sales": "total_sales",
}
# Keys absent from the frame are ignored by rename (default errors="ignore").
df = df.rename(rename_map, axis="columns")

# -------------------
# Numeric type conversions
# -------------------
num_cols = [
    "units_sold",
    "gross_sales",
    "discounts",
    "returns",
    "net_sales",
    "taxes",
    "total_sales",
]
# Convert each measure column on its own; values that cannot be parsed as a
# number become NaN (errors="coerce") instead of raising.
numeric_versions = {
    col: pd.to_numeric(df[col], errors="coerce") for col in num_cols
}
# assign() replaces the existing columns in place, so column order is kept.
df = df.assign(**numeric_versions)

# -------------------
# Fill NAs and standardize text
# -------------------
# Placeholder text per column; fillna with a dict only touches the listed
# columns and skips any that are not present in the frame.
# NOTE(review): "N/A" is one of the strings pandas.read_csv treats as missing
# by default, so this placeholder will read back as NaN unless the consumer
# disables default NA parsing -- confirm this is intended.
text_placeholders = {"variant_title": "Default", "sku": "N/A"}
df = df.fillna(value=text_placeholders)

# Trim surrounding whitespace from product names.
product_names = df["product_name"]
df["product_name"] = product_names.str.strip()

# -------------------
# Derive useful metrics
# -------------------
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator element-wise, NaN where denominator is 0.

    A plain Series division by zero yields +/-inf, which would then be written
    to the output CSV. Masking zero denominators to NaN first makes those rows
    come out as missing values instead.
    """
    return numerator / denominator.where(denominator != 0)


# Share of gross sales given away as discounts (magnitude of discounts used).
df["discount_rate"] = _safe_ratio(df["discounts"].abs(), df["gross_sales"])

# Share of pre-return sales that was returned: |returns| / (net_sales + |returns|).
returned_amount = df["returns"].abs()
df["return_rate"] = _safe_ratio(returned_amount, df["net_sales"] + returned_amount)

# -------------------
# Output cleaned dataset
# -------------------
clean_csv = Path(OUTPUT_PATH)
# Create the destination folder (and any missing parents) before writing.
clean_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(clean_csv, index=False)  # index=False: do not write the row index
row_count = len(df)
print(f"✅ Cleaned CSV saved -> {OUTPUT_PATH} ({row_count} rows)")

# -------------------
# Optional: BigQuery schema export
# -------------------
import json

# Columns declared FLOAT: the converted raw measures plus the two derived
# rate columns. The rates are not part of num_cols, so testing membership in
# num_cols alone would declare them STRING even though they hold floats.
float_cols = set(num_cols) | {"discount_rate", "return_rate"}
schema = [
    {"name": c, "type": "FLOAT" if c in float_cols else "STRING"}
    for c in df.columns
]

schema_file = Path(BIGQUERY_SCHEMA_JSON)
schema_file.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding so the output does not depend on the platform default.
with open(schema_file, "w", encoding="utf-8") as f:
    json.dump(schema, f, indent=2)
print(f"✅ BigQuery schema saved -> {BIGQUERY_SCHEMA_JSON}")


✅ Cleaned CSV saved -> ../../data/clean/sales_by_product_clean.csv (51 rows)
✅ BigQuery schema saved -> ../../data/clean/sales_by_product_schema.json
