# Shopify Customers — Reusable Cleaning Notebook (`shopify_customers.ipynb`)

**Purpose:** Clean and standardize Shopify `customers_export.csv` for downstream analytics (BigQuery / SQL models / BI dashboards).  
**Inputs:** Raw `customers_export.csv` exported from Shopify Admin.  
**Outputs:** `customers_clean.csv` (tidy, analysis-ready) + optional BigQuery schema JSON.

---

## How to use
1. Set parameters in the **Parameters** cell below (paths, columns to keep, type overrides).
2. Run notebook top-to-bottom.
3. Upload `customers_clean.csv` to BigQuery (recommended dataset: `shopify_clean.customers`).

**Key features**
- Robust column normalization (snake_case)
- Drop fully empty columns/rows
- Safe type coercion (dates, numerics)
- Excludes internal employees:
  - Customers with first or last name containing 'test' (case-insensitive)
  - Customers with email domain '@inscoder.com'
- Handles oddities (leading `'` in ZIPs, mixed currency formats)
- Selects a recommended subset of columns for analytics
- Emits a BigQuery schema JSON (optional convenience)


In [1]:
# ======= Parameters (edit these) =======
INPUT_PATH = "../../data/raw/customers_export.csv"              # path to your raw Shopify customers CSV
OUTPUT_PATH = "../../data/clean/customers_clean.csv"         # cleaned CSV output
BIGQUERY_SCHEMA_JSON = "customers_clean_bq_schema.json"  # optional schema file

# Columns to keep (subset). Any missing columns will be ignored gracefully.
KEEP_COLS = [
    "Customer ID", "First Name", "Last Name", "Email",
    "Accepts Email Marketing", "Accepts SMS Marketing",
    "Default Address Company", "Default Address Address1", "Default Address Address2",
    "Default Address City", "Default Address Province Code", "Default Address Country Code",
    "Default Address Zip", "Default Address Phone", "Phone",
    "Total Spent", "Total Orders",
    "Tags", "Note", "Tax Exempt"
]

# Numeric columns to coerce
NUM_COLS = ["Total Spent", "Total Orders"]

# Optional: force certain dtype (after cleaning column names). Keys should be final snake_case names.
# Example: DTYPE_OVERRIDES = {"total_orders": "Int64", "total_spent": "Float64"}
DTYPE_OVERRIDES = {}


In [2]:
import pandas as pd
import numpy as np
import re
import json 
from pathlib import Path

pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)


In [3]:
def to_snake(s: str) -> str:
    """Normalize a column name to snake_case and strip unsafe chars."""
    s = s.strip()
    s = re.sub(r"[^0-9a-zA-Z_]+", "_", s)
    s = re.sub(r"__+", "_", s)
    return s.lower().strip("_")

def load_shopify_csv(path: str) -> pd.DataFrame:
    """Load CSV using sensible defaults for Shopify exports."""
    # utf-8-sig handles BOM if present
    df = pd.read_csv(path, dtype=str, encoding="utf-8-sig", keep_default_na=True)
    return df

def strip_apostrophes(series: pd.Series) -> pd.Series:
    """Remove leading apostrophes often seen in ZIP codes like `'53188`."""
    return series.str.replace(r"^'", "", regex=True)

def coerce_numeric(series: pd.Series) -> pd.Series:
    """Coerce a text series to float, handling commas and currency symbols."""
    if series.dtype.name.startswith("float") or series.dtype.name.startswith("int"):
        return series
    cleaned = series.astype(str).str.replace(",", "", regex=False)
    cleaned = cleaned.str.replace(r"[^0-9.\-]", "", regex=True)
    return pd.to_numeric(cleaned, errors="coerce")

def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize column names to snake_case."""
    df = df.copy()
    df.columns = [to_snake(c) for c in df.columns]
    return df

def drop_empty(df: pd.DataFrame) -> pd.DataFrame:
    """Drop fully empty rows and columns."""
    df = df.dropna(how="all")
    df = df.dropna(axis=1, how="all")
    return df

def select_columns(df: pd.DataFrame, keep_cols: list) -> pd.DataFrame:
    """Select a subset of columns, ignoring those not present."""
    existing = [c for c in keep_cols if c in df.columns]
    return df[existing].copy()

def exclude_internal_employees(df: pd.DataFrame) -> pd.DataFrame:
    """
    Exclude internal employees from the dataset.
    Filters out customers where:
    - First Name or Last Name contains 'test' (case-insensitive)
    - Email domain is '@inscoder.com'
    """
    df = df.copy()
    original_count = len(df)
    
    # Filter out customers with 'test' in first or last name (case-insensitive)
    # After normalization, "First Name" becomes "first_name" and "Last Name" becomes "last_name"
    first_name_col = None
    last_name_col = None
    
    for c in df.columns:
        col_lower = c.lower()
        if col_lower == 'first_name' or (col_lower.startswith('first') and col_lower.endswith('name')):
            first_name_col = c
        if col_lower == 'last_name' or (col_lower.startswith('last') and col_lower.endswith('name')):
            last_name_col = c
    
    test_mask = pd.Series([False] * len(df), index=df.index)
    
    if first_name_col:
        test_mask |= df[first_name_col].astype(str).str.lower().str.contains('test', na=False, regex=False)
    
    if last_name_col:
        test_mask |= df[last_name_col].astype(str).str.lower().str.contains('test', na=False, regex=False)
    
    # Filter out customers with @inscoder.com email domain
    email_col = None
    for c in df.columns:
        if c.lower() == 'email':
            email_col = c
            break
    
    email_mask = pd.Series([False] * len(df), index=df.index)
    
    if email_col:
        email_mask = df[email_col].astype(str).str.contains('@inscoder.com', na=False, regex=False, case=False)
    
    # Combine masks: exclude if test name OR inscoder.com email
    exclude_mask = test_mask | email_mask
    
    excluded_count = exclude_mask.sum()
    df = df[~exclude_mask].copy()
    
    print(f"Excluded {excluded_count} internal employee records:")
    print(f"  - {test_mask.sum()} customers with 'test' in first/last name")
    print(f"  - {email_mask.sum()} customers with @inscoder.com email")
    print(f"  - Remaining: {len(df)} customers (from {original_count} total)")
    
    return df

def summarize_nulls(df: pd.DataFrame) -> pd.DataFrame:
    nulls = df.isna().sum().sort_values(ascending=False).to_frame("null_count")
    nulls["pct_null"] = (nulls["null_count"] / len(df)).round(4)
    return nulls

def bigquery_type_for_series(s: pd.Series) -> str:
    """Infer a reasonable BigQuery type from a pandas Series."""
    if pd.api.types.is_integer_dtype(s):
        return "INTEGER"
    if pd.api.types.is_float_dtype(s):
        return "FLOAT"
    if pd.api.types.is_bool_dtype(s):
        return "BOOL"
    if pd.api.types.is_datetime64_any_dtype(s):
        return "TIMESTAMP"
    return "STRING"

def emit_bq_schema(df: pd.DataFrame, path: str):
    schema = []
    for col in df.columns:
        schema.append({
            "name": col,
            "type": bigquery_type_for_series(df[col]),
            "mode": "NULLABLE"
        })
    Path(path).write_text(json.dumps(schema, indent=2), encoding="utf-8")
    print(f"Wrote BigQuery schema to: {path}")


In [4]:
raw_path = Path(INPUT_PATH)
assert raw_path.exists(), f"Input file not found: {raw_path}"
df_raw = load_shopify_csv(str(raw_path))
print("Loaded rows:", len(df_raw))
display(df_raw.head(3))


Loaded rows: 1499


Unnamed: 0,Customer ID,First Name,Last Name,Email,Accepts Email Marketing,Default Address Company,Default Address Address1,Default Address Address2,Default Address City,Default Address Province Code,Default Address Country Code,Default Address Zip,Default Address Phone,Phone,Accepts SMS Marketing,Total Spent,Total Orders,Note,Tax Exempt,Tags,GW Referral link (customer.metafields.growave.referral_link)
0,'6318094024933,clare,gao,clare.gao@inscoder.com,yes,,80 Vandenburgh Avenue,,Troy,NY,US,12180.0,'(632) 434-8643,,no,0.0,1,,no,"newsletter, prospect",
1,'6336708706533,Test,QA,rakesh.parajiya@inscoder.com,no,,,,,,US,,,,no,0.0,0,,no,,
2,'6336727253221,Test,QA,imran.saiyad@inscoder.com,no,,,,,,US,,,,no,0.0,0,,no,,


In [5]:
df = df_raw.copy()
df = drop_empty(df)
# Select a subset first (based on original headers), then normalize names
subset_cols = [c for c in KEEP_COLS if c in df.columns]
df = df[subset_cols].copy()
df.columns = [to_snake(c) for c in df.columns]

print("Columns after selection & normalization:", list(df.columns))
display(df.head(3))


Columns after selection & normalization: ['customer_id', 'first_name', 'last_name', 'email', 'accepts_email_marketing', 'accepts_sms_marketing', 'default_address_company', 'default_address_address1', 'default_address_address2', 'default_address_city', 'default_address_province_code', 'default_address_country_code', 'default_address_zip', 'default_address_phone', 'phone', 'total_spent', 'total_orders', 'tags', 'note', 'tax_exempt']


Unnamed: 0,customer_id,first_name,last_name,email,accepts_email_marketing,accepts_sms_marketing,default_address_company,default_address_address1,default_address_address2,default_address_city,default_address_province_code,default_address_country_code,default_address_zip,default_address_phone,phone,total_spent,total_orders,tags,note,tax_exempt
0,'6318094024933,clare,gao,clare.gao@inscoder.com,yes,no,,80 Vandenburgh Avenue,,Troy,NY,US,12180.0,'(632) 434-8643,,0.0,1,"newsletter, prospect",,no
1,'6336708706533,Test,QA,rakesh.parajiya@inscoder.com,no,no,,,,,,US,,,,0.0,0,,,no
2,'6336727253221,Test,QA,imran.saiyad@inscoder.com,no,no,,,,,,US,,,,0.0,0,,,no


In [6]:
# Exclude internal employees (test names and @inscoder.com emails)
df = exclude_internal_employees(df)


Excluded 9 internal employee records:
  - 6 customers with 'test' in first/last name
  - 6 customers with @inscoder.com email
  - Remaining: 1490 customers (from 1499 total)


In [7]:
# Fix common odd ZIP apostrophes if present
for col in ["default_address_zip"]:
    if col in df.columns:
        df[col] = strip_apostrophes(df[col].astype(str))

# Numeric coercion
for col in [to_snake(c) for c in NUM_COLS if c in df_raw.columns]:
    if col in df.columns:
        df[col] = coerce_numeric(df[col])

# Optional dtype overrides
for col, dtype in (DTYPE_OVERRIDES or {}).items():
    if col in df.columns:
        try:
            df[col] = df[col].astype(dtype)
        except Exception as e:
            print(f"Warning: could not apply dtype {dtype} to column {col}: {e}")


In [8]:
# Remove fully empty rows again (after coercion)
df = drop_empty(df)

# Check for duplicate customer IDs
if 'customer_id' in df.columns:
    dup_count = df.duplicated(subset=['customer_id'], keep='first').sum()
    if dup_count > 0:
        print(f"Found {dup_count} duplicated customer_id rows -> dropping duplicates.")
        df = df.drop_duplicates(subset=['customer_id'], keep='first')

# Basic quality summary
print("Rows after cleaning:", len(df))
display(df.head(5))

print("\nNull summary (top 20):")
display(summarize_nulls(df).head(20))

# Sanity checks (soft asserts as warnings)
def warn_if(cond, msg):
    if cond:
        print("WARNING:", msg)

warn_if('customer_id' not in df.columns, "Missing 'customer_id' column.")
warn_if('email' not in df.columns, "Missing 'email' column.")
warn_if('total_spent' in df.columns and df['total_spent'].isna().mean() > 0.2, "More than 20% of 'total_spent' is null.")


Rows after cleaning: 1490


Unnamed: 0,customer_id,first_name,last_name,email,accepts_email_marketing,accepts_sms_marketing,default_address_company,default_address_address1,default_address_address2,default_address_city,default_address_province_code,default_address_country_code,default_address_zip,default_address_phone,phone,total_spent,total_orders,tags,note,tax_exempt
3,'6336787448037,,,galaxyaio96@gmail.com,yes,no,,,,,,,,,,0.0,0,"password page, prospect",,no
4,'6339484516581,,,kamranahmad1650@gmail.com,yes,no,,,,,,,,,,0.0,0,"password page, prospect",,no
5,'6360571216101,ray,li,rayl@myves.com,yes,no,,480 7th Avenue,apt 3,San Francisco,CA,US,94118.0,'(973) 330-6115,,0.0,2,"password page, prospect",,no
6,'6375400571109,Michael,Tai,maikerutai.mt@gmail.com,no,no,,Us,Us,Us,AZ,US,86556.0,'(889) 254-33,,79.95,1,,,no
8,'6377410691301,Jennifer,Poh,aromemx@gmail.com,no,no,,,,,,,,,,0.0,0,,,no



Null summary (top 20):


Unnamed: 0,null_count,pct_null
note,1489,0.9993
default_address_company,1436,0.9638
phone,1427,0.9577
default_address_address2,1227,0.8235
tags,1165,0.7819
default_address_phone,607,0.4074
default_address_province_code,588,0.3946
default_address_address1,588,0.3946
default_address_city,588,0.3946
last_name,306,0.2054


In [9]:
# Define output paths relative to notebook
out_path = Path(OUTPUT_PATH)
schema_path = Path("../../data/clean/" + BIGQUERY_SCHEMA_JSON)

# Save cleaned CSV
df.to_csv(out_path, index=False, encoding="utf-8")
print(f"✅ Saved cleaned CSV → {out_path.resolve()}  (rows={len(df)})")

# Emit a convenience BigQuery schema JSON (optional)
emit_bq_schema(df, schema_path)
print(f"✅ Wrote BigQuery schema JSON → {schema_path.resolve()}")


✅ Saved cleaned CSV → /Users/alvychen/Desktop/Ecommerce_Growth_Analytics/data/clean/customers_clean.csv  (rows=1490)
Wrote BigQuery schema to: ../../data/clean/customers_clean_bq_schema.json
✅ Wrote BigQuery schema JSON → /Users/alvychen/Desktop/Ecommerce_Growth_Analytics/data/clean/customers_clean_bq_schema.json


In [10]:
# Optional: quick preview of customer metrics
if 'total_spent' in df.columns and 'total_orders' in df.columns:
    print("Customer Summary Statistics:")
    print(f"Total customers: {len(df):,}")
    print(f"Total revenue: ${df['total_spent'].sum():,.2f}")
    print(f"Average customer value: ${df['total_spent'].mean():,.2f}")
    print(f"Total orders: {df['total_orders'].sum():,.0f}")
    print(f"Average orders per customer: {df['total_orders'].mean():.2f}")
    
    # Top customers by spend
    if 'email' in df.columns:
        print("\nTop 10 customers by total spent:")
        top_customers = df.nlargest(10, 'total_spent')[['customer_id', 'email', 'first_name', 'last_name', 'total_spent', 'total_orders']]
        display(top_customers)
else:
    print("Skipping metrics preview: missing 'total_spent' or 'total_orders' columns.")


Customer Summary Statistics:
Total customers: 1,490
Total revenue: $134,399.00
Average customer value: $90.20
Total orders: 2,315
Average orders per customer: 1.55

Top 10 customers by total spent:


Unnamed: 0,customer_id,email,first_name,last_name,total_spent,total_orders
771,'7257538298085,operations+brand@flip.shop,B2B:,flip-shop,88141.76,1650
827,'7327797346533,execassistant@maronelectric.com,Eric,Nixon,703.56,1
1170,'7915484283109,tpmonaco@gmail.com,Patti,Monaco,399.75,1
523,'6821920932069,howell.maleah@gmail.com,Maleah,Howell,397.74,1
341,'6667770003685,daear_taran@yahoo.com,Deborah,Lewis,329.8,2
1030,'7736143544549,denise.gahm@gmail.com,Denise,Gahm,319.8,1
1419,'8737287569637,scartile@aol.com,Jill,Scarbrough,305.61,1
178,'6464337084645,sirinya16@hotmail.com,Sirinya,Laiteerapong,238.32,1
34,'6400751894757,nephalem322@icloud.com,Abraham,Parra,224.24,1
808,'7308259033317,itflics@mac.com,Martin,Flics,222.05,5
