In [1]:
import pandas as pd, json
from pathlib import Path

# ---------- Parameters ----------
# All paths are relative, so they resolve against the notebook's working directory.
# Raw sessions export that is read with pd.read_csv.
INPUT_PATH  = "../../data/raw/sessions_over_time.csv"
# Destination of the cleaned CSV written by df.to_csv.
OUTPUT_PATH = "../../data/clean/sessions_over_time_clean.csv"
# Destination of the BigQuery schema JSON that describes the cleaned CSV.
SCHEMA_PATH = "../../data/clean/sessions_over_time_bq_schema.json"


In [2]:
# ---------- Load ----------
# utf-8-sig transparently strips a leading byte-order mark if the export has one.
df = pd.read_csv(INPUT_PATH, encoding="utf-8-sig")
print("Raw shape:", df.shape)

# ---------- Drop previous-period columns ----------
# Comparison columns carry "(previous" in their header (matched case-insensitively).
is_previous = df.columns.str.lower().str.contains("(previous", regex=False)
cols_keep = df.columns[~is_previous].tolist()
# Keep the surviving columns and trim stray whitespace from their headers.
df = df[cols_keep].rename(columns=str.strip)

Raw shape: (80965, 26)


In [3]:
# ---------- Rename core columns ----------
# Source header -> snake_case name used in the cleaned output.
rename_map = dict([
    ("Day", "date"),
    ("Referrer source", "referrer_source"),
    ("Referrer name", "referrer_name"),
    ("Referring channel", "referring_channel"),
    ("Session country", "country"),
    ("Session country code", "country_code"),
    ("Session city", "city"),
    ("Session device type", "device_type"),
    ("Session duration", "session_duration"),
    ("Traffic type", "traffic_type"),
    ("Referrer domain", "referrer_domain"),
    ("Landing page path", "landing_page_path"),
    ("Landing page URL", "landing_page_url"),
    ("Referrer URL", "referrer_url"),
    ("Session bounced", "session_bounced"),
    ("Session browser", "browser"),
    ("Online store visitors", "online_store_visitors"),
    ("Sessions", "sessions"),
])
# DataFrame.rename ignores labels that are not present (errors="ignore" is the
# default), so headers missing from this export are simply skipped.
df = df.rename(columns=rename_map)


In [4]:
# ---------- Type coercion ----------
# Dates: unparseable values become NaT instead of raising.
df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.date

# Metrics: non-numeric or missing values are coerced to 0 before the int64 cast.
for c in ["online_store_visitors","sessions","session_duration"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype("int64")

bool_map = {"TRUE": True, "FALSE": False, "True": True, "False": False}
if "session_bounced" in df.columns:
    # read_csv may already have parsed this column into real booleans; mapping
    # those against the string keys above would turn every row into NaN and then
    # False. Normalise to trimmed upper-case text first so that booleans, "true",
    # "True" and "TRUE" are all handled the same way.
    bounced_text = df["session_bounced"].astype(str).str.strip().str.upper()
    # Go through the nullable "boolean" dtype so that fillna does not trigger the
    # object-dtype downcasting FutureWarning; unknown/missing values become False.
    df["session_bounced"] = (
        bounced_text.map(bool_map)
        .astype("boolean")
        .fillna(False)
        .astype(bool)
    )


  df["session_bounced"] = df["session_bounced"].map(bool_map).fillna(False)


In [5]:
# Drop duplicate metric
# Only drop online_store_visitors after confirming that it really duplicates
# sessions row-for-row; otherwise keep it so that no information is lost.
if "online_store_visitors" in df.columns:
    is_duplicate = (
        "sessions" in df.columns
        and bool(df["online_store_visitors"].eq(df["sessions"]).all())
    )
    if is_duplicate:
        df = df.drop(columns=["online_store_visitors"])
        print("Dropped redundant column: online_store_visitors (identical to sessions)")
    else:
        print("Kept column: online_store_visitors (not identical to sessions)")


Dropped redundant column: online_store_visitors (identical to sessions)


In [6]:
# ---------- Drop rows without a valid date ----------
# Re-parse the date column (unparseable values become NaT) and keep only the
# rows whose date is present.
parsed_dates = pd.to_datetime(df["date"], errors="coerce")
df["date"] = parsed_dates.dt.date
df = df[df["date"].notna()]

In [7]:
# ---------- Select relevant analytics columns ----------
# Desired output order; names that are absent from df are skipped.
cols_final = [
    "date",
    "referrer_source",
    "referrer_name",
    "referring_channel",
    "country",
    "country_code",
    "city",
    "device_type",
    "traffic_type",
    "referrer_domain",
    "landing_page_path",
    "landing_page_url",
    "referrer_url",
    "browser",
    "session_duration",
    "session_bounced",
    "online_store_visitors",
    "sessions",
]
# filter(items=...) keeps only the listed columns that exist, in the listed order.
df = df.filter(items=cols_final).copy()

# ---------- Basic QA ----------
print("After cleaning:", df.shape)
preview = df.head(3)
print(preview)
null_counts = df.isna().sum()
print("\nNulls:\n", null_counts)


After cleaning: (80891, 17)
         date referrer_source referrer_name referring_channel        country  \
0  2024-10-01          social      facebook          facebook  United States   
1  2024-10-01          social      facebook          facebook  United States   
2  2024-10-01          social      facebook          facebook      Gibraltar   

  country_code        city device_type traffic_type referrer_domain  \
0           US  Saugerties      mobile         paid  l.facebook.com   
1           US      Denver     desktop         paid  l.facebook.com   
2           GI         NaN     desktop         paid  l.facebook.com   

                                   landing_page_path  \
0  /products/myves-food-purification-box-p1-fruit...   
1  /products/myves-retro-style-immersion-blender-...   
2  /products/myves-retro-style-immersion-blender-...   

                                    landing_page_url    referrer_url  \
0  https://myves.com/products/myves-food-purifica...  l.facebook.com 

In [8]:
# ---------- Save cleaned CSV ----------
out_file = Path(OUTPUT_PATH)
# Create the destination folder (and any missing parents) before writing.
out_file.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_file, index=False, encoding="utf-8")
print(f"✅ Saved cleaned CSV → {out_file.resolve()}")


✅ Saved cleaned CSV → /Users/alvychen/Desktop/Ecommerce_Growth_Analytics/data/clean/sessions_over_time_clean.csv


In [9]:
# ---------- Emit BigQuery schema ----------
# BigQuery type for every column this notebook can produce.
field_types = {
    "date": "DATE",
    "referrer_source": "STRING",
    "referrer_name": "STRING",
    "referring_channel": "STRING",
    "country": "STRING",
    "country_code": "STRING",
    "city": "STRING",
    "device_type": "STRING",
    "traffic_type": "STRING",
    "referrer_domain": "STRING",
    "landing_page_path": "STRING",
    "landing_page_url": "STRING",
    "referrer_url": "STRING",
    "browser": "STRING",
    "session_duration": "INTEGER",
    "session_bounced": "BOOLEAN",
    "online_store_visitors": "INTEGER",
    "sessions": "INTEGER",
}

# Fail loudly rather than emit a schema that silently omits a column.
untyped = [col for col in df.columns if col not in field_types]
if untyped:
    raise ValueError(f"No BigQuery type defined for columns: {untyped}")

# Build the schema from the columns that are actually in the cleaned frame, in
# the frame's own order, so it always lines up with the CSV written from df
# (a hard-coded list would still describe columns that were dropped earlier).
schema = [
    {"name": col, "type": field_types[col], "mode": "NULLABLE"}
    for col in df.columns
]

schema_file = Path(SCHEMA_PATH)
# Make sure the target folder exists even if this cell runs before the CSV export.
schema_file.parent.mkdir(parents=True, exist_ok=True)
schema_file.write_text(json.dumps(schema, indent=2), encoding="utf-8")
print(f"✅ Wrote schema JSON → {schema_file.resolve()}")

✅ Wrote schema JSON → /Users/alvychen/Desktop/Ecommerce_Growth_Analytics/data/clean/sessions_over_time_bq_schema.json
