# %%
# Step 1: Install and Import Libraries
!pip install google-cloud-bigquery pandas -q

import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import warnings
# Install an "ignore" filter with no category/message restriction, so every
# Python warning raised after this point is suppressed.
warnings.filterwarnings("ignore")

# Reaching this line means all of the imports above succeeded.
print("Libraries imported successfully!")

# %%
# Step 2: Configuration
# Project and dataset identifiers referenced by the later cells.
PROJECT_ID = "even-blueprint-441418-p2"
SILVER = "ecommerce_Silver_Layer"
GOLD = "ecommerce_Gold_Layer"
# Service-account key file read from the local disk.
KEY_PATH = r"C:\Users\Vishnu Vardhan\OneDrive\Desktop\Bigquery_Ecommerce\even-blueprint-441418-p2-043f8a9d855b.json"

# Build the BigQuery client from the service-account credentials.
credentials = service_account.Credentials.from_service_account_file(KEY_PATH)
client = bigquery.Client(project=PROJECT_ID, credentials=credentials)

# One line per setting, so the active configuration is visible in the output.
print(
    f"Connected to project: {PROJECT_ID}",
    f"Silver Layer: {SILVER}",
    f"Gold Layer: {GOLD}",
    sep="\n",
)

# %%
# Step 3: Load Surrogate Key Maps from Silver Dimensions
def load_sk_map(table: str, biz_key: str, sk_key: str) -> dict:
    """Build a business-key -> surrogate-key lookup from a Silver table.

    Runs ``SELECT biz_key, sk_key`` against ``PROJECT_ID.SILVER.table`` using
    the module-level ``client`` and prints the number of rows fetched.

    Args:
        table: Table name inside the Silver dataset.
        biz_key: Column holding the business (natural) key.
        sk_key: Column holding the surrogate key.

    Returns:
        A dict mapping each ``biz_key`` value to its ``sk_key`` value. When
        the same business key occurs in several rows, the value from the last
        row returned by the query is the one kept.
    """
    query = f"SELECT {biz_key}, {sk_key} FROM `{PROJECT_ID}.{SILVER}.{table}`"
    df = client.query(query).to_dataframe()
    # Plain ASCII arrow keeps the message readable whatever the console or
    # file encoding is.
    print(f"Loaded {len(df):,} rows from {table} ({biz_key} -> {sk_key})")
    return dict(zip(df[biz_key], df[sk_key]))

print("Loading dimension surrogate keys...\n")

# One lookup dict per Silver dimension, loaded in a fixed order so the
# progress lines printed by load_sk_map appear in the same sequence each run.
order_sk = load_sk_map(table="dim_order", biz_key="order_id", sk_key="order_sk")
customer_sk = load_sk_map(table="dim_customer", biz_key="customer_id", sk_key="customer_sk")
product_sk = load_sk_map(table="dim_products", biz_key="product_id", sk_key="product_sk")
seller_sk = load_sk_map(table="dim_sellers", biz_key="seller_id", sk_key="seller_sk")
# The payments dimension is keyed by order_id rather than by a payment id.
payment_sk = load_sk_map(table="dim_payments", biz_key="order_id", sk_key="payment_sk")

# %%
# Step 4: Load Date Dimension Map
# Produces date_sk_map: datetime.date -> date_sk, used later to key the
# purchase and delivery dates.
print("\nLoading date dimension...")
date_query = f"SELECT full_date, date_sk FROM `{PROJECT_ID}.{SILVER}.dim_date`"
date_df = client.query(date_query).to_dataframe()

# Normalise full_date to plain date objects so lookups match the `.dt.date`
# values derived from the order timestamps.
calendar_days = pd.to_datetime(date_df['full_date']).dt.date
date_df['full_date'] = calendar_days

date_sk_map = {
    day: key for day, key in zip(date_df['full_date'], date_df['date_sk'])
}
print(f"Loaded {len(date_sk_map):,} date entries")

# %%
# Step 5: Extract Order Items from Silver Tables
#
# Pulls one row per (order, customer, product, seller, payment) combination
# into df_raw. The query selects pay.payment_value because the final column
# list and the Gold schema both require a `payment_value` column; without it
# the column selection in Step 8 fails with a KeyError.
# Assumes dim_payments exposes a `payment_value` column -- TODO confirm against
# the Silver schema.
#
# NOTE(review): the two `ON TRUE` joins pair every order row with every
# product row and every seller row (a Cartesian product), and order_item_id,
# quantity, price, freight_value and review_score are constant placeholders.
# Both are marked as demo-only in the SQL; replace them with a real
# order-items source before relying on the Gold table.
print("\nExtracting order items from Silver layer...")
raw_sql = f"""
SELECT
    o.order_id,
    o.order_purchase_timestamp,
    o.order_delivered_customer_date,
    c.customer_id,
    p.product_id,
    s.seller_id,
    pay.payment_sk,
    pay.payment_type,
    pay.payment_installments,
    pay.payment_value,
    -- Placeholder values for order-item-specific fields
    1 AS order_item_id,
    1 AS quantity,
    100.0 AS price,
    10.0 AS freight_value,
    5 AS review_score
FROM `{PROJECT_ID}.{SILVER}.dim_order` o
JOIN `{PROJECT_ID}.{SILVER}.dim_customer` c ON o.customer_id = c.customer_id
JOIN `{PROJECT_ID}.{SILVER}.dim_products` p ON TRUE    -- Map product (for demo)
JOIN `{PROJECT_ID}.{SILVER}.dim_sellers` s ON TRUE     -- Map seller (for demo)
JOIN `{PROJECT_ID}.{SILVER}.dim_payments` pay ON o.order_id = pay.order_id
"""

df_raw = client.query(raw_sql).to_dataframe()
print(f"Extracted {len(df_raw):,} raw order item rows")

# %%
# Step 6: Enrich with Surrogate Keys
# Works on a copy so df_raw stays untouched and this cell can be re-run.
print("\nEnriching with surrogate keys...")
df = df_raw.copy()

# Translate each business key into its surrogate key through the lookup dicts
# loaded in Step 3. Business keys with no entry in a lookup become NaN.
# payment_sk needs no mapping: it is already selected by the extraction query.
sk_lookups = (
    ('order_sk', 'order_id', order_sk),
    ('customer_sk', 'customer_id', customer_sk),
    ('product_sk', 'product_id', product_sk),
    ('seller_sk', 'seller_id', seller_sk),
)
for sk_column, biz_column, lookup in sk_lookups:
    df[sk_column] = df[biz_column].map(lookup)

# Date keys: reduce each timestamp to a calendar date, then look it up in the
# date dimension map. Dates missing from the map yield NaN.
df['purchase_date'] = pd.to_datetime(df['order_purchase_timestamp']).dt.date
df['delivery_date'] = pd.to_datetime(df['order_delivered_customer_date']).dt.date
df['purchase_date_sk'] = df['purchase_date'].map(date_sk_map)
df['delivery_date_sk'] = df['delivery_date'].map(date_sk_map)

# %%
# Step 7: Compute Business Metrics
print("Computing business metrics...")

# Merchandise value of the line (unit price times quantity), shared by the
# revenue and profit formulas below.
line_value = df['price'] * df['quantity']

# Revenue adds freight on top of the merchandise value.
df['total_revenue'] = line_value + df['freight_value']
# Profit is estimated as a flat 30% of the merchandise value.
df['profit_estimate'] = line_value * 0.3
# No discount information is used here; the column is a constant zero.
df['discount_pct'] = 0.0

# Generate fact surrogate key: sequential integers starting at 1, in the
# current row order.
row_count = len(df)
df['order_item_sk'] = range(1, row_count + 1)

# %%
# Step 8: Final Column Selection & Cleanup
# Columns kept for the Gold fact table, in output order. Every name listed
# here must already exist in `df`; pandas raises KeyError otherwise.
final_cols = [
    'order_item_sk',
    'order_sk', 'customer_sk', 'product_sk', 'seller_sk', 'payment_sk',
    'purchase_date_sk', 'delivery_date_sk',
    'order_item_id', 'quantity', 'price', 'freight_value', 'payment_value',
    'review_score', 'total_revenue', 'profit_estimate', 'discount_pct',
]

# Keys that must be present for a row to be kept. delivery_date_sk is not in
# this list, so rows without a delivery date key survive the filter.
required_keys = [
    'order_sk', 'customer_sk', 'product_sk', 'seller_sk', 'payment_sk',
    'purchase_date_sk',
]

selected = df[final_cols]
before = len(selected)

# Drop rows with missing critical keys; dropna returns a new frame, so
# df_final is independent of df.
df_final = selected.dropna(subset=required_keys)

after = len(df_final)
print(f"Dropped {before - after:,} rows due to missing keys")

# %%
# Step 9: Load into Gold Fact Table
print(f"\nLoading {len(df_final):,} rows into Gold fact table...")
table_ref = f"{PROJECT_ID}.{GOLD}.fact_order_items"

# (column name, BigQuery type) pairs, in the same order as the fact columns.
gold_columns = (
    ("order_item_sk", "INT64"),
    ("order_sk", "INT64"),
    ("customer_sk", "INT64"),
    ("product_sk", "INT64"),
    ("seller_sk", "INT64"),
    ("payment_sk", "INT64"),
    ("purchase_date_sk", "INT64"),
    ("delivery_date_sk", "INT64"),
    ("order_item_id", "INT64"),
    ("quantity", "INT64"),
    ("price", "NUMERIC"),
    ("freight_value", "NUMERIC"),
    ("payment_value", "NUMERIC"),
    ("review_score", "INT64"),
    ("total_revenue", "NUMERIC"),
    ("profit_estimate", "NUMERIC"),
    ("discount_pct", "NUMERIC"),
)

# WRITE_TRUNCATE replaces the table contents on every run instead of appending.
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",
    schema=[
        bigquery.SchemaField(column_name, column_type)
        for column_name, column_type in gold_columns
    ],
)

job = client.load_table_from_dataframe(
    df_final,
    table_ref,
    job_config=job_config,
)
# Block until the load job finishes; result() raises if the job failed.
job.result()

print(f"SUCCESS: Gold fact table `{table_ref}` loaded with {len(df_final):,} rows!")

# %%
# Step 10: Verify Load
# Read a handful of rows back from the Gold table as a visual sanity check.
print("\nVerifying first 5 rows in Gold fact table:")
verify_query = f"SELECT * FROM `{table_ref}` LIMIT 5"
preview_rows = client.query(verify_query).to_dataframe()
# `display` is supplied by the notebook/IPython session rather than imported.
display(preview_rows)

print("\nGold fact table is ready for data marts and Streamlit dashboards!")
