In [0]:
# =============================================================================
# PHASE 1 — Mock Data Generator (~90K rows)
# Search Log Analysis Pipeline | Ride-Hailing App (India)
#
# HOW TO USE:
#   1. Open your Databricks workspace
#   2. Create a new Notebook → name it "01_generate_mock_data" → Language: Python
#   3. Copy-paste this entire script into Cell 1
#   4. Click "Run All"
#   5. CSV is written under: /FileStore/search_logs/raw_search_logs.csv
#      (Spark creates a folder at this path containing a single part-*.csv file)
#
# EXPECTED OUTPUT: ~90,000 rows (exact count varies per run)
#   - Active cities    (10 cities x 4,000-5,000 rows) = ~45,000 rows
#   - Expansion cities (15 cities x 2,500-3,500 rows) = ~45,000 rows
#   - 3% dirty records intentionally injected for Silver layer cleaning
# =============================================================================

import random
import uuid
from datetime import datetime, timedelta

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when
from pyspark.sql.types import *

# Obtain the SparkSession used by the rest of this script. getOrCreate() returns
# the session that is already active when there is one (presumably the one the
# Databricks notebook provides) and only builds a new one otherwise.
spark = SparkSession.builder.getOrCreate()

# -----------------------------------------------------------------------------
# 1. CITY MASTER DATA
#    Active    = Ola already operates here -> mostly successful rides
#    Expansion = Ola NOT here yet          -> mostly errors (our expansion signal!)
# -----------------------------------------------------------------------------

# Cities treated as already served. Each row is
# (city, state, latitude, longitude, tier); the rows are expanded into dicts
# keyed "city", "state", "lat", "lng", "tier", in that order.
_ACTIVE_CITY_ROWS = (
    ("Mumbai",    "Maharashtra",   19.0760, 72.8777, 1),
    ("Delhi",     "Delhi",         28.7041, 77.1025, 1),
    ("Bangalore", "Karnataka",     12.9716, 77.5946, 1),
    ("Hyderabad", "Telangana",     17.3850, 78.4867, 1),
    ("Chennai",   "Tamil Nadu",    13.0827, 80.2707, 1),
    ("Kolkata",   "West Bengal",   22.5726, 88.3639, 1),
    ("Pune",      "Maharashtra",   18.5204, 73.8567, 2),
    ("Ahmedabad", "Gujarat",       23.0225, 72.5714, 2),
    ("Jaipur",    "Rajasthan",     26.9124, 75.7873, 2),
    ("Lucknow",   "Uttar Pradesh", 26.8467, 80.9462, 2),
)

ACTIVE_CITIES = [
    dict(zip(("city", "state", "lat", "lng", "tier"), row))
    for row in _ACTIVE_CITY_ROWS
]

# Cities treated as not yet served. Each row is
# (city, state, latitude, longitude, tier); the rows are expanded into dicts
# keyed "city", "state", "lat", "lng", "tier", in that order.
_EXPANSION_CITY_ROWS = (
    ("Indore",      "Madhya Pradesh", 22.7196, 75.8577, 2),
    ("Nagpur",      "Maharashtra",    21.1458, 79.0882, 2),
    ("Coimbatore",  "Tamil Nadu",     11.0168, 76.9558, 2),
    ("Bhopal",      "Madhya Pradesh", 23.2599, 77.4126, 2),
    ("Surat",       "Gujarat",        21.1702, 72.8311, 2),
    ("Vadodara",    "Gujarat",        22.3072, 73.1812, 2),
    ("Mysore",      "Karnataka",      12.2958, 76.6394, 2),
    ("Vijayawada",  "Andhra Pradesh", 16.5062, 80.6480, 2),
    ("Patna",       "Bihar",          25.5941, 85.1376, 2),
    ("Ranchi",      "Jharkhand",      23.3441, 85.3096, 3),
    ("Jodhpur",     "Rajasthan",      26.2389, 73.0243, 3),
    ("Guwahati",    "Assam",          26.1445, 91.7362, 3),
    ("Bhubaneswar", "Odisha",         20.2961, 85.8245, 3),
    ("Dehradun",    "Uttarakhand",    30.3165, 78.0322, 3),
    ("Amritsar",    "Punjab",         31.6340, 74.8723, 3),
)

EXPANSION_CITIES = [
    dict(zip(("city", "state", "lat", "lng", "tier"), row))
    for row in _EXPANSION_CITY_ROWS
]

# -----------------------------------------------------------------------------
# 2. LOOKUP LISTS
# -----------------------------------------------------------------------------

# Ride products a search can be made for.
RIDE_TYPES = [
    "Mini",
    "Sedan",
    "Auto",
    "Bike",
    "Prime SUV",
    "Share",
]

# Status values given to searches that did not fail.
SUCCESS_STATUSES = [
    "COMPLETED",
    "DRIVER_ASSIGNED",
    "EN_ROUTE",
]

# Client platforms and app builds to sample from.
DEVICES = ["Android", "iOS"]
APP_VERSIONS = [
    "4.1.2",
    "4.2.0",
    "4.2.1",
    "4.3.0",
    "4.3.5",
]

# Failure reason -> sampling weight. The weights add up to 1.0, so each one
# reads directly as the share of failed searches with that reason.
ERROR_TYPES = dict(
    NO_SERVICE_AREA=0.45,    # City not covered -> strongest expansion signal
    NO_DRIVERS_NEARBY=0.25,  # Covered but no drivers -> secondary signal
    SURGE_ABANDONED=0.15,    # User quit due to high surge pricing
    APP_ERROR=0.10,          # Technical glitch -> not expansion signal
    PAYMENT_FAILED=0.05,     # Payment issue -> not expansion signal
)

# -----------------------------------------------------------------------------
# 3. HELPER FUNCTIONS
# -----------------------------------------------------------------------------

def random_timestamp(start_days_ago=90):
    """Return a random timestamp string from the recent past.

    The moment is drawn uniformly, at one-second granularity, from the window
    that starts ``start_days_ago`` days before the current local time and ends
    at the current local time.

    Args:
        start_days_ago: Width of the look-back window in days (default 90).

    Returns:
        The chosen moment formatted as ``"YYYY-MM-DD HH:MM:SS"``.
    """
    now = datetime.now()
    window_start = now - timedelta(days=start_days_ago)
    # Whole seconds in the window; the offset below is drawn inclusively.
    window_seconds = int((now - window_start).total_seconds())
    offset = timedelta(seconds=random.randint(0, window_seconds))
    moment = window_start + offset
    return moment.strftime("%Y-%m-%d %H:%M:%S")

def generate_record(city_info, is_error, inject_dirt=False):
    """Build one synthetic search-log row for a city.

    Args:
        city_info: Dict with "city", "state", "tier", "lat" and "lng" keys.
        is_error: When truthy the search is a failure: status is "FAILED", an
            error type is drawn using the ERROR_TYPES weights, and the session
            lasts 5-120 seconds. Otherwise a status is picked from
            SUCCESS_STATUSES, error_type is None, and the session lasts
            30-600 seconds.
        inject_dirt: When truthy exactly one of city, state, pickup_lat,
            pickup_lng, timestamp or user_id is corrupted so that downstream
            cleaning has something to catch.

    Returns:
        A dict with one entry per output column.
    """
    # Decide the outcome first; the random draws happen in a fixed order.
    if not is_error:
        failure_reason = None
        outcome = random.choice(SUCCESS_STATUSES)
        duration = random.randint(30, 600)
    else:
        (failure_reason,) = random.choices(
            list(ERROR_TYPES), weights=list(ERROR_TYPES.values())
        )
        outcome = "FAILED"
        duration = random.randint(5, 120)

    centre_lat = city_info["lat"]
    centre_lng = city_info["lng"]

    row = {
        "search_id": str(uuid.uuid4()),
        "user_id": f"USR_{random.randint(10000, 99999)}",
        "timestamp": random_timestamp(),
        "city": city_info["city"],
        "state": city_info["state"],
        "city_tier": city_info["tier"],
        # Scatter the pickup point up to 0.05 degrees around the city centre.
        "pickup_lat": round(centre_lat + random.uniform(-0.05, 0.05), 6),
        "pickup_lng": round(centre_lng + random.uniform(-0.05, 0.05), 6),
        "ride_type": random.choice(RIDE_TYPES),
        "status": outcome,
        "error_type": failure_reason,
        "device": random.choice(DEVICES),
        "app_version": random.choice(APP_VERSIONS),
        "session_duration_sec": duration,
        "is_repeat_search": random.choice([True, False]),
    }

    if inject_dirt:
        target = random.choice(
            ["city", "state", "pickup_lat", "pickup_lng", "timestamp", "user_id"]
        )
        # Timestamps become an unparseable marker; every other field is nulled.
        row[target] = "INVALID_TS" if target == "timestamp" else None

    return row

# -----------------------------------------------------------------------------
# 4. GENERATE ALL RECORDS
#    Active cities    -> 4,000-5,000 records each, 15% error rate  ~ 45,000 rows
#    Expansion cities -> 2,500-3,500 records each, 90% error rate  ~ 45,000 rows
#    Grand total      -> ~90,000 rows (varies per run)
# -----------------------------------------------------------------------------

# Probability that any single record gets one corrupted field (3%).
DIRT_RATE = 0.03
records = []

# One entry per city group:
# (banner text, cities, min rows per city, max rows per city, failure probability)
generation_plan = [
    ("Generating records for ACTIVE cities (15% error rate)...", ACTIVE_CITIES, 4000, 5000, 0.15),
    ("Generating records for EXPANSION cities (90% error rate)...", EXPANSION_CITIES, 2500, 3500, 0.90),
]

rule = "=" * 60
print(rule)
for section_title, section_cities, min_rows, max_rows, error_rate in generation_plan:
    print(section_title)
    print(rule)
    for city_info in section_cities:
        row_count = random.randint(min_rows, max_rows)
        # Per record: draw the failure flag first, then the dirt flag.
        records.extend(
            generate_record(
                city_info,
                is_error=random.random() < error_rate,
                inject_dirt=random.random() < DIRT_RATE,
            )
            for _ in range(row_count)
        )
        print(f"  {city_info['city']:<15} Tier-{city_info['tier']}  {row_count:,} records")
    print()

print(f"TOTAL RECORDS GENERATED: {len(records):,}")
print(rule)

# -----------------------------------------------------------------------------
# 5. SCHEMA + SPARK DATAFRAME
# -----------------------------------------------------------------------------

# Explicit schema for the raw search log. The nullable columns are the ones the
# generator can leave empty: error_type is None for successful searches, and
# user_id / city / state / pickup_lat / pickup_lng may be nulled by dirt
# injection. timestamp stays a string so the "INVALID_TS" marker survives as-is.
schema = StructType([
    StructField("search_id",            StringType(),  False),
    StructField("user_id",              StringType(),  True),
    StructField("timestamp",            StringType(),  True),
    StructField("city",                 StringType(),  True),
    StructField("state",                StringType(),  True),
    StructField("city_tier",            IntegerType(), False),
    StructField("pickup_lat",           DoubleType(),  True),
    StructField("pickup_lng",           DoubleType(),  True),
    StructField("ride_type",            StringType(),  False),
    StructField("status",               StringType(),  False),
    StructField("error_type",           StringType(),  True),
    StructField("device",               StringType(),  False),
    StructField("app_version",          StringType(),  False),
    StructField("session_duration_sec", IntegerType(), False),
    StructField("is_repeat_search",     BooleanType(), False),
])

# Build the DataFrame straight from the list of dicts (fields are matched to
# the schema by name). Going through pandas first turns None in the float
# columns into NaN, and Spark can keep those as NaN instead of null when the
# Arrow conversion path is not used -- which would hide the injected null
# coordinates from any isNull() check.
sdf = spark.createDataFrame(records, schema=schema)

# Sort by timestamp so it looks like a natural event stream. The column is a
# string, so this is a lexicographic sort; the "YYYY-MM-DD HH:MM:SS" layout
# makes that chronological, with "INVALID_TS" rows landing after all dates.
sdf = sdf.orderBy(col("timestamp"))

# -----------------------------------------------------------------------------
# 6. SAVE TO FILESTORE (this is the Bronze raw landing zone)
# -----------------------------------------------------------------------------

# Destination of the raw landing-zone export.
# NOTE(review): Spark's CSV writer creates a directory at this path and puts
# the data in part file(s) inside it -- confirm downstream readers expect that.
output_path = "/FileStore/search_logs/raw_search_logs.csv"

# Collapse to one partition so the export is a single part file, then replace
# any previous export and include a header row.
single_partition = sdf.coalesce(1)
csv_writer = single_partition.write.mode("overwrite").option("header", "true")
csv_writer.csv(output_path)

print(f"\nData saved to: {output_path}")

# -----------------------------------------------------------------------------
# 7. QUICK VALIDATION — run these to confirm data looks right
# -----------------------------------------------------------------------------

# Sort key shared by all three summaries: largest groups first.
largest_first = col("count").desc()

print("\n--- RECORD COUNT BY CITY ---")
rows_per_city = sdf.groupBy("city", "state", "city_tier").count()
rows_per_city.orderBy("city_tier", largest_first).show(30, truncate=False)

print("--- STATUS BREAKDOWN ---")
rows_per_status = sdf.groupBy("status").count()
rows_per_status.orderBy(largest_first).show()

print("--- ERROR TYPE BREAKDOWN (FAILED searches only) ---")
failed_searches = sdf.filter(col("status") == "FAILED")
rows_per_error = failed_searches.groupBy("error_type").count()
rows_per_error.orderBy(largest_first).show()

# Per-field count of corrupted values. Most dirty fields are nulls, but dirty
# timestamps are written as the literal "INVALID_TS", so a plain null count
# would always report 0 for that column; count the marker as well.
print("--- DIRTY RECORDS INJECTED (null / invalid counts per field) ---")
dirty_fields = ["city", "state", "pickup_lat", "pickup_lng", "timestamp", "user_id"]
dirty_conditions = {name: col(name).isNull() for name in dirty_fields}
dirty_conditions["timestamp"] = dirty_conditions["timestamp"] | (col("timestamp") == "INVALID_TS")

# when() yields a non-null value only for dirty rows, and count() skips nulls,
# so each output column is the number of dirty rows for that field.
sdf.select([
    count(when(condition, name)).alias(name)
    for name, condition in dirty_conditions.items()
]).show()

print("\nPhase 1 Complete! Next: run 02_bronze_layer.py")