# Scalable Test Data Generator for Potential Failures

This notebook generates comprehensive test data for the `app_potential_failures` table with the following features:

## Features
- ✅ Configurable data volume (~15k records, adjustable)
- ✅ All KPI codes from `bronze.fms_dimkpiclassification`
- ✅ Selectable KPI codes (all or specific)
- ✅ Various task durations (short, medium, long) per KPI group
- ✅ Financial year spanning jobs (at least 1 per KPI code)
- ✅ Edge cases for downtime thresholds (24, 48, 100 hours)
- ✅ Random start/end times over a 2-year period starting 25 May 2025 (`2025-05-25`)
- ✅ All jobs with COMP status
- ✅ Period boundary crossing tasks
- ✅ Distribution across all stations (excluding NULL sections)
- ✅ Join with core_dimdate for period/week
- ✅ Overlapping dates for duplicate testing
- ✅ Configurable frequency per KPI code
- ✅ Optional: Status simulation (WAPPR → APPR → COMP)
- ✅ Optional: Non-KPI code tasks

## Configuration Options
See the configuration section below to customize data generation.

## 1. Configuration & Setup

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import uuid
from typing import List, Dict, Optional
import pyodbc
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility (comment out for true randomness)
# random.seed(42)
# np.random.seed(42)

In [None]:
# ═══════════════════════════════════════════════════════════════
#  CONFIGURATION FLAGS
# ═══════════════════════════════════════════════════════════════

# Central settings dictionary read by the cells below. Every value is a plain
# Python literal; edit it here and re-run the notebook from this cell onwards.
CONFIG = {
    # ─── Data Volume ───
    'TOTAL_RECORDS': 15000,  # Target number of records to generate
    
    # ─── Date Range ───
    'START_DATE': '2025-05-25',  # GTS started EL
    'PERIOD_YEARS': 2,  # Generate data over 2 years
    
    # ─── KPI Code Selection ───
    'USE_ALL_KPI_CODES': True,  # If False, use KPI_CODES_FILTER
    'KPI_CODES_FILTER': [],  # e.g., ['GRAFFITI', 'TRACKSIDE'] - only used if USE_ALL_KPI_CODES = False
    
    # ─── KPI Code Frequency ───
    # Weight distribution for KPI codes (higher = more records)
    # Codes missing from this mapping get a weight of 1.0; weights are normalised before use.
    'KPI_FREQUENCY_WEIGHTS': {},  # e.g., {'GRAFFITI': 2.0, 'TRACKSIDE': 1.5} - empty means equal distribution
    
    # ─── Task Durations ───
    # 'weight' is the relative share of each category; hours are drawn uniformly
    # between 'min_hours' and 'max_hours'.
    'DURATION_CATEGORIES': {
        'short': {'min_hours': 1, 'max_hours': 24, 'weight': 0.4},
        'medium': {'min_hours': 25, 'max_hours': 120, 'weight': 0.4},
        'long': {'min_hours': 121, 'max_hours': 720, 'weight': 0.2},
    },
    
    # ─── Edge Cases ───
    'YEAR_SPANNING_PER_KPI': 1,  # At least 1 year-spanning job per KPI code
    'DOWNTIME_THRESHOLD_TESTS': True,  # Create jobs with 24, 48, 100 hour thresholds
    # Step 4 creates int(tasks generated so far * ratio) additional period-crossing tasks.
    'PERIOD_BOUNDARY_CROSSING_RATIO': 0.3,  # 30% of jobs should cross period boundaries
    
    # ─── Duplicate Testing ───
    'CREATE_OVERLAPPING_GROUPS': True,  # Create overlapping date groups
    'OVERLAPPING_GROUPS_COUNT': 50,  # Number of overlap groups to create
    'OVERLAP_WINDOW_HOURS': 6,  # Tasks within 6 hours are considered overlapping
    
    # ─── Status ───
    'ALL_COMPLETED': True,  # All jobs COMP status
    
    # ─── Output ───
    'OUTPUT_MODE': 'LAKEHOUSE',  # 'LAKEHOUSE' or 'SQL_SERVER'
    'LAKEHOUSE_PATH': '/lakehouse/default/Tables/test_potential_failures',  # For validation
    'SQL_TABLE_NAME': 'customer_success.app_potential_failures_test',  # Final table
    
    # ─── Optional Features ───
    'GENERATE_STATUS_HISTORY': False,  # Generate WAPPR → APPR → COMP history files
    'INCLUDE_NON_KPI_CODES': False,  # Include non-KPI tasks
    # Applied to the number of tasks already generated when Step 6 runs.
    'NON_KPI_RATIO': 0.1,  # 10% non-KPI tasks if enabled
    
    # ─── Database Connection ───
    'SQL_CONNECTION_STRING': None,  # Set to None to use Fabric default
}

# Echo the headline settings so the run log records what was requested.
print("✓ Configuration loaded")
print(f"  Target Records: {CONFIG['TOTAL_RECORDS']:,}")
print(f"  Date Range: {CONFIG['START_DATE']} + {CONFIG['PERIOD_YEARS']} years")
print(f"  Output Mode: {CONFIG['OUTPUT_MODE']}")

## 2. Database Connection & Reference Data Loading

In [None]:
# ═══════════════════════════════════════════════════════════════
#  DATABASE CONNECTION
# ═══════════════════════════════════════════════════════════════

def get_connection():
    """Return a database handle for the configured environment.

    If ``CONFIG['SQL_CONNECTION_STRING']`` is truthy it is handed to
    ``pyodbc.connect`` and the resulting connection is returned. Otherwise
    the Fabric notebook utilities are imported lazily and the value of
    ``mssparkutils.credentials.getConnectionString()`` is returned.

    NOTE(review): the fallback helper is named ``getConnectionString`` -
    presumably it yields a string rather than a live connection; confirm
    before callers treat both branches alike.
    """
    custom_dsn = CONFIG['SQL_CONNECTION_STRING']
    if not custom_dsn:
        # No explicit connection string: fall back to the Fabric notebook helper.
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getConnectionString()
    return pyodbc.connect(custom_dsn)

def load_reference_data():
    """Load KPI codes, stations and the date dimension.

    The three tables are read through ``spark.sql(...).toPandas()``. If any
    step fails (including ``spark`` not being defined outside Fabric), the
    error is printed and mock data from ``create_mock_reference_data`` is
    returned instead, so the notebook can still run for demonstration.

    Returns:
        tuple: ``(kpi_codes, stations, date_dim)`` pandas DataFrames.
    """
    print("Loading reference data...")

    # Derive the date-dimension window from CONFIG instead of hard-coding it.
    # With the default settings (2025-05-25, 2 years) this reproduces the
    # previous literal range 2025-05-25 .. 2027-05-31; the extra 6 days leave
    # room for tasks that finish shortly after the nominal end date.
    window_start = pd.to_datetime(CONFIG['START_DATE'])
    window_end = window_start + timedelta(days=365 * CONFIG['PERIOD_YEARS'] + 6)

    # Load KPI Classification codes
    kpi_query = """
    SELECT DISTINCT 
        KPICode,
        KPIDescription,
        KPICategory,
        ThresholdHours
    FROM bronze.fms_dimkpiclassification
    WHERE IsKPI = 1
    ORDER BY KPICode
    """

    # Load Stations (excluding NULL sections/depots)
    station_query = """
    SELECT DISTINCT
        StationCode as Building,
        StationName as BuildingName,
        LocationName,
        StationSection
    FROM customer_success.dimStation
    WHERE StationSection IS NOT NULL
        AND StationCode IS NOT NULL
    ORDER BY StationCode
    """

    # Load Date Dimension with Period information
    date_query = f"""
    SELECT 
        Date,
        Period,
        PeriodWeek,
        PeriodYear,
        FinancialYear
    FROM core_dimdate
    WHERE Date BETWEEN '{window_start:%Y-%m-%d}' AND '{window_end:%Y-%m-%d}'
    ORDER BY Date
    """

    try:
        # Use Spark SQL in Fabric
        kpi_codes = spark.sql(kpi_query).toPandas()
        stations = spark.sql(station_query).toPandas()
        date_dim = spark.sql(date_query).toPandas()

        # toPandas() can hand DATE columns back as plain datetime.date objects
        # (object dtype). Period lookups compare this column with pandas
        # Timestamps, so normalise it to datetime64 once, here.
        date_dim['Date'] = pd.to_datetime(date_dim['Date'])

        print(f"  ✓ Loaded {len(kpi_codes)} KPI codes")
        print(f"  ✓ Loaded {len(stations)} stations")
        print(f"  ✓ Loaded {len(date_dim)} dates")

        return kpi_codes, stations, date_dim
    except Exception as e:
        # Deliberate best-effort fallback: any failure (no Spark session,
        # missing table, permissions) switches the notebook to mock data.
        print(f"  ⚠ Error loading reference data: {e}")
        print("  Using mock data for demonstration...")
        return create_mock_reference_data()

def create_mock_reference_data():
    """Build stand-in reference tables for runs without database access.

    Returns:
        tuple: ``(kpi_codes, stations, date_dim)`` pandas DataFrames holding
        10 KPI codes, 20 stations and 730 consecutive days starting at
        2025-05-25, with period columns derived by a simplified rule.
    """

    def get_period_info(date):
        # Simplified period logic (4-week periods), financial year from 1 April.
        fy_year = date.year if date.month >= 4 else date.year - 1
        elapsed_days = (date - pd.to_datetime(f'{fy_year}-04-01')).days
        period_no = min((elapsed_days // 28) + 1, 13)
        week_no = min((elapsed_days // 7) + 1, 52)
        return period_no, week_no, fy_year

    # Mock KPI codes
    kpi_codes = pd.DataFrame({
        'KPICode': ['GRAFFITI', 'TRACKSIDE', 'PLATFORM_CLEAN', 'LIFT_MAINT', 'ESCALATOR',
                    'LIGHTING', 'SIGNAGE', 'DRAINAGE', 'FIRE_SAFETY', 'ACCESS_CONTROL'],
        'KPIDescription': ['Graffiti Removal', 'Trackside Cleaning', 'Platform Cleaning',
                           'Lift Maintenance', 'Escalator Maintenance', 'Lighting Repair',
                           'Signage Updates', 'Drainage Maintenance', 'Fire Safety Checks',
                           'Access Control Maintenance'],
        'KPICategory': ['Cleaning', 'Cleaning', 'Cleaning', 'Mechanical', 'Mechanical',
                        'Electrical', 'Infrastructure', 'Infrastructure', 'Safety', 'Security'],
        'ThresholdHours': [24, 48, 24, 100, 100, 48, 24, 48, 24, 48],
    })

    # Mock stations
    station_codes = ['KGX', 'STN', 'LIV', 'MAN', 'BHM', 'EDI', 'GLA', 'LEE', 'BRI', 'CAR',
                     'OXF', 'CAM', 'YRK', 'NEW', 'SHE', 'NOR', 'IPS', 'PET', 'MKC', 'WAT']
    stations = pd.DataFrame({
        'Building': station_codes,
        'BuildingName': ['Kings Cross', 'Stratford', 'Liverpool Street', 'Manchester Piccadilly',
                         'Birmingham New Street', 'Edinburgh Waverley', 'Glasgow Central',
                         'Leeds Station', 'Bristol Temple Meads', 'Cardiff Central',
                         'Oxford Station', 'Cambridge Station', 'York Station', 'Newcastle Central',
                         'Sheffield Station', 'Norwich Station', 'Ipswich Station', 'Peterborough',
                         'Milton Keynes Central', 'Waterloo'],
        'LocationName': ['London', 'London', 'London', 'Manchester', 'Birmingham', 'Edinburgh',
                         'Glasgow', 'Leeds', 'Bristol', 'Cardiff', 'Oxford', 'Cambridge', 'York',
                         'Newcastle', 'Sheffield', 'Norwich', 'Ipswich', 'Peterborough',
                         'Milton Keynes', 'London'],
        # Every mock station sits in the same section.
        'StationSection': ['Main'] * len(station_codes),
    })

    # Mock date dimension: one row per day.
    date_dim = pd.DataFrame({
        'Date': pd.date_range(pd.to_datetime('2025-05-25'), periods=730, freq='D'),
    })

    # Compute the period triple once per day, then fan it out into columns.
    period_rows = [get_period_info(day) for day in date_dim['Date']]
    date_dim['Period'] = [f'P{period_no:02d}' for period_no, _, _ in period_rows]
    date_dim['PeriodWeek'] = [week_no for _, week_no, _ in period_rows]
    date_dim['PeriodYear'] = [fy_year for _, _, fy_year in period_rows]
    date_dim['FinancialYear'] = [f'FY{fy_year}/{str(fy_year + 1)[-2:]}' for _, _, fy_year in period_rows]

    for label, table in (('KPI codes', kpi_codes), ('stations', stations), ('dates', date_dim)):
        print(f"  ✓ Created {len(table)} mock {label}")

    return kpi_codes, stations, date_dim

# Load reference data (load_reference_data falls back to mock data on any error).
# The three frames are module-level globals read by the generation cells below.
kpi_codes_df, stations_df, date_dim_df = load_reference_data()

In [None]:
# Filter KPI codes based on configuration
if not CONFIG['USE_ALL_KPI_CODES'] and CONFIG['KPI_CODES_FILTER']:
    requested_codes = list(CONFIG['KPI_CODES_FILTER'])

    # Surface typos: codes requested but absent from the reference data.
    unknown_codes = sorted(set(requested_codes) - set(kpi_codes_df['KPICode']))
    if unknown_codes:
        print(f"  ⚠ KPI codes not found in reference data (ignored): {unknown_codes}")

    kpi_codes_df = kpi_codes_df[kpi_codes_df['KPICode'].isin(requested_codes)]

    # An empty KPI set would only fail later with an opaque ZeroDivisionError /
    # IndexError in the generation cells, so stop here with a clear message.
    if kpi_codes_df.empty:
        raise ValueError(
            f"KPI_CODES_FILTER {requested_codes} matches no KPI codes in the reference data"
        )

    print(f"Filtered to {len(kpi_codes_df)} KPI codes: {CONFIG['KPI_CODES_FILTER']}")
elif not CONFIG['USE_ALL_KPI_CODES']:
    # Filtering was requested but no codes were supplied: behaviour is unchanged
    # (all codes are used), but say so instead of staying silent.
    print("  ⚠ USE_ALL_KPI_CODES is False but KPI_CODES_FILTER is empty - using all KPI codes")

# Display reference data summary
print("\n" + "="*60)
print("REFERENCE DATA SUMMARY")
print("="*60)
print(f"\nKPI Codes ({len(kpi_codes_df)}):")
print(kpi_codes_df.head(10))
print(f"\nStations ({len(stations_df)}):")
print(stations_df.head(10))
print(f"\nDate Range: {date_dim_df['Date'].min()} to {date_dim_df['Date'].max()}")

## 3. Data Generation Functions

In [None]:
# ═══════════════════════════════════════════════════════════════
#  HELPER FUNCTIONS
# ═══════════════════════════════════════════════════════════════

def random_datetime(start_date, end_date):
    """Return a uniformly random timestamp between two bounds.

    Both bounds are parsed with ``pd.to_datetime``; the result is the start
    bound plus a random number of seconds drawn from ``random.uniform``
    over the span between them.
    """
    lower = pd.to_datetime(start_date)
    span_seconds = (pd.to_datetime(end_date) - lower).total_seconds()
    offset = timedelta(seconds=random.uniform(0, span_seconds))
    return lower + offset

def get_duration_category():
    """Pick one duration category name at random.

    Names and their ``'weight'`` values come from
    ``CONFIG['DURATION_CATEGORIES']``; the draw uses ``random.choices``.
    """
    spec = CONFIG['DURATION_CATEGORIES']
    names = list(spec.keys())
    return random.choices(names, weights=[spec[name]['weight'] for name in names])[0]

def generate_duration_hours(category=None):
    """Draw a task duration in hours for a duration category.

    Args:
        category: Key of ``CONFIG['DURATION_CATEGORIES']``. When ``None`` a
            category is chosen with ``get_duration_category``.

    Returns:
        float: Value drawn uniformly between the category's ``'min_hours'``
        and ``'max_hours'``.
    """
    chosen = get_duration_category() if category is None else category
    bounds = CONFIG['DURATION_CATEGORIES'][chosen]
    return random.uniform(bounds['min_hours'], bounds['max_hours'])

def get_period_info_for_date(date, date_dim_df):
    """Look up period information for a date in the date dimension.

    Args:
        date: Anything ``pd.to_datetime`` accepts; the time of day is dropped.
        date_dim_df: DataFrame with ``Date``, ``Period``, ``PeriodWeek`` and
            ``PeriodYear`` columns.

    Returns:
        tuple: ``(Period, PeriodWeek, PeriodYear)`` of the first matching row,
        or ``(None, None, None)`` when the date is missing/unparseable-as-null
        or has no row in the dimension.
    """
    date = pd.to_datetime(date)
    if pd.isna(date):
        # None / NaT input cannot match any row; avoid crashing on .normalize().
        return None, None, None
    date = date.normalize()

    dim_dates = date_dim_df['Date']
    if dim_dates.dtype.kind != 'M':
        # The column is not datetime64 (e.g. datetime.date objects coming from
        # Spark). Comparing those with a Timestamp does not match reliably, so
        # convert before comparing.
        dim_dates = pd.to_datetime(dim_dates)

    match = date_dim_df[dim_dates == date]
    if len(match) > 0:
        row = match.iloc[0]
        return row['Period'], row['PeriodWeek'], row['PeriodYear']
    return None, None, None

# Task IDs already handed out in this session. The ID suffix is only 8 hex
# digits (32 bits), so with tens of thousands of tasks a random collision is
# realistic; remembering issued IDs lets us retry instead of emitting a duplicate.
_ISSUED_TASK_IDS = set()


def generate_task_id():
    """Generate a task ID of the form ``TASK-XXXXXXXX`` (8 upper-case hex digits).

    The ID is unique among all IDs returned since this definition was last
    executed: a candidate that was already issued is discarded and redrawn.
    """
    while True:
        candidate = f"TASK-{uuid.uuid4().hex[:8].upper()}"
        if candidate not in _ISSUED_TASK_IDS:
            _ISSUED_TASK_IDS.add(candidate)
            return candidate

def generate_record_id():
    """Return a record ID: ``REC-`` followed by 12 upper-case hex digits of a random UUID."""
    suffix = uuid.uuid4().hex.upper()[:12]
    return "REC-" + suffix

# Sample data pools
# Fixed value pools that create_base_task draws from with random.choice.

# (display name, e-mail address) pairs; unpacked together so the reporter's
# name and e-mail always belong to the same person.
REPORTERS = [
    ('John Smith', 'john.smith@gts.com'),
    ('Sarah Johnson', 'sarah.johnson@gts.com'),
    ('Michael Brown', 'michael.brown@gts.com'),
    ('Emma Wilson', 'emma.wilson@gts.com'),
    ('David Lee', 'david.lee@gts.com'),
    ('Lisa Anderson', 'lisa.anderson@gts.com'),
    ('James Taylor', 'james.taylor@gts.com'),
    ('Sophie Martin', 'sophie.martin@gts.com'),
]

# Candidate values for the 'LoggedBy' column.
LOGGED_BY = [
    'System_Auto',
    'Maintenance_Team',
    'Operations_Manager',
    'Station_Manager',
    'Facilities_Team',
]

# Short-description candidates keyed by KPI code. The keys match the mock KPI
# codes; create_base_task falls back to a generic description for any code
# that has no entry here.
SHORT_DESCRIPTIONS = {
    'GRAFFITI': ['Graffiti on platform wall', 'Graffiti on ticket machine', 'Graffiti in waiting area'],
    'TRACKSIDE': ['Trackside debris removal', 'Trackside vegetation clearance', 'Trackside litter collection'],
    'PLATFORM_CLEAN': ['Platform cleaning required', 'Spillage cleanup', 'General platform maintenance'],
    'LIFT_MAINT': ['Lift routine maintenance', 'Lift repair required', 'Lift safety inspection'],
    'ESCALATOR': ['Escalator maintenance', 'Escalator cleaning', 'Escalator safety check'],
    'LIGHTING': ['Light bulb replacement', 'Lighting circuit repair', 'Emergency lighting check'],
    'SIGNAGE': ['Sign replacement', 'Directional sign update', 'Digital display repair'],
    'DRAINAGE': ['Drain clearance', 'Drainage system inspection', 'Gully cleaning'],
    'FIRE_SAFETY': ['Fire alarm test', 'Fire extinguisher check', 'Emergency exit inspection'],
    'ACCESS_CONTROL': ['Access barrier repair', 'Gate maintenance', 'Ticket barrier service'],
}

# The three SLA status labels. NOTE(review): not referenced elsewhere in the
# visible part of this notebook - create_base_task spells the labels out itself.
SLA_STATUS = ['Within SLA', 'Near SLA', 'SLA Breach']

print("✓ Helper functions defined")

In [None]:
# ═══════════════════════════════════════════════════════════════
#  CORE DATA GENERATION
# ═══════════════════════════════════════════════════════════════

def create_base_task(kpi_code, station, reported_date, duration_hours, 
                     duration_category='medium', is_year_spanning=False):
    """Create a single completed ('COMP') task record.

    Timeline built from ``reported_date``: scheduled 1-24 h after reporting,
    work starts 0-12 h after scheduling and finishes ``duration_hours`` later.
    ``DueBy`` is the reported time plus the KPI's ``ThresholdHours``; the SLA
    status compares reported-to-finished hours against that threshold.
    Period columns are looked up for the finished date.

    Args:
        kpi_code: KPI code; must exist in ``kpi_codes_df['KPICode']``.
        station: Mapping/row with ``Building``, ``BuildingName``,
            ``LocationName`` and ``StationSection``.
        reported_date: Reported timestamp (anything ``pd.to_datetime`` accepts).
        duration_hours: Hours between work start and finish.
        duration_category: Label written into the description and notes.
        is_year_spanning: Accepted for call compatibility; not used here.

    Returns:
        dict: One task record keyed by output column name.

    Raises:
        IndexError: If ``kpi_code`` is not present in ``kpi_codes_df``.
    """

    kpi_info = kpi_codes_df[kpi_codes_df['KPICode'] == kpi_code].iloc[0]

    # A value pulled out of a DataFrame row is typically a NumPy scalar, and
    # datetime.timedelta rejects numpy integers with a TypeError. Convert to a
    # plain float once; a missing threshold becomes NaN (-> DueBy is NaT and
    # the SLA comparisons below fall through to 'SLA Breach').
    raw_threshold = kpi_info['ThresholdHours']
    threshold_hours = float('nan') if pd.isna(raw_threshold) else float(raw_threshold)

    # Generate times
    reported_dt = pd.to_datetime(reported_date)
    scheduled_dt = reported_dt + timedelta(hours=random.uniform(1, 24))
    started_dt = scheduled_dt + timedelta(hours=random.uniform(0, 12))
    finished_dt = started_dt + timedelta(hours=duration_hours)

    # Due date based on KPI threshold (pd.to_timedelta maps NaN to NaT)
    due_dt = reported_dt + pd.to_timedelta(threshold_hours, unit='h')

    # Logged and modified dates
    logged_dt = reported_dt - timedelta(minutes=random.uniform(0, 30))
    modified_dt = finished_dt + timedelta(minutes=random.uniform(0, 60))

    # SLA status: measured from reporting to completion
    hours_to_complete = (finished_dt - reported_dt).total_seconds() / 3600
    if hours_to_complete <= threshold_hours * 0.8:
        sla_status = 'Within SLA'
    elif hours_to_complete <= threshold_hours:
        sla_status = 'Near SLA'
    else:
        sla_status = 'SLA Breach'

    # Get reporter
    reporter, email = random.choice(REPORTERS)

    # Get short description (generic text for codes without their own pool)
    short_desc = random.choice(SHORT_DESCRIPTIONS.get(kpi_code, ['Maintenance task required']))
    long_desc = f"{short_desc}. Duration: {duration_hours:.1f} hours. Category: {duration_category}."

    # Get period info from finished date (for reporting)
    period, period_week, period_year = get_period_info_for_date(finished_dt.date(), date_dim_df)

    task = {
        'TaskId': generate_task_id(),
        'RecordID': generate_record_id(),
        'Instruction_Code': kpi_code,
        'Building': station['Building'],
        'BuildingName': station['BuildingName'],
        'LocationName': station['LocationName'],
        'ShortDescription': short_desc,
        'LongDescription': long_desc,
        'Reporter': reporter,
        'ReporterEmail': email,
        'Notes': f'Generated test data - {duration_category} duration',
        'ReportedDate': reported_dt,
        'DueBy': due_dt,
        'ScheduledFor': scheduled_dt,
        'Finished': finished_dt,
        'Status': 'COMP',
        'LoggedBy': random.choice(LOGGED_BY),
        'LoggedOn': logged_dt,
        'ModifiedOn': modified_dt,
        'SLAStatus': sla_status,
        'CreatedTimestamp': logged_dt,
        'LastUploaded': datetime.now(),
        'IsCurrent': 1,
        'Period': period,
        'PeriodWeek': period_week,
        'PeriodYear': period_year,
        'StationSection': station['StationSection'],
        'KPIDescription': kpi_info['KPIDescription'],
        'KPICategory': kpi_info['KPICategory'],
    }

    return task

print("✓ Core generation functions defined")

## 4. Generate Test Data with Edge Cases

In [None]:
# ═══════════════════════════════════════════════════════════════
#  MAIN DATA GENERATION
# ═══════════════════════════════════════════════════════════════

print("Generating test data...")
print("="*60)

# Accumulates every generated task dict; the later steps append to it.
all_tasks = []

# Calculate distribution
kpi_codes_list = kpi_codes_df['KPICode'].tolist()
num_kpis = len(kpi_codes_list)
if num_kpis == 0:
    raise ValueError("No KPI codes available - check KPI_CODES_FILTER and the reference data")

# Apply frequency weights if configured (codes without an entry weigh 1.0)
kpi_weights = [CONFIG['KPI_FREQUENCY_WEIGHTS'].get(kpi, 1.0) for kpi in kpi_codes_list]

# Normalize weights
total_weight = sum(kpi_weights)
if total_weight <= 0:
    raise ValueError("KPI_FREQUENCY_WEIGHTS must add up to a positive value")
kpi_weights = [w / total_weight for w in kpi_weights]

# Calculate records per KPI
base_records = CONFIG['TOTAL_RECORDS']

# Reserve records for mandatory edge cases
mandatory_year_spanning = num_kpis * CONFIG['YEAR_SPANNING_PER_KPI']
mandatory_downtime = num_kpis * 3 if CONFIG['DOWNTIME_THRESHOLD_TESTS'] else 0
mandatory_total = mandatory_year_spanning + mandatory_downtime

# The later steps ADD tasks on top of the mandatory + regular ones:
#   Step 4: int(tasks so far * PERIOD_BOUNDARY_CROSSING_RATIO) period-crossing tasks
#   Step 5: OVERLAPPING_GROUPS_COUNT groups of 2-4 tasks (3 on average)
#   Step 6: int(tasks so far * NON_KPI_RATIO) non-KPI tasks (when enabled)
# Size the regular-task budget so the grand total lands near TOTAL_RECORDS
# instead of overshooting it by those additions.
crossing_ratio = CONFIG['PERIOD_BOUNDARY_CROSSING_RATIO']
expected_overlap = CONFIG['OVERLAPPING_GROUPS_COUNT'] * 3 if CONFIG['CREATE_OVERLAPPING_GROUPS'] else 0
non_kpi_ratio = CONFIG['NON_KPI_RATIO'] if CONFIG['INCLUDE_NON_KPI_CODES'] else 0.0
pre_crossing_budget = (base_records / (1 + non_kpi_ratio) - expected_overlap) / (1 + crossing_ratio)

# Never go negative when the target is smaller than the mandatory edge cases.
remaining_records = max(0, int(pre_crossing_budget) - mandatory_total)

print(f"Total target records: {CONFIG['TOTAL_RECORDS']:,}")
print(f"Mandatory edge cases: {mandatory_total:,}")
print(f"  - Year-spanning: {mandatory_year_spanning}")
print(f"  - Downtime tests: {mandatory_downtime}")
print(f"Remaining for distribution: {remaining_records:,}")
print()

# Date range
start_date = pd.to_datetime(CONFIG['START_DATE'])
end_date = start_date + timedelta(days=365 * CONFIG['PERIOD_YEARS'])

# Financial year boundary (assuming April 1st)
fy_boundary = pd.to_datetime(f"{start_date.year + 1}-04-01")

print(f"Date range: {start_date.date()} to {end_date.date()}")
print(f"Financial year boundary: {fy_boundary.date()}")
print()

In [None]:
# ═══════════════════════════════════════════════════════════════
#  STEP 1: Generate Year-Spanning Tasks (Mandatory)
# ═══════════════════════════════════════════════════════════════

print("Step 1: Generating year-spanning tasks...")

# One mandatory batch per KPI code: reported in the 90 days before the
# financial-year boundary and long enough to finish after it.
for kpi_code in kpi_codes_list:
    spanning_per_kpi = CONFIG['YEAR_SPANNING_PER_KPI']
    for _ in range(spanning_per_kpi):
        # Start before FY boundary, end after
        window_open = fy_boundary - timedelta(days=90)
        window_close = fy_boundary - timedelta(days=1)
        start_before_fy = random_datetime(window_open, window_close)

        # Minimum duration reaches the boundary plus a further 24 hours.
        hours_to_boundary = (fy_boundary - start_before_fy).total_seconds() / 3600
        min_duration = hours_to_boundary + 24
        duration = random.uniform(min_duration, min_duration + 200)

        station = stations_df.sample(1).iloc[0]
        task = create_base_task(
            kpi_code,
            station,
            start_before_fy,
            duration,
            duration_category='long',
            is_year_spanning=True,
        )
        task['Notes'] = 'Year-spanning task - crosses FY boundary'
        all_tasks.append(task)

print(f"  ✓ Generated {len(all_tasks)} year-spanning tasks")

In [None]:
# ═══════════════════════════════════════════════════════════════
#  STEP 2: Generate Downtime Threshold Tests (Optional)
# ═══════════════════════════════════════════════════════════════

if CONFIG['DOWNTIME_THRESHOLD_TESTS']:
    print("\nStep 2: Generating downtime threshold test tasks...")

    threshold_hours = [24, 48, 100]
    # Counted locally so the summary stays right even if Step 1 was re-run.
    threshold_tasks_created = 0

    for kpi_code in kpi_codes_list:
        for threshold in threshold_hours:
            # The task must start in year 1 and close in year 2. Report it
            # exactly `threshold` hours before the financial-year boundary so
            # that a full threshold's worth of elapsed time falls in year 1.
            # (A start 30-120 days before the boundary, as used previously,
            # can never reach the boundary with durations of at most ~200 h.)
            start_in_y1 = fy_boundary - timedelta(hours=threshold)

            # Use some or all of year 2 threshold
            y2_usage = random.uniform(threshold * 0.5, threshold)
            total_duration = threshold + y2_usage

            station = stations_df.sample(1).iloc[0]
            # create_base_task adds its own scheduling lead before the work
            # starts, so the task finishes after the boundary by y2_usage plus
            # that lead.
            task = create_base_task(
                kpi_code, station, start_in_y1, total_duration,
                duration_category='long'
            )
            task['Notes'] = f'Downtime threshold test - {threshold}h threshold, Y1+Y2 rollover'
            all_tasks.append(task)
            threshold_tasks_created += 1

    print(f"  ✓ Generated {threshold_tasks_created} threshold test tasks")
else:
    print("\nStep 2: Skipped (downtime threshold tests disabled)")

In [None]:
# ═══════════════════════════════════════════════════════════════
#  STEP 3: Generate Regular Tasks with Duration Mix
# ═══════════════════════════════════════════════════════════════

print("\nStep 3: Generating regular tasks with mixed durations...")

tasks_before_regular = len(all_tasks)

# Distribute remaining records across KPIs according to their normalised weight
for kpi_code, kpi_share in zip(kpi_codes_list, kpi_weights):
    num_tasks = int(remaining_records * kpi_share)

    # Ensure each duration category is represented; 'long' takes whatever the
    # rounded-down short and medium counts leave over.
    category_spec = CONFIG['DURATION_CATEGORIES']
    short_count = int(num_tasks * category_spec['short']['weight'])
    medium_count = int(num_tasks * category_spec['medium']['weight'])
    counts_by_category = {
        'short': short_count,
        'medium': medium_count,
        'long': num_tasks - short_count - medium_count,
    }

    for category, count in counts_by_category.items():
        for _ in range(count):
            # Leave the last 30 days of the range free of new reports.
            latest_report = end_date - timedelta(days=30)
            reported_date = random_datetime(start_date, latest_report)
            duration = generate_duration_hours(category)
            station = stations_df.sample(1).iloc[0]

            all_tasks.append(
                create_base_task(
                    kpi_code, station, reported_date, duration,
                    duration_category=category,
                )
            )

regular_tasks = len(all_tasks) - tasks_before_regular
print(f"  ✓ Generated {regular_tasks:,} regular tasks")
print(f"  Total tasks so far: {len(all_tasks):,}")

In [None]:
# ═══════════════════════════════════════════════════════════════
#  STEP 4: Generate Period-Crossing Tasks
# ═══════════════════════════════════════════════════════════════

print("\nStep 4: Ensuring period-crossing tasks...")

# Get unique periods from date dimension: first and last date of every
# (Period, PeriodYear) combination.
periods = date_dim_df.groupby(['Period', 'PeriodYear'])['Date'].agg(['min', 'max']).reset_index()
periods.columns = ['Period', 'PeriodYear', 'PeriodStart', 'PeriodEnd']

target_period_crossing = int(len(all_tasks) * CONFIG['PERIOD_BOUNDARY_CROSSING_RATIO'])
period_crossing_created = 0

# Create tasks that start in one period and end in another
while period_crossing_created < target_period_crossing:
    # Pick a random period boundary
    chosen_period = periods.sample(1).iloc[0]
    period_end = pd.to_datetime(chosen_period['PeriodEnd'])

    # Reported within the final week of the period, at least an hour before its end
    earliest_start = period_end - timedelta(days=7)
    latest_start = period_end - timedelta(hours=1)
    start_date_task = random_datetime(earliest_start, latest_start)

    # Duration to cross period: reach the period end plus at least 24 hours
    hours_to_period_end = (period_end - start_date_task).total_seconds() / 3600
    min_duration = hours_to_period_end + 24
    duration = random.uniform(min_duration, min_duration + 48)

    kpi_code = random.choice(kpi_codes_list)
    station = stations_df.sample(1).iloc[0]

    task = create_base_task(
        kpi_code,
        station,
        start_date_task,
        duration,
        duration_category='medium',
    )
    task['Notes'] = 'Period-crossing task - crosses period boundary'
    all_tasks.append(task)
    period_crossing_created += 1

print(f"  ✓ Generated {period_crossing_created} period-crossing tasks")
print(f"  Total tasks: {len(all_tasks):,}")

In [None]:
# ═══════════════════════════════════════════════════════════════
#  STEP 5: Generate Overlapping Groups (Duplicate Testing)
# ═══════════════════════════════════════════════════════════════

if not CONFIG['CREATE_OVERLAPPING_GROUPS']:
    print("\nStep 5: Skipped (overlapping groups disabled)")
else:
    print("\nStep 5: Generating overlapping task groups for duplicate testing...")

    overlap_created = 0

    for _ in range(CONFIG['OVERLAPPING_GROUPS_COUNT']):
        # Every task in a group shares one station and one KPI code
        station = stations_df.sample(1).iloc[0]
        kpi_code = random.choice(kpi_codes_list)

        # Common anchor time for the group
        base_time = random_datetime(start_date, end_date - timedelta(days=30))

        # Create 2-4 overlapping tasks
        num_overlap = random.randint(2, 4)

        for sequence_no in range(1, num_overlap + 1):
            # Vary time within overlap window
            jitter_hours = random.uniform(0, CONFIG['OVERLAP_WINDOW_HOURS'])
            task_time = base_time + timedelta(hours=jitter_hours)
            duration = generate_duration_hours('short')

            task = create_base_task(
                kpi_code,
                station,
                task_time,
                duration,
                duration_category='short',
            )
            task['Notes'] = f'Overlapping group - duplicate test #{sequence_no}'
            all_tasks.append(task)
            overlap_created += 1

    print(f"  ✓ Generated {overlap_created} tasks in overlapping groups")
    print(f"  Total tasks: {len(all_tasks):,}")

In [None]:
# ═══════════════════════════════════════════════════════════════
#  STEP 6: Generate Non-KPI Tasks (Optional)
# ═══════════════════════════════════════════════════════════════

if CONFIG['INCLUDE_NON_KPI_CODES']:
    print("\nStep 6: Generating non-KPI tasks...")

    non_kpi_count = int(len(all_tasks) * CONFIG['NON_KPI_RATIO'])

    # Mock non-KPI codes
    non_kpi_codes = ['GENERAL', 'ADMIN', 'INSPECTION', 'OTHER']

    # Generic wording for non-KPI tasks; without it the record would keep the
    # description of the KPI code that is only borrowed as a template below.
    generic_description = 'Maintenance task required'

    for _ in range(non_kpi_count):
        reported_date = random_datetime(start_date, end_date - timedelta(days=30))
        # Pick the category explicitly so the label written into the record
        # matches the duration that was actually drawn.
        category = get_duration_category()
        duration = generate_duration_hours(category)
        station = stations_df.sample(1).iloc[0]
        non_kpi_code = random.choice(non_kpi_codes)

        # Create simplified task for non-KPI. The first KPI code serves as a
        # template only; DueBy and SLAStatus therefore still reflect that
        # template's threshold.
        task = create_base_task(
            kpi_codes_list[0], station, reported_date, duration,
            duration_category=category
        )
        task['Instruction_Code'] = non_kpi_code
        task['ShortDescription'] = generic_description
        task['LongDescription'] = (
            f"{generic_description}. Duration: {duration:.1f} hours. Category: {category}."
        )
        task['KPIDescription'] = f'Non-KPI: {non_kpi_code}'
        task['KPICategory'] = 'Non-KPI'
        task['Notes'] = 'Non-KPI task - for testing'
        all_tasks.append(task)

    print(f"  ✓ Generated {non_kpi_count} non-KPI tasks")
    print(f"  Total tasks: {len(all_tasks):,}")
else:
    print("\nStep 6: Skipped (non-KPI tasks disabled)")

In [None]:
# ═══════════════════════════════════════════════════════════════
#  CREATE FINAL DATAFRAME
# ═══════════════════════════════════════════════════════════════

print("\n" + "="*60)
print("Creating final dataset...")

# One row per generated task dict
df_test_data = pd.DataFrame(all_tasks)

earliest_reported = df_test_data['ReportedDate'].min()
latest_finished = df_test_data['Finished'].max()
status_counts = df_test_data['Status'].value_counts().to_dict()

print(f"\n✓ Generated {len(df_test_data):,} total records")
print(f"\nData Summary:")
print(f"  Date Range: {earliest_reported} to {latest_finished}")
print(f"  Unique KPI Codes: {df_test_data['Instruction_Code'].nunique()}")
print(f"  Unique Stations: {df_test_data['Building'].nunique()}")
print(f"  Status: {status_counts}")

print(f"\nRecords per KPI Code:")
records_per_code = df_test_data['Instruction_Code'].value_counts().sort_index()
print(records_per_code)

print(f"\nDuration Statistics (hours):")
# Stored on the frame as an extra column: hours from ScheduledFor to Finished
scheduled_to_finished = df_test_data['Finished'] - df_test_data['ScheduledFor']
df_test_data['DurationHours'] = scheduled_to_finished.dt.total_seconds() / 3600
print(df_test_data['DurationHours'].describe())

# Display sample
print(f"\nSample Records:")
display(df_test_data.head(10))

## 5. Data Validation

In [None]:
# ═══════════════════════════════════════════════════════════════
#  DATA VALIDATION
# ═══════════════════════════════════════════════════════════════

print("Running validation checks...")
print("="*60)

# Human-readable descriptions of every failed check; reported at the end.
validation_issues = []


def _columns_present(*column_names):
    """Return True when every named column exists in df_test_data."""
    return all(name in df_test_data.columns for name in column_names)


# Check 1: All required columns present
required_columns = [
    'TaskId', 'RecordID', 'Instruction_Code', 'Building', 'BuildingName',
    'LocationName', 'ShortDescription', 'LongDescription', 'Reporter',
    'ReporterEmail', 'Notes', 'ReportedDate', 'DueBy', 'ScheduledFor',
    'Finished', 'Status', 'LoggedBy', 'LoggedOn', 'ModifiedOn',
    'SLAStatus', 'CreatedTimestamp', 'LastUploaded', 'IsCurrent',
    'Period', 'PeriodWeek', 'PeriodYear', 'StationSection',
    'KPIDescription', 'KPICategory'
]

missing_cols = set(required_columns) - set(df_test_data.columns)
if missing_cols:
    validation_issues.append(f"Missing columns: {missing_cols}")
else:
    print("✓ All required columns present")

# Checks that need a missing column are skipped rather than raising KeyError;
# the missing column itself is already reported by check 1.

# Check 2: No null values in critical columns
critical_columns = ['TaskId', 'RecordID', 'Instruction_Code', 'Building', 'Status', 'Finished']
null_issue_found = False
for col in critical_columns:
    if not _columns_present(col):
        continue
    null_count = df_test_data[col].isna().sum()
    if null_count > 0:
        validation_issues.append(f"Null values in {col}: {null_count}")
        null_issue_found = True

if not null_issue_found:
    print("✓ No null values in critical columns")

# Check 3: Date logic
if _columns_present('Finished', 'ReportedDate', 'ScheduledFor'):
    date_issues = df_test_data[
        (df_test_data['Finished'] < df_test_data['ReportedDate']) |
        (df_test_data['ScheduledFor'] < df_test_data['ReportedDate'])
    ]
    if len(date_issues) > 0:
        validation_issues.append(f"Invalid date sequences: {len(date_issues)} records")
    else:
        print("✓ Date logic is valid")

# Check 4: Year-spanning tasks
fy_boundary = pd.to_datetime(f"{start_date.year + 1}-04-01")
if _columns_present('ReportedDate', 'Finished'):
    year_spanning = df_test_data[
        (df_test_data['ReportedDate'] < fy_boundary) &
        (df_test_data['Finished'] > fy_boundary)
    ]
    if len(year_spanning) < mandatory_year_spanning:
        # Fewer boundary-crossing tasks than the mandatory minimum is a failure,
        # not something to report with a tick.
        validation_issues.append(
            f"Year-spanning tasks below target: {len(year_spanning)} (target: {mandatory_year_spanning})"
        )
    else:
        print(f"✓ Year-spanning tasks: {len(year_spanning)} (target: {mandatory_year_spanning})")

# Check 5: Status distribution
if CONFIG['ALL_COMPLETED'] and _columns_present('Status'):
    non_comp = df_test_data[df_test_data['Status'] != 'COMP']
    if len(non_comp) > 0:
        validation_issues.append(f"Non-COMP status found: {len(non_comp)} records")
    else:
        print("✓ All tasks have COMP status")

# Check 6: Record count vs target
variance = abs(len(df_test_data) - CONFIG['TOTAL_RECORDS']) / CONFIG['TOTAL_RECORDS']
if variance > 0.1:  # More than 10% variance
    validation_issues.append(f"Record count variance: {variance*100:.1f}%")
else:
    print(f"✓ Record count within target (variance: {variance*100:.1f}%)")

# Check 7: Unique IDs
if _columns_present('TaskId', 'RecordID'):
    dup_tasks = df_test_data['TaskId'].duplicated().sum()
    dup_records = df_test_data['RecordID'].duplicated().sum()
    if dup_tasks > 0 or dup_records > 0:
        validation_issues.append(f"Duplicate IDs - Tasks: {dup_tasks}, Records: {dup_records}")
    else:
        print("✓ All IDs are unique")

print("\n" + "="*60)
if validation_issues:
    print("⚠ VALIDATION ISSUES FOUND:")
    for issue in validation_issues:
        print(f"  - {issue}")
else:
    print("✅ ALL VALIDATION CHECKS PASSED")
print("="*60)

## 6. Output to Lakehouse / SQL Server

In [None]:
# ═══════════════════════════════════════════════════════════════
#  OUTPUT TO LAKEHOUSE (FOR VALIDATION)
# ═══════════════════════════════════════════════════════════════

if CONFIG['OUTPUT_MODE'] == 'LAKEHOUSE':
    print("Writing to Lakehouse for validation...")

    try:
        # Convert to Spark DataFrame
        spark_df = spark.createDataFrame(df_test_data)

        # Write to Lakehouse as a Delta table, replacing any previous run
        delta_writer = spark_df.write.mode("overwrite").format("delta")
        delta_writer.save(CONFIG['LAKEHOUSE_PATH'])

        print(f"✓ Data written to {CONFIG['LAKEHOUSE_PATH']}")
        print(f"  Records: {len(df_test_data):,}")
        print("\n⚠ IMPORTANT: Review the data in Lakehouse before writing to SQL Server")
        print("  To write to SQL Server, change OUTPUT_MODE to 'SQL_SERVER' and re-run")

    except Exception as write_error:
        # Any failure above falls back to a CSV copy under Files.
        print(f"⚠ Error writing to Lakehouse: {write_error}")
        print("Saving to CSV as backup...")
        csv_path = "/lakehouse/default/Files/test_potential_failures.csv"
        df_test_data.to_csv(csv_path, index=False)
        print(f"✓ Data saved to {csv_path}")

In [None]:
# ═══════════════════════════════════════════════════════════════
#  OUTPUT TO SQL SERVER (AFTER VALIDATION)
# ═══════════════════════════════════════════════════════════════

# Runs only in SQL_SERVER mode: push the generated frame to
# CONFIG['SQL_TABLE_NAME'] through Spark's JDBC writer.
if CONFIG['OUTPUT_MODE'] == 'SQL_SERVER':
    print("Writing to SQL Server...")
    print(f"Target table: {CONFIG['SQL_TABLE_NAME']}")

    try:
        # Hand the pandas frame to Spark.
        spark_df = spark.createDataFrame(df_test_data)

        # NOTE(review): "overwrite" replaces whatever the target table currently
        # holds - confirm that is intended before pointing this at a shared table.
        jdbc_writer = spark_df.write.format("jdbc").mode("overwrite")

        # The URL, user and password below are placeholders that must be edited
        # before this cell can connect.
        # NOTE(review): prefer a secret store over pasting real credentials here.
        for option_name, option_value in (
            ("url", "jdbc:sqlserver://<your_server>.database.windows.net:1433;database=<your_db>"),
            ("dbtable", CONFIG['SQL_TABLE_NAME']),
            ("user", "<username>"),
            ("password", "<password>"),
        ):
            jdbc_writer = jdbc_writer.option(option_name, option_value)
        jdbc_writer.save()

        print("✓ Data written to SQL Server")
        print(f"  Table: {CONFIG['SQL_TABLE_NAME']}")
        print(f"  Records: {len(df_test_data):,}")

    except Exception as write_error:
        # Report the failure and point the operator at the connection settings.
        print(f"⚠ Error writing to SQL Server: {write_error}")
        print("Please update JDBC connection details in the code")

## 7. Optional: Generate Status History Files

In [None]:
# ═══════════════════════════════════════════════════════════════
#  GENERATE STATUS HISTORY (WAPPR → APPR → COMP)
# ═══════════════════════════════════════════════════════════════

# Optional step: expand a sample of tasks into three snapshots each
# (WAPPR, APPR and the original row) and save the result as a CSV file.
if not CONFIG['GENERATE_STATUS_HISTORY']:
    print("Status history generation skipped (disabled in config)")
else:
    print("Generating status history files...")

    # History is built for a random sample of at most 1,000 rows.
    sample_size = min(1000, len(df_test_data))
    history_tasks = df_test_data.sample(sample_size).copy()

    all_history = []
    one_hour = timedelta(hours=1)

    for _, final_row in history_tasks.iterrows():
        # Earlier stages, each paired with the timestamp it is stamped with:
        #   WAPPR (waiting approval) - one hour after LoggedOn
        #   APPR  (approved)         - one hour before ScheduledFor
        # NOTE(review): if ScheduledFor is within two hours of LoggedOn, the APPR
        # timestamp falls before the WAPPR one - confirm whether that can occur.
        earlier_stages = (
            ('WAPPR', final_row['LoggedOn'] + one_hour),
            ('APPR', final_row['ScheduledFor'] - one_hour),
        )
        for stage_status, stage_time in earlier_stages:
            snapshot = final_row.copy()
            snapshot['Status'] = stage_status
            snapshot['Finished'] = pd.NaT  # not finished yet at this stage
            snapshot['ModifiedOn'] = stage_time
            snapshot['LastUploaded'] = snapshot['ModifiedOn']
            all_history.append(snapshot)

        # Final stage: an unmodified copy of the sampled row.
        all_history.append(final_row.copy())

    # One frame for all snapshots, ordered per task by modification time.
    df_history = pd.DataFrame(all_history).sort_values(['TaskId', 'ModifiedOn'])

    # Persist the history next to the other generated files.
    history_path = "/lakehouse/default/Files/test_potential_failures_history.csv"
    df_history.to_csv(history_path, index=False)

    print(f"✓ Generated {len(df_history):,} history records for {sample_size} tasks")
    print(f"  Saved to: {history_path}")
    print(f"  Status breakdown: {df_history['Status'].value_counts().to_dict()}")

## 8. Summary & Next Steps

In [None]:
# ═══════════════════════════════════════════════════════════════
#  FINAL SUMMARY
# ═══════════════════════════════════════════════════════════════

# Horizontal rule reused at the top and bottom of the report.
summary_rule = "=" * 60

print("\n" + summary_rule)
print("GENERATION COMPLETE")
print(summary_rule)

# Headline figures read straight from the generated frame.
print("\n📊 DATA SUMMARY")
print(f"  Total Records: {len(df_test_data):,}")
print(f"  Date Range: {df_test_data['ReportedDate'].min().date()} to {df_test_data['Finished'].max().date()}")
print(f"  KPI Codes: {df_test_data['Instruction_Code'].nunique()}")
print(f"  Stations: {df_test_data['Building'].nunique()}")

# Edge-case counters; the two optional lines appear only when their feature is enabled.
print("\n✅ EDGE CASES COVERED")
print(f"  Year-spanning tasks: {len(year_spanning)}")
print(f"  Period-crossing tasks: {period_crossing_created}")
if CONFIG['CREATE_OVERLAPPING_GROUPS']:
    print(f"  Overlapping task groups: {CONFIG['OVERLAPPING_GROUPS_COUNT']}")
if CONFIG['DOWNTIME_THRESHOLD_TESTS']:
    print(f"  Downtime threshold tests: {mandatory_downtime}")

# Where the data went: any mode other than LAKEHOUSE reports the SQL table name.
print("\n📁 OUTPUT")
if CONFIG['OUTPUT_MODE'] != 'LAKEHOUSE':
    print(f"  Table: {CONFIG['SQL_TABLE_NAME']}")
else:
    print(f"  Location: {CONFIG['LAKEHOUSE_PATH']}")
    print("  ⚠ Review data before writing to SQL Server")

# Fixed checklist for moving from Lakehouse review to the SQL Server write.
print("\n🔧 NEXT STEPS")
for step_line in (
    "  1. Review the generated data in Lakehouse",
    "  2. Verify edge cases and date distributions",
    "  3. Run additional validation queries if needed",
    "  4. Change OUTPUT_MODE to 'SQL_SERVER' when ready",
    "  5. Update JDBC connection details in cell above",
    "  6. Re-run to write to SQL Server",
):
    print(step_line)

print("\n" + summary_rule)