# Potential Failures Test Data Generator

This notebook generates scalable test data for the app_potential_failures schema with configurable parameters.

## Features:
- Configurable KPI code selection (all or specific codes)
- Variable task durations (short, medium, long) for each KPI group
- Financial year spanning tasks with staggered dates
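- Period boundary crossing (arises naturally from task durations; the validation step reports how many tasks cross a period boundary)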
- Station distribution across all stations (excluding NULL sections)
- Overlapping tasks for duplicate testing
- Scalable record count (default ~15k)
- All tasks in COMP status
- Optional: Status progression simulation (WAPPR -> APPR -> COMP)
- Optional: Non-KPI code tasks (configuration flags only; no cell in this notebook generates them yet)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from typing import List, Dict, Optional, Tuple
import pyodbc
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
import warnings
warnings.filterwarnings('ignore')

## Configuration Parameters

In [None]:
# ----------------------------------------------------------------------------
# GENERATOR SETTINGS
# ----------------------------------------------------------------------------
# Single mapping holding every tunable of the notebook. Later cells read it by
# key, so key names and value types must not change.

CONFIG = dict(
    # -- Data generation -----------------------------------------------------
    TOTAL_RECORDS=15000,                 # number of task records to produce
    START_DATE='2025-05-25',             # GTS started EL date
    END_DATE='2027-05-25',               # two years after START_DATE

    # -- KPI code selection --------------------------------------------------
    USE_ALL_KPI_CODES=True,              # False -> restrict to SELECTED_KPI_CODES
    SELECTED_KPI_CODES=[],               # codes to keep when USE_ALL_KPI_CODES is False
    INCLUDE_NON_KPI_TASKS=False,         # nice-to-have: generate non-KPI tasks
    NON_KPI_TASK_PERCENTAGE=0.05,        # share of non-KPI tasks (5%) if enabled

    # -- Duration buckets, as (min_hours, max_hours) -------------------------
    SHORT_DURATION_RANGE=(1, 24),        # 1-24 hours
    MEDIUM_DURATION_RANGE=(25, 168),     # 1-7 days
    LONG_DURATION_RANGE=(169, 720),      # 7-30 days
    VERY_LONG_DURATION_RANGE=(721, 2160),  # 30-90 days, for edge cases

    # -- Relative weight of each duration bucket per KPI code group ----------
    DURATION_DISTRIBUTION=dict(
        short=0.40,       # 40%
        medium=0.35,      # 35%
        long=0.20,        # 20%
        very_long=0.05,   # 5%
    ),

    # -- Financial year ------------------------------------------------------
    FINANCIAL_YEAR_END='03-31',          # MM-DD (March 31st)
    ENSURE_FY_SPANNING_TASKS=True,       # one FY-spanning task per KPI code
    STAGGERED_ROLLOVER_TASKS=True,       # tasks that test threshold rollover

    # -- Duplicate testing ---------------------------------------------------
    DUPLICATE_TEST_PERCENTAGE=0.10,      # 10% of tasks get a close overlap
    DUPLICATE_TIME_WINDOW_HOURS=4,       # overlap offset is within +/- 4 hours

    # -- Station distribution ------------------------------------------------
    EXCLUDE_NULL_SECTIONS=True,          # drop depots / NULL sections

    # -- Output targets ------------------------------------------------------
    OUTPUT_TO_LH=True,                   # write to the Lakehouse for validation
    LH_TABLE_NAME='test_potential_failures_validation',
    OUTPUT_TO_SQL=False,                 # enable only after validation
    SQL_TABLE_NAME='app_potential_failures_test',

    # -- Status progression simulation (nice-to-have) ------------------------
    SIMULATE_STATUS_PROGRESSION=False,   # emit one snapshot per status step
    STATUS_PROGRESSION_STEPS=['WAPPR', 'APPR', 'COMP'],

    # -- Task frequency weights by KPI code ----------------------------------
    # An empty dict means every KPI code gets an equal share.
    # Example: {'GRAFFITI': 2.0, 'TRACKSIDE_CLEAN': 1.5, 'OTHER': 1.0}
    KPI_FREQUENCY_WEIGHTS={},

    # -- Database connection -------------------------------------------------
    SQL_SERVER='your_server.database.windows.net',
    SQL_DATABASE='your_database',
    SQL_USERNAME='your_username',
    SQL_PASSWORD='your_password',        # use Azure Key Vault in production
)

# Echo the headline settings so the run log shows what was used.
print("Configuration loaded successfully!")
print(f"Target records: {CONFIG['TOTAL_RECORDS']}")
print(f"Date range: {CONFIG['START_DATE']} to {CONFIG['END_DATE']}")
print(f"Use all KPI codes: {CONFIG['USE_ALL_KPI_CODES']}")

## Initialize Spark Session

In [None]:
# Get the Spark session for Fabric/Databricks; getOrCreate() hands back the
# already-active session when the platform has started one.
spark = (
    SparkSession.builder
    .appName("PotentialFailuresTestDataGenerator")
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")

## Load Reference Data

In [None]:
# ============================================================================
# REFERENCE DATA: KPI CLASSIFICATION
# ============================================================================

# The KPI classification rows decide which KPI codes tasks are generated for.
print("Loading KPI classification data...")
kpi_classification_query = """
SELECT DISTINCT 
    KPICode,
    KPIDescription,
    KPICategory,
    AnnualThresholdHours
FROM bronze.fms_dimkpiclassification
WHERE KPICode IS NOT NULL
    AND IsKPI = 1  -- Only KPI codes
ORDER BY KPICode
"""

try:
    # Preferred source: the SQL endpoint (Fabric environment).
    df_kpi = (
        spark.read
        .format("sqlserver")
        .option("query", kpi_classification_query)
        .load()
    )
    kpi_codes_df = df_kpi.toPandas()
except Exception as e:
    # Any failure falls back to a small built-in sample so the notebook can
    # still run end to end.
    print(f"Note: Could not load from SQL. Using sample KPI codes. Error: {e}")
    sample_kpi_rows = [
        ('GRAFFITI', 'Graffiti Removal', 'Cleaning', 24),
        ('TRACKSIDE_CLEAN', 'Trackside Cleaning', 'Cleaning', 48),
        ('ESCALATOR_REPAIR', 'Escalator Repair', 'Maintenance', 100),
        ('LIFT_MAINTENANCE', 'Lift Maintenance', 'Maintenance', 100),
        ('STATION_CLEAN', 'Station Cleaning', 'Cleaning', 48),
        ('PLATFORM_REPAIR', 'Platform Repair', 'Maintenance', 72),
        ('LIGHTING_FIX', 'Lighting Fix', 'Maintenance', 24),
        ('SIGNAGE_UPDATE', 'Signage Update', 'Maintenance', 24),
    ]
    kpi_codes_df = pd.DataFrame(
        sample_kpi_rows,
        columns=['KPICode', 'KPIDescription', 'KPICategory', 'AnnualThresholdHours'],
    )

print(f"Loaded {len(kpi_codes_df)} KPI codes")

# Narrow to the configured subset; an empty selection list leaves all codes in.
selected_codes = CONFIG['SELECTED_KPI_CODES']
if selected_codes and not CONFIG['USE_ALL_KPI_CODES']:
    keep_mask = kpi_codes_df['KPICode'].isin(selected_codes)
    kpi_codes_df = kpi_codes_df[keep_mask]
    print(f"Using {len(kpi_codes_df)} selected KPI codes")

# One dict per KPI code, which is the shape the generation cells consume.
kpi_codes = kpi_codes_df.to_dict('records')
print("Sample KPI codes:", [entry['KPICode'] for entry in kpi_codes[:5]])

In [None]:
# Station reference data: every generated task is attached to one station.
print("\nLoading station data...")
station_query = """
SELECT DISTINCT
    StationCode,
    StationName,
    Section
FROM customer_success.dimStation
WHERE StationCode IS NOT NULL
ORDER BY StationCode
"""

try:
    df_stations = (
        spark.read
        .format("sqlserver")
        .option("query", station_query)
        .load()
    )
    stations_df = df_stations.toPandas()

    # Rows without a Section are depots; drop them when configured to.
    if CONFIG['EXCLUDE_NULL_SECTIONS']:
        has_section = stations_df['Section'].notna()
        stations_df = stations_df[has_section]
        print("Excluded stations with NULL sections (depots)")
except Exception as e:
    # Fall back to a fixed sample when the SQL source is unavailable.
    print(f"Note: Could not load from SQL. Using sample stations. Error: {e}")
    sample_station_rows = [
        ('KGX', 'Kings Cross', 'North'),
        ('SPX', 'St Pancras', 'North'),
        ('LBG', 'London Bridge', 'South'),
        ('VIC', 'Victoria', 'South'),
        ('WAT', 'Waterloo', 'South'),
        ('EUS', 'Euston', 'North'),
        ('PAD', 'Paddington', 'West'),
        ('LST', 'Liverpool Street', 'East'),
    ]
    stations_df = pd.DataFrame(
        sample_station_rows,
        columns=['StationCode', 'StationName', 'Section'],
    )

# One dict per station, which is the shape the generation cells consume.
stations = stations_df.to_dict('records')
print(f"Loaded {len(stations)} stations")
print("Sample stations:", [entry['StationCode'] for entry in stations[:5]])

In [None]:
# Date dimension: supplies rail period, period week and fiscal year per day.
print("\nLoading date dimension data...")
date_query = f"""
SELECT 
    Date,
    RailPeriod,
    RailPeriodWeek,
    FiscalYear
FROM core_dimdate
WHERE Date >= '{CONFIG['START_DATE']}' 
    AND Date <= '{CONFIG['END_DATE']}'
ORDER BY Date
"""

try:
    df_dates = (
        spark.read
        .format("sqlserver")
        .option("query", date_query)
        .load()
    )
    dates_df = df_dates.toPandas()
    dates_df['Date'] = pd.to_datetime(dates_df['Date'])
except Exception as e:
    print(f"Note: Could not load from SQL. Generating sample date data. Error: {e}")
    # Synthesize one row per calendar day of the configured range. The
    # formulas below only produce placeholder values for the sample fallback.
    date_range = pd.date_range(start=CONFIG['START_DATE'], end=CONFIG['END_DATE'], freq='D')
    dates_df = pd.DataFrame({
        'Date': date_range,
        'RailPeriod': [
            (day.year - 2025) * 13 + ((day.month - 1) // 4) + 1
            for day in date_range
        ],
        'RailPeriodWeek': [
            ((day - date_range[0]).days // 7) + 1
            for day in date_range
        ],
        'FiscalYear': [
            day.year if day.month >= 4 else day.year - 1
            for day in date_range
        ],
    })

# Date -> {column: value} mapping consumed by get_period_info().
date_lookup = dates_df.set_index('Date').to_dict('index')
print(f"Loaded {len(dates_df)} dates")
print(f"Date range: {dates_df['Date'].min()} to {dates_df['Date'].max()}")

## Helper Functions

In [None]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def get_random_datetime(start_date: str, end_date: str) -> datetime:
    """Return a random datetime within the closed range [start_date, end_date].

    The offset is drawn in whole seconds over the full span, so the result can
    never fall after ``end_date``. (Drawing a day offset of up to the full span
    and then adding up to a further 86400 seconds could overshoot by a day.)

    Args:
        start_date: Lower bound, in any format ``pd.to_datetime`` accepts.
        end_date: Upper bound, in any format ``pd.to_datetime`` accepts.

    Returns:
        A pandas ``Timestamp`` (a ``datetime`` subclass) with
        ``start_date <= result <= end_date``.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.
    """
    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)
    span_seconds = int((end - start).total_seconds())
    # randint is inclusive on both ends, so both bounds are reachable.
    return start + timedelta(seconds=random.randint(0, span_seconds))

def get_duration_category(distribution: Dict[str, float]) -> str:
    """Pick one duration category at random, weighted by its share.

    Args:
        distribution: Maps category name to its relative weight.

    Returns:
        One of the keys of ``distribution``.
    """
    (chosen,) = random.choices(
        population=list(distribution),
        weights=list(distribution.values()),
        k=1,
    )
    return chosen

def get_duration_hours(category: str) -> int:
    """Draw a task duration, in whole hours, for the given category.

    Args:
        category: One of 'short', 'medium', 'long' or 'very_long'.

    Returns:
        A random integer within the inclusive (min, max) range that CONFIG
        defines for the category.

    Raises:
        KeyError: If ``category`` is not one of the four known names.
    """
    category_names = ('short', 'medium', 'long', 'very_long')
    config_keys = (
        'SHORT_DURATION_RANGE',
        'MEDIUM_DURATION_RANGE',
        'LONG_DURATION_RANGE',
        'VERY_LONG_DURATION_RANGE',
    )
    bounds_by_category = dict(zip(category_names, (CONFIG[key] for key in config_keys)))
    lower, upper = bounds_by_category[category]
    return random.randint(lower, upper)

def get_period_info(date: datetime) -> Tuple[str, int, int]:
    """Resolve period label, period week and period year for a date.

    The time of day is discarded and the resulting midnight timestamp is
    looked up in ``date_lookup``. When that day is absent, the values are
    approximated from the date itself.

    Args:
        date: The moment to classify.

    Returns:
        ``(period, period_week, period_year)`` where ``period`` is formatted
        as ``"P"`` plus a zero-padded two-digit number.
    """
    day_text = pd.to_datetime(date).strftime('%Y-%m-%d')
    lookup_entry = date_lookup.get(pd.to_datetime(day_text))

    if lookup_entry is None:
        # Day not covered by the date dimension: derive rough values instead.
        month_bucket = ((date.month - 1) // 4) + 1
        weeks_elapsed = (date - pd.to_datetime(CONFIG['START_DATE'])).days // 7
        fiscal_year = date.year - 1 if date.month < 4 else date.year
        return f"P{month_bucket:02d}", weeks_elapsed + 1, fiscal_year

    return (
        f"P{lookup_entry.get('RailPeriod', 1):02d}",
        lookup_entry.get('RailPeriodWeek', 1),
        lookup_entry.get('FiscalYear', date.year),
    )

def generate_task_id() -> str:
    """Build a random task ID of the form ``TSK-<6 digits>-<4 digits>``.

    The two parts are drawn independently; nothing here checks the result
    against previously issued IDs.
    """
    major_part = random.randint(100000, 999999)
    minor_part = random.randint(1000, 9999)
    return f"TSK-{major_part}-{minor_part}"

# Record IDs handed out so far; used to reject repeats.
_ISSUED_RECORD_IDS = set()
# Number of distinct values in the 7-digit range 1000000..9999999.
_RECORD_ID_SPACE = 9999999 - 1000000 + 1


def generate_record_id() -> str:
    """Return a record ID of the form ``REC-<7 digits>`` not issued before.

    Random draws alone are not unique: with about 15,000 records taken from
    9,000,000 possible values, several collisions per run are expected. Each
    candidate is therefore checked against the IDs already handed out and
    redrawn on a clash.

    Returns:
        A new ID that differs from every ID returned since this definition
        was last executed.

    Raises:
        RuntimeError: If every possible ID has already been issued.
    """
    if len(_ISSUED_RECORD_IDS) >= _RECORD_ID_SPACE:
        raise RuntimeError("All REC- record IDs have been issued")
    while True:
        candidate = f"REC-{random.randint(1000000, 9999999)}"
        if candidate not in _ISSUED_RECORD_IDS:
            _ISSUED_RECORD_IDS.add(candidate)
            return candidate

def get_financial_year_dates(year: int) -> Tuple[datetime, datetime]:
    """Return the first and last day of the financial year ending in ``year``.

    The month and day of the year end come from CONFIG['FINANCIAL_YEAR_END']
    ("MM-DD"). The financial year starts the day after the previous year end.

    Args:
        year: Calendar year in which the financial year ends.

    Returns:
        ``(fy_start, fy_end)`` as midnight datetimes.
    """
    month_text, day_text = CONFIG['FINANCIAL_YEAR_END'].split('-')
    end_month, end_day = int(month_text), int(day_text)

    previous_year_end = datetime(year - 1, end_month, end_day)
    this_year_end = datetime(year, end_month, end_day)
    return previous_year_end + timedelta(days=1), this_year_end

def should_span_financial_year(kpi_code: str, generated_fy_tasks: set) -> bool:
    """Tell whether ``kpi_code`` still needs a financial-year-spanning task.

    Args:
        kpi_code: KPI code being generated.
        generated_fy_tasks: KPI codes that already have such a task.

    Returns:
        True only when CONFIG['ENSURE_FY_SPANNING_TASKS'] is enabled and the
        code is not yet in ``generated_fy_tasks``.
    """
    feature_enabled = bool(CONFIG['ENSURE_FY_SPANNING_TASKS'])
    return feature_enabled and kpi_code not in generated_fy_tasks

print("Helper functions loaded successfully!")

## Task Generation Logic

In [None]:
# ============================================================================
# TASK GENERATION LOGIC
# ============================================================================

def create_task(kpi_info: Dict, station: Dict,
                duration_category: Optional[str] = None,
                force_fy_span: bool = False,
                force_period_cross: bool = False,
                reference_task: Optional[Dict] = None) -> Dict:
    """
    Create a single task record.

    The record always satisfies ``LoggedOn < Finished``: when the drawn
    duration would run past CONFIG['END_DATE'], the completion time is pulled
    back into range but never before one hour after the task was logged, and
    the duration quoted in ``LongDescription`` is recomputed to match.

    Args:
        kpi_info: KPI code information; 'KPIDescription' and 'KPICategory'
            are read.
        station: Station information; 'StationCode' and 'StationName' are
            read, 'Section' is optional.
        duration_category: Force a specific duration category. When None, one
            is drawn from CONFIG['DURATION_DISTRIBUTION'].
        force_fy_span: Force the task to start 30-90 days before a financial
            year end and finish at least 30 days after it. The year end used
            is the first one that lies at least 90 days after
            CONFIG['START_DATE'].
        force_period_cross: Accepted for interface compatibility. It has no
            effect, because real period boundaries are not available here.
        reference_task: An existing task. When given, the new task is logged
            within +/- CONFIG['DUPLICATE_TIME_WINDOW_HOURS'] of its
            'LoggedOn'. Takes precedence over ``force_fy_span``.

    Returns:
        A dict with the 29 fields of one task row, in table column order.
    """
    # Task identifiers
    task_id = generate_task_id()
    record_id = generate_record_id()

    # Duration
    if duration_category is None:
        duration_category = get_duration_category(CONFIG['DURATION_DISTRIBUTION'])
    duration_hours = get_duration_hours(duration_category)

    range_start = pd.to_datetime(CONFIG['START_DATE'])
    end_date = pd.to_datetime(CONFIG['END_DATE'])

    # Logged-on moment
    if reference_task:
        # Overlapping task for duplicate testing: stay close to the reference.
        window_hours = CONFIG['DUPLICATE_TIME_WINDOW_HOURS']
        time_offset = random.randint(-window_hours, window_hours)
        logged_on = reference_task['LoggedOn'] + timedelta(hours=time_offset)
    elif force_fy_span:
        # Use the first financial year end that leaves room for the earliest
        # possible start (90 days before it) inside the configured range.
        fy_year = range_start.year
        _, fy_end = get_financial_year_dates(fy_year)
        while fy_end - timedelta(days=90) < range_start:
            fy_year += 1
            _, fy_end = get_financial_year_dates(fy_year)
        days_before = random.randint(30, 90)
        logged_on = fy_end - timedelta(days=days_before)
        # Long enough to end at least 30 days after the year end.
        duration_hours = max(duration_hours, (days_before + 30) * 24)
    else:
        logged_on = get_random_datetime(CONFIG['START_DATE'], CONFIG['END_DATE'])

    # Dates derived from the logged-on moment
    reported_date = logged_on - timedelta(hours=random.randint(1, 48))
    scheduled_for = logged_on + timedelta(hours=random.randint(1, 24))
    finished = logged_on + timedelta(hours=duration_hours)

    # Keep the completion time inside the range without letting it land
    # before the task was logged.
    if finished > end_date:
        capped = end_date - timedelta(hours=random.randint(1, 168))
        finished = max(capped, logged_on + timedelta(hours=1))
        # Keep the quoted duration consistent with the adjusted dates.
        duration_hours = max(1, int((finished - logged_on).total_seconds() // 3600))

    due_by = scheduled_for + timedelta(hours=random.randint(24, 168))
    modified_on = finished + timedelta(hours=random.randint(1, 4))

    # Period information is based on the logged date.
    period, period_week, period_year = get_period_info(logged_on)

    # NOTE: force_period_cross is intentionally not acted on; honouring it
    # needs the real period boundaries.

    # Sample values for the descriptive fields
    reporters = ['John Smith', 'Jane Doe', 'Bob Wilson', 'Alice Brown', 'Charlie Davis']
    reporter = random.choice(reporters)
    reporter_email = f"{reporter.lower().replace(' ', '.')}@rail.com"

    instruction_codes = ['MAINT', 'CLEAN', 'REPAIR', 'INSPECT', 'REPLACE']

    # Key order matters: the SQL export inserts the values positionally.
    task = {
        'TaskId': task_id,
        'RecordID': record_id,
        'Instruction_Code': random.choice(instruction_codes),
        'Building': station['StationCode'],
        'BuildingName': station['StationName'],
        'LocationName': f"{station['StationName']} - {random.choice(['Platform', 'Concourse', 'Entrance', 'Office'])}",
        'ShortDescription': f"{kpi_info['KPIDescription']} at {station['StationName']}",
        'LongDescription': f"Complete {kpi_info['KPIDescription'].lower()} work at {station['StationName']}. Duration: {duration_hours} hours.",
        'Reporter': reporter,
        'ReporterEmail': reporter_email,
        'Notes': f"Task completed. Duration category: {duration_category}",
        'ReportedDate': reported_date,
        'DueBy': due_by,
        'ScheduledFor': scheduled_for,
        'Finished': finished,
        'Status': 'COMP',  # All tasks are completed
        'LoggedBy': f"System-{random.randint(1, 10)}",
        'LoggedOn': logged_on,
        'ModifiedOn': modified_on,
        'SLAStatus': random.choice(['Met', 'Met', 'Met', 'Missed']),  # 75% met
        'CreatedTimestamp': logged_on,
        'LastUploaded': modified_on + timedelta(hours=1),
        'IsCurrent': 1,  # bit field
        'Period': period,
        'PeriodWeek': period_week,
        'PeriodYear': period_year,
        'StationSection': station.get('Section', 'Unknown'),
        'KPIDescription': kpi_info['KPIDescription'],
        'KPICategory': kpi_info['KPICategory']
    }

    return task

print("Task generation function loaded successfully!")

## Generate Test Data

In [None]:
# ============================================================================
# GENERATE ALL TASKS
# ============================================================================

print("Starting test data generation...\n")
print("=" * 80)

# Calculate distribution
total_records = CONFIG['TOTAL_RECORDS']
num_kpi_codes = len(kpi_codes)

# Fail early with a clear message; otherwise the rounding loop or
# random.choice below fails with an unrelated-looking error.
if num_kpi_codes == 0:
    raise ValueError(
        "No KPI codes available for generation; check "
        "CONFIG['SELECTED_KPI_CODES'] and the KPI reference data."
    )
if not stations:
    raise ValueError(
        "No stations available for generation; check the station reference data."
    )

# Apply frequency weights if configured
if CONFIG['KPI_FREQUENCY_WEIGHTS']:
    weights = [CONFIG['KPI_FREQUENCY_WEIGHTS'].get(k['KPICode'], 1.0) for k in kpi_codes]
else:
    weights = [1.0] * num_kpi_codes

# Normalize weights
total_weight = sum(weights)
if total_weight <= 0:
    raise ValueError(
        "CONFIG['KPI_FREQUENCY_WEIGHTS'] must give the selected KPI codes a "
        "positive total weight."
    )
normalized_weights = [w / total_weight for w in weights]

# Calculate tasks per KPI code (rounded down)
tasks_per_kpi = [int(total_records * w) for w in normalized_weights]

# Hand the records lost to rounding to randomly chosen KPI codes
while sum(tasks_per_kpi) < total_records:
    tasks_per_kpi[random.randint(0, len(tasks_per_kpi) - 1)] += 1

print(f"Generating {total_records} tasks across {num_kpi_codes} KPI codes")
print(f"Average tasks per KPI code: {total_records / num_kpi_codes:.0f}")
print(f"Stations available: {len(stations)}")
print("=" * 80)

# Track generated tasks
all_tasks = []
fy_spanning_tasks = set()  # Track which KPI codes have FY-spanning tasks
tasks_by_station = {}  # For duplicate testing

# Loop-invariant lookups, built once. On a repeated station code the first
# entry wins.
duration_categories = list(CONFIG['DURATION_DISTRIBUTION'].keys())
station_by_code = {}
for station_entry in stations:
    station_by_code.setdefault(station_entry['StationCode'], station_entry)

# Generate tasks for each KPI code
for kpi_idx, kpi_info in enumerate(kpi_codes):
    kpi_code = kpi_info['KPICode']
    num_tasks = tasks_per_kpi[kpi_idx]
    count_before = len(all_tasks)

    print(f"\nGenerating {num_tasks} tasks for {kpi_code}...")

    tasks_generated = 0

    # 1. Generate FY-spanning task (if configured)
    if CONFIG['ENSURE_FY_SPANNING_TASKS'] and kpi_code not in fy_spanning_tasks:
        station = random.choice(stations)
        task = create_task(kpi_info, station, force_fy_span=True)
        all_tasks.append(task)
        fy_spanning_tasks.add(kpi_code)
        tasks_generated += 1

    # 2. Generate at least one task of each duration category
    for duration_cat in duration_categories:
        if tasks_generated >= num_tasks:
            break
        station = random.choice(stations)
        task = create_task(kpi_info, station, duration_category=duration_cat)
        all_tasks.append(task)
        tasks_generated += 1

    # 3. Generate remaining tasks with random distribution
    remaining_tasks = num_tasks - tasks_generated

    # Calculate how many should be duplicates
    num_duplicates = int(remaining_tasks * CONFIG['DUPLICATE_TEST_PERCENTAGE'])
    num_regular = remaining_tasks - num_duplicates

    # Generate regular tasks
    for _ in range(num_regular):
        station = random.choice(stations)
        task = create_task(kpi_info, station)
        all_tasks.append(task)

        # Store for potential duplicate creation
        tasks_by_station.setdefault(station['StationCode'], []).append(task)

    # Generate duplicate test tasks (overlapping times at same station).
    # tasks_by_station does not change inside this loop, so the candidate
    # stations are collected once.
    available_stations = [code for code, logged in tasks_by_station.items() if logged]
    for _ in range(num_duplicates):
        if available_stations:
            station_code = random.choice(available_stations)
            reference_task = random.choice(tasks_by_station[station_code])
            station = station_by_code[station_code]
            task = create_task(kpi_info, station, reference_task=reference_task)
            all_tasks.append(task)
        else:
            # Fallback to regular task
            station = random.choice(stations)
            task = create_task(kpi_info, station)
            all_tasks.append(task)

    # Report what was really produced. The mandatory FY-spanning task can
    # push the count above the planned share when that share is zero.
    actual_count = len(all_tasks) - count_before
    print(f"  ✓ Generated {actual_count} tasks for {kpi_code}")
    print(f"    - FY spanning: {'Yes' if kpi_code in fy_spanning_tasks else 'No'}")
    print(f"    - Duplicate test tasks: {num_duplicates}")

print("\n" + "=" * 80)
print(f"Total tasks generated: {len(all_tasks)}")
print(f"FY-spanning tasks: {len(fy_spanning_tasks)}")
print("=" * 80)

## Create DataFrame and Validate

In [None]:
# ============================================================================
# CREATE DATAFRAME AND VALIDATE
# ============================================================================

print("Creating DataFrame...")
df = pd.DataFrame(all_tasks)

# Ensure correct data types.
# The loop variable must not be called `col`: that would overwrite the
# pyspark.sql.functions.col import for every later cell.
datetime_columns = ['ReportedDate', 'DueBy', 'ScheduledFor', 'Finished', 
                   'LoggedOn', 'ModifiedOn', 'CreatedTimestamp', 'LastUploaded']
for datetime_column in datetime_columns:
    df[datetime_column] = pd.to_datetime(df[datetime_column])

# Validation
print("\n" + "=" * 80)
print("DATA VALIDATION")
print("=" * 80)

print(f"\nTotal records: {len(df)}")
print(f"Date range: {df['LoggedOn'].min()} to {df['Finished'].max()}")
print(f"\nKPI codes distribution:")
print(df['KPIDescription'].value_counts())

print(f"\nStation distribution (top 10):")
print(df['Building'].value_counts().head(10))

print(f"\nPeriod distribution:")
print(df['Period'].value_counts().sort_index())

print(f"\nStatus check:")
print(df['Status'].value_counts())

# Task duration in days, from logged to finished
df['Duration_Days'] = (df['Finished'] - df['LoggedOn']).dt.total_seconds() / (24 * 3600)
print(f"\nDuration statistics:")
print(df['Duration_Days'].describe())

# Period crossings: the task finished in a different period than it was logged in
df['Finished_Period'] = df['Finished'].apply(lambda x: get_period_info(x)[0])
period_crossings = (df['Period'] != df['Finished_Period']).sum()
print(f"\nTasks crossing period boundaries: {period_crossings} ({period_crossings/len(df)*100:.1f}%)")

# Potential duplicates: several tasks at one station within the same clock hour.
# 'h' is the current hourly alias; the upper-case 'H' spelling is deprecated.
df['LoggedOn_Hour'] = df['LoggedOn'].dt.floor('h')
duplicates = df.groupby(['Building', 'LoggedOn_Hour']).size()
duplicate_stations = (duplicates > 1).sum()
print(f"\nStation-time combinations with multiple tasks (for duplicate testing): {duplicate_stations}")

print("\n" + "=" * 80)
print("SAMPLE RECORDS")
print("=" * 80)
print(df.head(10).to_string())

# Drop the helper columns so the frame matches the target table again
df = df.drop(['Finished_Period', 'LoggedOn_Hour', 'Duration_Days'], axis=1, errors='ignore')

## Export to Lakehouse (Validation)

In [None]:
# ============================================================================
# LAKEHOUSE EXPORT (VALIDATION COPY)
# ============================================================================

if not CONFIG['OUTPUT_TO_LH']:
    # Lakehouse disabled: keep a local CSV so the data is not lost.
    print("\nLakehouse output disabled. Saving to CSV...")
    df.to_csv("potential_failures_test_data.csv", index=False)
    print("✓ Saved to potential_failures_test_data.csv")
else:
    print("\nWriting to Lakehouse for validation...")

    # The Delta writer needs a Spark DataFrame rather than a pandas one.
    spark_df = spark.createDataFrame(df)
    table_name = CONFIG['LH_TABLE_NAME']

    try:
        delta_writer = spark_df.write.mode("overwrite").format("delta")
        delta_writer.saveAsTable(table_name)
    except Exception as e:
        # A failed table write falls back to a CSV named after the table.
        print(f"✗ Error writing to Lakehouse: {e}")
        print("Saving to local CSV instead...")
        df.to_csv(f"{table_name}.csv", index=False)
        print(f"✓ Saved to {table_name}.csv")
    else:
        print(f"✓ Successfully wrote {len(df)} records to Lakehouse table: {table_name}")
        print("\nValidation query:")
        print(f"SELECT * FROM {table_name} LIMIT 100")

## Optional: Export to SQL Server

In [None]:
# ============================================================================
# EXPORT TO SQL SERVER (After Validation)
# ============================================================================

if CONFIG['OUTPUT_TO_SQL']:
    print("\n" + "=" * 80)
    print("EXPORTING TO SQL SERVER")
    print("=" * 80)
    
    # Held outside the try so the finally clause can close it on any outcome.
    conn = None
    try:
        # Create connection string
        conn_str = (
            f"DRIVER={{ODBC Driver 17 for SQL Server}};"
            f"SERVER={CONFIG['SQL_SERVER']};"
            f"DATABASE={CONFIG['SQL_DATABASE']};"
            f"UID={CONFIG['SQL_USERNAME']};"
            f"PWD={CONFIG['SQL_PASSWORD']}"
        )
        
        # Connect
        conn = pyodbc.connect(conn_str)
        cursor = conn.cursor()
        
        # Create table if not exists
        create_table_sql = f"""
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{CONFIG['SQL_TABLE_NAME']}')
        CREATE TABLE {CONFIG['SQL_TABLE_NAME']} (
            TaskId nvarchar(255),
            RecordID nvarchar(255),
            Instruction_Code nvarchar(50),
            Building nvarchar(100),
            BuildingName nvarchar(255),
            LocationName nvarchar(255),
            ShortDescription nvarchar(500),
            LongDescription nvarchar(max),
            Reporter nvarchar(255),
            ReporterEmail nvarchar(255),
            Notes nvarchar(max),
            ReportedDate datetime2,
            DueBy datetime2,
            ScheduledFor datetime2,
            Finished datetime2,
            Status nvarchar(50),
            LoggedBy nvarchar(255),
            LoggedOn datetime2,
            ModifiedOn datetime2,
            SLAStatus nvarchar(50),
            CreatedTimestamp datetime2,
            LastUploaded datetime2,
            IsCurrent bit,
            Period nvarchar(10),
            PeriodWeek bigint,
            PeriodYear bigint,
            StationSection nvarchar(100),
            KPIDescription nvarchar(255),
            KPICategory nvarchar(100)
        )
        """
        cursor.execute(create_table_sql)
        conn.commit()
        
        print(f"✓ Table {CONFIG['SQL_TABLE_NAME']} ready")
        
        # Insert data in batches
        batch_size = 1000
        total_batches = (len(df) + batch_size - 1) // batch_size
        
        print(f"\nInserting {len(df)} records in {total_batches} batches...")
        
        # The statement is identical for every batch, so build it once.
        # Values are bound positionally: one placeholder per table column,
        # in DataFrame column order.
        insert_sql = f"""
        INSERT INTO {CONFIG['SQL_TABLE_NAME']} 
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
        
        for i in range(0, len(df), batch_size):
            batch = df.iloc[i:i+batch_size]
            
            # Execute batch insert; each batch is committed on its own
            cursor.executemany(insert_sql, batch.values.tolist())
            conn.commit()
            
            if (i // batch_size + 1) % 10 == 0:
                print(f"  Batch {i // batch_size + 1}/{total_batches} completed")
        
        print(f"\n✓ Successfully inserted {len(df)} records into {CONFIG['SQL_TABLE_NAME']}")
        
        cursor.close()
        
    except Exception as e:
        print(f"✗ Error writing to SQL Server: {e}")
        print("Data is still available in Lakehouse for validation.")
    finally:
        # Release the connection on failure too; it used to stay open
        # whenever any step above raised.
        if conn is not None:
            conn.close()
else:
    print("\nSQL Server export disabled.")
    print("Set CONFIG['OUTPUT_TO_SQL'] = True to enable after validation.")

## Optional: Generate Status Progression Files

In [None]:
# ============================================================================
# OPTIONAL: STATUS PROGRESSION SNAPSHOTS
# ============================================================================

if not CONFIG['SIMULATE_STATUS_PROGRESSION']:
    print("\nStatus progression simulation disabled.")
    print("Set CONFIG['SIMULATE_STATUS_PROGRESSION'] = True to enable.")
else:
    print("\n" + "=" * 80)
    print("GENERATING STATUS PROGRESSION FILES")
    print("=" * 80)

    status_steps = CONFIG['STATUS_PROGRESSION_STEPS']

    # One CSV per status step: same tasks, status and timestamps adjusted.
    for snapshot_no, status in enumerate(status_steps, start=1):
        print(f"\nGenerating snapshot {snapshot_no}: {status} status")

        # Work on a copy: every task gets this step's status, and ModifiedOn
        # moves one further day after LoggedOn for each successive step.
        df_snapshot = df.assign(
            Status=status,
            ModifiedOn=df['LoggedOn'] + timedelta(days=snapshot_no - 1),
        )

        # Tasks that have not reached COMP have no completion time yet.
        if status != 'COMP':
            df_snapshot['Finished'] = pd.NaT

        filename = f"potential_failures_snapshot_{snapshot_no}_{status}.csv"
        df_snapshot.to_csv(filename, index=False)
        print(f"  ✓ Saved {len(df_snapshot)} records to {filename}")

    print(f"\n✓ Generated {len(status_steps)} status progression snapshots")
    print("These files can be used to test backload changes.")

## Summary and Next Steps

In [None]:
# ============================================================================
# RUN SUMMARY
# ============================================================================

banner = "=" * 80

print("\n" + banner)
print("GENERATION COMPLETE")
print(banner)

# Headline figures of the generated data set.
first_logged_day = df['LoggedOn'].min().date()
last_finished_day = df['Finished'].max().date()

print("\n📊 Summary:")
for summary_line in (
    f"  - Total records generated: {len(df):,}",
    f"  - KPI codes covered: {df['KPIDescription'].nunique()}",
    f"  - Stations covered: {df['Building'].nunique()}",
    f"  - Date range: {first_logged_day} to {last_finished_day}",
    f"  - Periods covered: {df['Period'].nunique()}",
    f"  - FY-spanning tasks: {len(fy_spanning_tasks)}",
):
    print(summary_line)

# What to do with the data next.
print("\n📝 Next Steps:")
for step_line in (
    f"  1. Validate data in Lakehouse table: {CONFIG['LH_TABLE_NAME']}",
    "  2. Check for data quality issues",
    "  3. Verify period crossings and FY spanning tasks",
    "  4. Test duplicate detection logic",
    "  5. Once validated, set CONFIG['OUTPUT_TO_SQL'] = True to export to SQL Server",
):
    print(step_line)

# Pointers to the settings most often changed between runs.
print("\n🔧 Configuration Tips:")
for tip_line in (
    "  - Adjust CONFIG['TOTAL_RECORDS'] to scale up/down",
    "  - Use CONFIG['SELECTED_KPI_CODES'] to focus on specific codes",
    "  - Modify CONFIG['KPI_FREQUENCY_WEIGHTS'] to adjust distribution",
    "  - Enable CONFIG['SIMULATE_STATUS_PROGRESSION'] for history testing",
):
    print(tip_line)

print("\n" + banner)
print("✨ Test data generation completed successfully! ✨")
print(banner)