In [5]:
import pandas as pd
import numpy as np
import os
import shutil
import time
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
import glob

# ==========================================
# 1. CONFIGURATION & SCHEMA DEFINITION
# ==========================================

# Input CSV files, one per dataset.
LOCATIONS_FILE = 'locations_dummy.csv'
ALLOCATIONS_FILE = 'allocations.csv'
PARTS_FILE = 'synthetic_parts_generated.csv'

# Strict schema: for each dataset, the source file and the columns that MUST
# be present before the full validation is allowed to run. A missing column
# is reported as a critical data error by the loader.
REQUIRED_SCHEMA = {
    'LOCATIONS': dict(
        file=LOCATIONS_FILE,
        columns=(
            ['loc_inst_code']                  # key
            + ['width', 'depth', 'height']     # bin dimensions
            + ['x', 'y', 'z']                  # coordinates
        ),
    ),
    'ALLOCATIONS': dict(
        file=ALLOCATIONS_FILE,
        columns=(
            ['LOCATION_ID', 'SKU']                              # keys
            + ['GRID_X', 'GRID_Y', 'GRID_Z']                    # stack counts
            + ['ORIENT_X_MM', 'ORIENT_Y_MM', 'ORIENT_Z_MM']     # oriented dims
            + ['INIT_UNITS']                                    # quantity
        ),
    ),
    'PARTS': dict(
        file=PARTS_FILE,
        columns=(
            ['ITEM_ID']                          # key
            + ['LEN_MM', 'WID_MM', 'DEP_MM']     # part dimensions
            + ['WT_KG']                          # weight
        ),
    ),
}

# Folder that receives every artefact produced by a run.
OUTPUT_DIR = 'validation_results'

# Time budget for a single check, in seconds (5 minutes).
MAX_EXECUTION_TIME_SEC = 5 * 60

# Accumulates every logged line so it can be written out as the report.
report_buffer = []

def log(message):
    """Echo a timestamped line to stdout and keep a copy for the saved report."""
    now_text = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{now_text}] {message}"
    # Console first, then the in-memory buffer that is later written to disk.
    print(line)
    report_buffer.append(line)

def setup_environment():
    """Recreate the output folder from scratch, discarding any previous results."""
    leftover_from_previous_run = os.path.exists(OUTPUT_DIR)
    if leftover_from_previous_run:
        # Remove the whole tree so stale result files never mix with new ones.
        shutil.rmtree(OUTPUT_DIR)

    os.makedirs(OUTPUT_DIR)
    log(f"Output folder '{OUTPUT_DIR}' ready.")

# ==========================================
# 2. DATA LOADING & STRICT SCHEMA CHECK
# ==========================================

def load_and_validate_dataset(key, config):
    """
    Load one CSV dataset as strings and verify it has every required column.

    Args:
        key: Dataset label used in log messages (e.g. 'LOCATIONS').
        config: Schema entry with 'file' (path of the CSV) and 'columns'
            (list of column names that must be present).

    Returns:
        A ``(df, is_valid)`` tuple:
          * ``(None, False)`` if the file is missing or cannot be read,
          * ``(df, False)`` if it was read but required columns are missing,
          * ``(df, True)`` if it was read and matches the schema.
    """
    filepath = config['file']
    required_cols = config['columns']

    log(f"--- Loading {key} ({filepath}) ---")

    # 1. Check File Existence
    if not os.path.exists(filepath):
        log(f"CRITICAL FAIL: File {filepath} not found.")
        # Debug helper: print what IS there (current working directory)
        log(f"       Info: Files in directory: {glob.glob('*.*')}")
        return None, False

    try:
        # 2. Read File (Auto-detect separator, but don't rename columns automatically).
        #    Everything is read as text; numeric conversion happens later.
        df = pd.read_csv(filepath, sep=None, engine='python', dtype=str)

        # 3. Clean Headers.
        #    A byte-order mark can reach the first header either as the real
        #    U+FEFF character (file decoded as UTF-8) or as the mojibake
        #    sequence 'ï»¿' (BOM bytes decoded as Latin-1/cp1252). Remove both
        #    forms. str.strip() does not treat U+FEFF as whitespace, so strip
        #    again after the BOM is gone to catch whitespace that followed it.
        df.columns = (
            df.columns.str.strip()
            .str.replace('^(?:\ufeff|ï»¿)', '', regex=True)
            .str.strip()
        )

        # 4. Strict Schema Validation
        missing_cols = [c for c in required_cols if c not in df.columns]

        if missing_cols:
            log(f"CRITICAL SCHEMA ERROR in {key}:")
            log(f"   Missing Required Columns: {missing_cols}")
            log(f"   Columns Found in file:    {list(df.columns)}")
            log(f"   -> Action: Rename columns in CSV to match expected schema.")
            return df, False

        log(f"SUCCESS: {key} loaded and matches schema ({len(df)} rows).")
        return df, True

    except Exception as e:
        # Top-level boundary for this dataset: report and let the caller
        # decide how to proceed instead of aborting the whole run.
        log(f"CRITICAL ERROR reading {filepath}: {e}")
        return None, False

def convert_numeric(df, cols):
    """
    Convert the named columns of ``df`` to numbers, in place.

    Columns listed in ``cols`` that are absent from ``df`` are ignored.
    Values that cannot be parsed (and missing values) become 0.
    Returns the same DataFrame object that was passed in.
    """
    present = [name for name in cols if name in df.columns]
    for name in present:
        parsed = pd.to_numeric(df[name], errors='coerce')
        df[name] = parsed.fillna(0)
    return df

def load_all_data():
    """
    Prepare the output folder, then load and schema-check every dataset.

    Datasets are processed in the order LOCATIONS, ALLOCATIONS, PARTS. A
    dataset that passes the schema check has its measurement columns
    converted to numbers; one that fails is stored exactly as the loader
    returned it (possibly None).

    Returns:
        (datasets, valid_flags): two dicts keyed by dataset name, holding
        the DataFrame (or None) and the schema-check result respectively.
    """
    setup_environment()

    # Columns to convert to numeric for each dataset, in load order.
    numeric_columns = {
        'LOCATIONS': ['width', 'depth', 'height', 'x', 'y', 'z'],
        'ALLOCATIONS': [
            'GRID_X', 'GRID_Y', 'GRID_Z',
            'ORIENT_X_MM', 'ORIENT_Y_MM', 'ORIENT_Z_MM',
            'INIT_UNITS',
        ],
        'PARTS': ['LEN_MM', 'WID_MM', 'DEP_MM', 'WT_KG'],
    }

    datasets, valid_flags = {}, {}
    for name, cols in numeric_columns.items():
        frame, ok = load_and_validate_dataset(name, REQUIRED_SCHEMA[name])
        datasets[name] = convert_numeric(frame, cols) if ok else frame
        valid_flags[name] = ok

    return datasets, valid_flags

# ==========================================
# 3. VALIDATION LOGIC
# ==========================================

def estimate_and_sample(df, check_name, validation_func, *args):
    """
    Run a validation with a time budget, falling back to a random sample.

    The validation is first timed on the leading rows (up to 1000) to
    extrapolate the cost of a full run. If the estimate exceeds
    MAX_EXECUTION_TIME_SEC, a reproducible random sample sized to fit the
    budget is validated instead of the whole frame.

    Args:
        df: DataFrame to validate.
        check_name: Human-readable name used in log messages.
        validation_func: Callable invoked as
            ``validation_func(frame, *args, quiet=<bool>)``.
        *args: Extra positional arguments forwarded to ``validation_func``.

    Returns:
        Whatever ``validation_func`` returns for the processed rows, or an
        empty DataFrame when ``df`` is empty or the timing run raised.
        NOTE: an empty DataFrame is also what a clean run looks like, so a
        failed timing run is only distinguishable through the logged ERROR.
    """
    log(f"Starting Check: {check_name}...")

    total_rows = len(df)
    test_size = min(1000, total_rows)

    if test_size == 0:
        return pd.DataFrame()

    # Timing run on the leading rows. perf_counter is monotonic and
    # high-resolution, unlike the wall clock, so short runs measure reliably.
    start_time = time.perf_counter()
    try:
        _ = validation_func(df.head(test_size), *args, quiet=True)
    except Exception as e:
        log(f"ERROR executing logic for {check_name}: {e}")
        return pd.DataFrame()  # Stop this specific check

    duration = time.perf_counter() - start_time
    if duration <= 0:
        # Guard against division by zero on a too-coarse clock reading.
        duration = 0.001

    estimated_total_time = (duration / test_size) * total_rows
    log(f"   Estimated time: {estimated_total_time:.2f}s")

    if estimated_total_time > MAX_EXECUTION_TIME_SEC:
        safe_rows = int((MAX_EXECUTION_TIME_SEC / duration) * test_size)
        # Never sample zero rows (that would look like a clean pass) and
        # never ask for more rows than exist (DataFrame.sample would raise).
        safe_rows = max(1, min(safe_rows, total_rows))
        log(f"   WARNING: Time limit exceeded. Sampling {safe_rows} rows.")
        df_to_process = df.sample(n=safe_rows, random_state=42)
    else:
        df_to_process = df

    return validation_func(df_to_process, *args, quiet=False)

def check_referential_integrity(datasets):
    """
    Verify that every allocation refers to a known location and a known part.

    For each relationship, allocation rows whose key is not found in the
    referenced dataset are logged as a FAIL and written to a CSV in the
    output folder; otherwise a PASS is logged. A relationship is skipped
    when either of its datasets is None.
    """
    log("--- Checking Referential Integrity ---")

    def report_orphans(child, child_col, parent, parent_col, target, csv_name, parent_label):
        # Nothing to compare if either side failed to load.
        if child is None or parent is None:
            return
        known = child[child_col].isin(parent[parent_col])
        orphans = child[~known]
        if len(orphans) > 0:
            log(f"FAIL: {len(orphans)} Allocations point to unknown {target}.")
            orphans.to_csv(f"{OUTPUT_DIR}/{csv_name}", index=False)
        else:
            log(f"PASS: All Allocations match valid {parent_label}.")

    # Allocations -> Locations
    allocations = datasets['ALLOCATIONS']
    locations = datasets['LOCATIONS']
    report_orphans(
        allocations, 'LOCATION_ID', locations, 'loc_inst_code',
        'Locations', 'integrity_fail_alloc_loc.csv', 'Locations',
    )

    # Allocations -> Parts
    parts = datasets['PARTS']
    report_orphans(
        allocations, 'SKU', parts, 'ITEM_ID',
        'SKUs', 'integrity_fail_alloc_sku.csv', 'Parts',
    )

def func_geometric_fit(df_alloc, df_loc, quiet=False):
    """
    Strict fit check: does the stacked footprint (grid count x oriented
    dimension) stay inside the location on every axis?

    Allocations are left-joined to locations on LOCATION_ID == loc_inst_code.
    Rows with no matching location (width is NaN after the join) are skipped.
    An allocation is reported when any axis exceeds the location's
    dimension by more than 1 mm.

    Args:
        df_alloc: Allocations frame.
        df_loc: Locations frame.
        quiet: When False, a tqdm progress bar wraps the row iteration.

    Returns:
        DataFrame with one row per failing allocation (columns LOCATION_ID,
        SKU, Issue, Loc_Dims, Stack_Dims); empty when everything fits.
    """
    tolerance = 1.0  # 1mm tolerance

    joined = df_alloc.merge(
        df_loc, left_on='LOCATION_ID', right_on='loc_inst_code', how='left'
    )

    if quiet:
        rows = joined.iterrows()
    else:
        rows = tqdm(joined.iterrows(), total=joined.shape[0])

    problems = []
    for _, rec in rows:
        # No location matched this allocation, so there is nothing to compare.
        if pd.isna(rec['width']):
            continue

        # Strict Logic: We rely on the GRID and ORIENT columns.
        stack_x = rec['GRID_X'] * rec['ORIENT_X_MM']
        stack_y = rec['GRID_Y'] * rec['ORIENT_Y_MM']
        stack_z = rec['GRID_Z'] * rec['ORIENT_Z_MM']

        overflow = (
            stack_x > (rec['width'] + tolerance),
            stack_y > (rec['depth'] + tolerance),
            stack_z > (rec['height'] + tolerance),
        )
        if not any(overflow):
            continue

        problems.append({
            'LOCATION_ID': rec['LOCATION_ID'],
            'SKU': rec['SKU'],
            'Issue': 'Does Not Fit',
            'Loc_Dims': f"{rec['width']}x{rec['depth']}x{rec['height']}",
            'Stack_Dims': f"{stack_x:.1f}x{stack_y:.1f}x{stack_z:.1f}"
        })

    return pd.DataFrame(problems)

def check_unallocated(datasets):
    """
    Report parts that have no allocation and probe whether they could fit.

    Parts whose ITEM_ID never appears as an allocation SKU are counted as
    unallocated. A sample of up to 20 of them (the first ones in file order,
    so the sample is the same on every run) is tested against the locations
    that hold no allocation. The result is logged; nothing is returned.

    The fit test is a rough heuristic, not a packing check: a part "fits" a
    bin when the bin's volume is at least the part's volume AND the bin's
    largest dimension is at least the part's largest dimension.

    Args:
        datasets: Dict holding the 'PARTS', 'ALLOCATIONS' and 'LOCATIONS'
            DataFrames with numeric dimension columns.
    """
    log("--- Checking Unallocated Items ---")
    df_parts = datasets['PARTS']
    df_alloc = datasets['ALLOCATIONS']
    df_loc = datasets['LOCATIONS']

    sample_size = 20  # Upper bound on how many unallocated parts are probed

    # Identify Unallocated
    unallocated = set(df_parts['ITEM_ID']) - set(df_alloc['SKU'])

    if not unallocated:
        log("PASS: All items are allocated.")
        return

    log(f"INFO: {len(unallocated)} items are NOT allocated. Checking if they physically fit in empty bins...")

    # Identify Empty Bins
    occupied_locs = set(df_alloc['LOCATION_ID'])
    empty_locs = df_loc[~df_loc['loc_inst_code'].isin(occupied_locs)]

    if empty_locs.empty:
        log("FAIL: Items unallocated, but no empty bins exist.")
        return

    # Bin metrics do not depend on the part, so compute them once up front.
    bin_vol = empty_locs['width'] * empty_locs['depth'] * empty_locs['height']
    bin_max_dim = empty_locs[['width', 'depth', 'height']].max(axis=1)

    # Take the sample in file order; slicing a set would pick different
    # parts on each run because set iteration order is not stable.
    sample_unalloc = df_parts[df_parts['ITEM_ID'].isin(unallocated)].head(sample_size)

    # Feasibility Check
    fits_found = 0
    for _, part in sample_unalloc.iterrows():
        p_vol = part['LEN_MM'] * part['WID_MM'] * part['DEP_MM']
        p_max = max(part['LEN_MM'], part['WID_MM'], part['DEP_MM'])

        # Vectorized check against all empty bins:
        # 1. Volume fits?
        # 2. Max dimension fits? (Rough heuristic for "can it get inside")
        if ((bin_vol >= p_vol) & (bin_max_dim >= p_max)).any():
            fits_found += 1

    if fits_found > 0:
        # Report the real sample size; it is below 20 when few parts are unallocated.
        log(f"WARN: Unallocated items exist, and {fits_found} of sampled items ({len(sample_unalloc)}) physically fit in available empty bins.")
        log("      -> This suggests logic errors in the allocation algorithm (why weren't these placed?).")
    else:
        log("INFO: Unallocated items exist, but they appear too large for the available empty bins.")

# ==========================================
# 4. MAIN RUNNER
# ==========================================

def run_full_diagnostic():
    """
    Run the complete validation pipeline and write the report.

    Steps: load all datasets and check their schemas; stop at the first
    dataset with a schema error; otherwise run the duplicate-key checks,
    referential integrity, geometric fit (saving a CSV and a bar chart of
    any failures) and the unallocated-items check. The accumulated log is
    written to ``validation_report.txt`` in the output folder on every exit
    path, including the early stop on schema errors.
    """
    def save_report():
        # Explicit encoding so non-ASCII column names or messages are
        # written identically regardless of the platform default.
        with open(f"{OUTPUT_DIR}/validation_report.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(report_buffer))

    datasets, valid_flags = load_all_data()

    # If any dataset failed SCHEMA validation, we pause and report.
    # We do NOT proceed to logic that requires those columns.
    for key, label in [('LOCATIONS', 'Locations'), ('ALLOCATIONS', 'Allocations'), ('PARTS', 'Parts')]:
        if not valid_flags[key]:
            log(f"STOPPING: {label} dataset has schema errors (see above). Fix columns to proceed.")
            # Persist the diagnostics gathered so far; they explain the stop.
            save_report()
            return

    log("\nAll Schemas Valid. Proceeding to Logic Checks...\n")

    # 1. Duplicates
    log("--- Duplicate Checks ---")
    for key, col in [('LOCATIONS', 'loc_inst_code'), ('ALLOCATIONS', 'LOCATION_ID'), ('PARTS', 'ITEM_ID')]:
        frame = datasets[key]
        dupes = frame[frame.duplicated(subset=col, keep=False)]
        if len(dupes) > 0:
            log(f"FAIL: {len(dupes)} duplicates in {key} (Column: {col}).")
        else:
            log(f"PASS: No duplicates in {key}.")

    # 2. Integrity
    check_referential_integrity(datasets)

    # 3. Geometric Fit (The "Real" Check)
    log("--- Geometric Fit Analysis ---")
    fit_issues = estimate_and_sample(
        datasets['ALLOCATIONS'],
        "Geometric Fit",
        func_geometric_fit,
        datasets['LOCATIONS']
    )

    if not fit_issues.empty:
        log(f"FAIL: Found {len(fit_issues)} allocations that do not physically fit.")
        fit_issues.to_csv(f"{OUTPUT_DIR}/fit_issues.csv", index=False)
        # The chart is best-effort: a plotting failure must not abort the
        # run, but it is logged rather than silently discarded.
        try:
            fig = plt.figure()
            try:
                fit_issues['Issue'].value_counts().plot(kind='bar')
                plt.title("Fit Issues")
                plt.savefig(f"{OUTPUT_DIR}/fit_chart.png")
            finally:
                # Release the figure so repeated runs do not accumulate them.
                plt.close(fig)
        except Exception as e:
            log(f"WARN: Could not render fit chart: {e}")
    else:
        log("PASS: All checked allocations fit geometrically.")

    # 4. Unallocated Logic
    check_unallocated(datasets)

    # Save Log
    save_report()
    log(f"\nValidation finished. Report saved to {OUTPUT_DIR}/validation_report.txt")

# Entry point: run the full diagnostic when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    run_full_diagnostic()

[2025-12-28 14:06:51] Output folder 'validation_results' ready.
[2025-12-28 14:06:51] --- Loading LOCATIONS (locations_dummy.csv) ---
[2025-12-28 14:06:51] SUCCESS: LOCATIONS loaded and matches schema (357 rows).
[2025-12-28 14:06:51] --- Loading ALLOCATIONS (allocations.csv) ---
[2025-12-28 14:06:51] SUCCESS: ALLOCATIONS loaded and matches schema (10 rows).
[2025-12-28 14:06:51] --- Loading PARTS (synthetic_parts_generated.csv) ---
[2025-12-28 14:06:51] SUCCESS: PARTS loaded and matches schema (75 rows).
[2025-12-28 14:06:51] 
All Schemas Valid. Proceeding to Logic Checks...

[2025-12-28 14:06:51] --- Duplicate Checks ---
[2025-12-28 14:06:51] PASS: No duplicates in LOCATIONS.
[2025-12-28 14:06:51] PASS: No duplicates in ALLOCATIONS.
[2025-12-28 14:06:51] PASS: No duplicates in PARTS.
[2025-12-28 14:06:51] --- Checking Referential Integrity ---
[2025-12-28 14:06:51] PASS: All Allocations match valid Locations.
[2025-12-28 14:06:51] FAIL: 10 Allocations point to unknown SKUs.
[2025-12-

100%|████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 6707.67it/s]

[2025-12-28 14:06:51] PASS: All checked allocations fit geometrically.
[2025-12-28 14:06:51] --- Checking Unallocated Items ---
[2025-12-28 14:06:51] INFO: 75 items are NOT allocated. Checking if they physically fit in empty bins...
[2025-12-28 14:06:51] WARN: Unallocated items exist, and 20 of sampled items (20) physically fit in available empty bins.
[2025-12-28 14:06:51]       -> This suggests logic errors in the allocation algorithm (why weren't these placed?).
[2025-12-28 14:06:51] 
Validation finished. Report saved to validation_results/validation_report.txt



