In [1]:
# =============================================================================
# CELL 1: Configuration and Setup
# =============================================================================
import os
import time
from datetime import datetime

try:
    import duckdb
    print(f"✓ DuckDB version: {duckdb.__version__}")
except ImportError:
    print("Installing DuckDB...")
    !pip install duckdb
    import duckdb
    print(f"✓ DuckDB installed: {duckdb.__version__}")

# Input: merged country files from country_merge_all_production
INPUT_DIR = r"D:\ECB_ESMA_BY_COUNTRY_ALL"

# Output: sorted country files
OUTPUT_DIR = r"D:\ECB_ESMA_BY_COUNTRY_SORTED"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Sort columns - Preserves time series for each loan-collateral pair
# Sort order: Loan → Collateral → Date
# NOTE: Using RREL6 (unified date column) instead of Pool_Cutoff_Date
#       - RREL6 has data from BOTH ECB (AR1->RREL6) and ESMA sources
#       - Pool_Cutoff_Date only exists in ESMA source files (empty for ECB data)
# NOTE: RREC9 (Property Type) is NOT included - it's a time-varying attribute,
#       not part of the key. (RREL3, RREC3, RREL6) should be unique.
SORT_COLUMNS = [
    "RREL3",           # 1. Loan Identifier (individual loan)
    "RREC3",           # 2. Collateral Identifier (keeps each collateral's time series together)
    "RREL6",           # 3. Data Cut-Off Date (chronological within each loan-collateral pair)
]
# NOTE: RREL1 is Pool Identifier (securitization deal), NOT individual loan

print(f"Input directory: {INPUT_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Sort by: {', '.join(SORT_COLUMNS)}")

✓ DuckDB version: 1.4.3
Input directory: D:\ECB_ESMA_BY_COUNTRY_ALL
Output directory: D:\ECB_ESMA_BY_COUNTRY_SORTED
Sort by: RREL3, RREC3, RREL6


In [2]:
# =============================================================================
# CELL 2: Discover Country Files to Sort
# =============================================================================
country_files = []

if os.path.exists(INPUT_DIR):
    for fname in os.listdir(INPUT_DIR):
        if fname.endswith('.csv') and not fname.endswith('.tmp'):
            filepath = os.path.join(INPUT_DIR, fname)
            size_gb = os.path.getsize(filepath) / (1024**3)
            country_code = fname.replace('.csv', '')
            country_files.append({
                'country': country_code,
                'filename': fname,
                'filepath': filepath,
                'size_gb': size_gb
            })

# Sort by size (smallest first for testing)
country_files.sort(key=lambda x: x['size_gb'])

print(f"Found {len(country_files)} country files to sort:")
print()
total_size = 0
for f in country_files:
    print(f"  {f['country']}: {f['size_gb']:.2f} GB")
    total_size += f['size_gb']
print(f"\nTotal size: {total_size:.2f} GB")

Found 10 country files to sort:

  UNKNOWN: 0.15 GB
  PT: 0.85 GB
  IE: 4.31 GB
  IT: 9.24 GB
  ES: 12.19 GB
  NL: 45.37 GB
  BE: 53.66 GB
  DE: 53.73 GB
  UK: 56.84 GB
  FR: 91.68 GB

Total size: 328.03 GB


In [3]:
# =============================================================================
# CELL 3: Sort Function using DuckDB (all VARCHAR to avoid type issues)
# =============================================================================
def sort_country_file(input_path, output_path, sort_columns):
    """
    Sort a country CSV file by multiple columns for deterministic ordering.
    Sort order: RREL3 (Loan ID), RREC3 (Collateral ID), RREL6 (Date)
    Uses DuckDB with PERSISTENT DATABASE FILE for true disk-based sorting.
    All columns read as VARCHAR to avoid type inference issues.
    Auto-detects system resources for optimal performance.
    """
    import psutil
    start_time = time.time()
    
    # Use a persistent database file on D: drive for large file sorting
    db_path = 'D:/duckdb_temp/sort_temp.duckdb'
    temp_dir = 'D:/duckdb_temp'
    os.makedirs(temp_dir, exist_ok=True)
    
    # Remove old database file if exists
    if os.path.exists(db_path):
        os.remove(db_path)
    if os.path.exists(db_path + '.wal'):
        os.remove(db_path + '.wal')
    
    # Auto-detect system resources
    available_ram_gb = psutil.virtual_memory().available / (1024**3)
    cpu_count = psutil.cpu_count(logical=False) or 4
    
    # Use 70% of available RAM, between 8GB and 64GB
    memory_limit_gb = int(available_ram_gb * 0.7)
    memory_limit_gb = max(8, min(memory_limit_gb, 64))
    
    # Use half of physical cores
    threads = max(2, cpu_count // 2)
    
    try:
        # Create PERSISTENT connection - this enables true disk-based operations
        conn = duckdb.connect(db_path)
        
        # Configure for large file handling - optimized settings
        conn.execute("SET preserve_insertion_order = false")
        conn.execute(f"SET threads = {threads}")
        conn.execute(f"SET memory_limit = '{memory_limit_gb}GB'")
        conn.execute(f"SET temp_directory = '{temp_dir}'")
        # Don't limit temp directory size - let it use available disk
        
        # Read with ALL columns as VARCHAR to avoid type inference errors
        # First get column names (use quote='"' to handle fields with commas inside quotes)
        cols_query = f"SELECT * FROM read_csv_auto('{input_path}', sample_size=100, quote='\"') LIMIT 0"
        cols_result = conn.execute(cols_query).description
        available_cols = [col[0] for col in cols_result]
        
        # Build ORDER BY clause from provided sort columns
        order_cols = []
        for col in sort_columns:
            if col in available_cols:
                order_cols.append(col)
            else:
                print(f"  Warning: {col} not found in file, skipping")
        
        if not order_cols:
            raise ValueError("No sort columns found in file")
        
        order_clause = ", ".join(order_cols)
        
        # Count rows (with all_varchar=true and quote='"')
        count_query = f"SELECT COUNT(*) FROM read_csv('{input_path}', all_varchar=true, header=true, quote='\"')"
        row_count = conn.execute(count_query).fetchone()[0]
        print(f"  Row count: {row_count:,}")
        
        # Sort and export (with all_varchar=true and quote='"' to handle commas in quoted fields)
        temp_path = output_path + '.tmp'
        
        export_query = f"""
            COPY (
                SELECT * FROM read_csv('{input_path}', all_varchar=true, header=true, quote='"')
                ORDER BY {order_clause}
            ) TO '{temp_path}' (HEADER, DELIMITER ',', QUOTE '"')
        """
        print(f"  Sorting by {order_cols}...")
        conn.execute(export_query)
        conn.close()
        
        # Clean up database file
        if os.path.exists(db_path):
            os.remove(db_path)
        if os.path.exists(db_path + '.wal'):
            os.remove(db_path + '.wal')
        
        # Rename temp to final
        if os.path.exists(output_path):
            os.remove(output_path)
        os.rename(temp_path, output_path)
        
        elapsed = time.time() - start_time
        
        return {
            'success': True,
            'row_count': row_count,
            'elapsed_seconds': elapsed,
            'sort_columns': order_cols
        }
        
    except Exception as e:
        elapsed = time.time() - start_time
        # Clean up on failure
        try:
            if os.path.exists(db_path):
                os.remove(db_path)
            if os.path.exists(db_path + '.wal'):
                os.remove(db_path + '.wal')
        except:
            pass
        return {
            'success': False,
            'error': str(e),
            'elapsed_seconds': elapsed
        }

print("✓ Sort function defined (PERSISTENT DB for large files, auto-detect resources)")

✓ Sort function defined (PERSISTENT DB for large files, auto-detect resources)


In [4]:
# =============================================================================
# CELL 4: Test with Smallest Country File
# =============================================================================
if country_files:
    test_file = country_files[0]  # Smallest file
    
    print(f"Testing sort on smallest file: {test_file['country']}")
    print(f"  Input: {test_file['filepath']}")
    print(f"  Size: {test_file['size_gb']:.2f} GB")
    print()
    
    output_path = os.path.join(OUTPUT_DIR, test_file['filename'])
    print(f"  Output: {output_path}")
    print()
    print("Sorting...")
    
    result = sort_country_file(
        test_file['filepath'],
        output_path,
        SORT_COLUMNS
    )
    
    if result['success']:
        print(f"\n✓ SUCCESS")
        print(f"  Rows: {result['row_count']:,}")
        print(f"  Sort columns: {result['sort_columns']}")
        print(f"  Time: {result['elapsed_seconds']:.1f} seconds")
        
        output_size = os.path.getsize(output_path) / (1024**3)
        print(f"  Output size: {output_size:.2f} GB")
    else:
        print(f"\n✗ FAILED: {result['error']}")
else:
    print("No country files found to test")

Testing sort on smallest file: UNKNOWN
  Input: D:\ECB_ESMA_BY_COUNTRY_ALL\UNKNOWN.csv
  Size: 0.15 GB

  Output: D:\ECB_ESMA_BY_COUNTRY_SORTED\UNKNOWN.csv

Sorting...
  Row count: 205,991
  Sorting by ['RREL3', 'RREC3', 'RREL6']...

✓ SUCCESS
  Rows: 205,991
  Sort columns: ['RREL3', 'RREC3', 'RREL6']
  Time: 1.5 seconds
  Output size: 0.15 GB


In [5]:
# =============================================================================
# CELL 5: Verify Sorted Output
# =============================================================================
if country_files:
    test_file = country_files[0]
    output_path = os.path.join(OUTPUT_DIR, test_file['filename'])
    
    if os.path.exists(output_path):
        print(f"Verifying sorted file: {output_path}")
        print()
        
        # Read sample to verify sort order
        conn = duckdb.connect(':memory:')
        
        # Check first 30 rows - RREL3 is Loan ID, RREC3 is Collateral ID, RREL6 is Date
        sample_query = f"""
            SELECT RREL3 as Loan_ID, RREC3 as Collateral_ID, RREL6 as Date, RREL1 as Pool_ID
            FROM read_csv_auto('{output_path}')
            LIMIT 30
        """
        df_sample = conn.execute(sample_query).fetchdf()
        print("First 30 rows (sorted by RREL3 → RREC3 → RREL6):")
        print(df_sample.to_string())
        print()
        
        # Check sort order violations (RREL3 → RREC3 → RREL6)
        verify_query = f"""
            WITH sorted_check AS (
                SELECT 
                    RREL3, RREC3, RREL6,
                    LAG(RREL3) OVER () as prev_rrel3,
                    LAG(RREC3) OVER () as prev_rrec3,
                    LAG(RREL6) OVER () as prev_rrel6
                FROM read_csv_auto('{output_path}')
            )
            SELECT COUNT(*) as violations
            FROM sorted_check
            WHERE prev_rrel3 IS NOT NULL 
              AND (
                  RREL3 < prev_rrel3 
                  OR (RREL3 = prev_rrel3 AND RREC3 < prev_rrec3)
                  OR (RREL3 = prev_rrel3 AND RREC3 = prev_rrec3 AND RREL6 < prev_rrel6)
              )
        """
        violations = conn.execute(verify_query).fetchone()[0]
        if violations == 0:
            print(f"✓ SORT VERIFIED: No violations found!")
        else:
            print(f"✗ WARNING: {violations:,} sort order violations found!")
        
        conn.close()
    else:
        print(f"Output file not found: {output_path}")

Verifying sorted file: D:\ECB_ESMA_BY_COUNTRY_SORTED\UNKNOWN.csv

First 30 rows (sorted by RREL3 → RREC3 → RREL6):
    Loan_ID  Collateral_ID       Date                      Pool_ID
0      10.0        26226.0 2023-02-28  213800WQJJDCAN4BCO57N202001
1      10.0        26226.0 2023-05-31  213800WQJJDCAN4BCO57N202001
2      10.0        26226.0 2023-08-31  213800WQJJDCAN4BCO57N202001
3      10.0        26226.0 2023-11-30  213800WQJJDCAN4BCO57N202001
4      10.0        26226.0 2024-05-31  213800WQJJDCAN4BCO57N202001
5      10.0        26226.0 2024-08-31  213800WQJJDCAN4BCO57N202001
6      10.0        26226.0 2024-11-30  213800WQJJDCAN4BCO57N202001
7      10.0        26226.0 2021-05-31  213800WQJJDCAN4BCO57N202001
8      10.0        26226.0 2021-08-31  213800WQJJDCAN4BCO57N202001
9      10.0        26226.0 2021-11-30  213800WQJJDCAN4BCO57N202001
10     10.0        26226.0 2022-02-28  213800WQJJDCAN4BCO57N202001
11     10.0        26226.0 2022-05-31  213800WQJJDCAN4BCO57N202001
12     10.0   

In [6]:
# =============================================================================
# CELL 5b: Test on PT (0.85 GB) - second smallest
# =============================================================================
if len(country_files) > 1:
    test_file = country_files[1]  # PT
    
    print(f"Testing sort on: {test_file['country']}")
    print(f"  Input: {test_file['filepath']}")
    print(f"  Size: {test_file['size_gb']:.2f} GB")
    print()
    
    output_path = os.path.join(OUTPUT_DIR, test_file['filename'])
    print(f"  Output: {output_path}")
    print()
    print("Sorting...")
    
    result = sort_country_file(
        test_file['filepath'],
        output_path,
        SORT_COLUMNS
    )
    
    if result['success']:
        print(f"\n✓ SUCCESS")
        print(f"  Rows: {result['row_count']:,}")
        print(f"  Sort columns: {result['sort_columns']}")
        print(f"  Time: {result['elapsed_seconds']:.1f} seconds ({result['elapsed_seconds']/60:.1f} min)")
        
        output_size = os.path.getsize(output_path) / (1024**3)
        print(f"  Output size: {output_size:.2f} GB")
        print(f"  Speed: {test_file['size_gb'] / result['elapsed_seconds'] * 60:.1f} GB/min")
    else:
        print(f"\n✗ FAILED: {result['error']}")

Testing sort on: PT
  Input: D:\ECB_ESMA_BY_COUNTRY_ALL\PT.csv
  Size: 0.85 GB

  Output: D:\ECB_ESMA_BY_COUNTRY_SORTED\PT.csv

Sorting...
  Row count: 952,510
  Sorting by ['RREL3', 'RREC3', 'RREL6']...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


✓ SUCCESS
  Rows: 952,510
  Sort columns: ['RREL3', 'RREC3', 'RREL6']
  Time: 8.8 seconds (0.1 min)
  Output size: 0.85 GB
  Speed: 5.8 GB/min


In [7]:
# =============================================================================
# Verify PT Sort: RREL3 (Loan ID), RREC3 (Collateral ID), RREL6 (Date)
# =============================================================================
conn = duckdb.connect(':memory:')

print("=" * 70)
print("VERIFICATION: PT.csv sorted by RREL3 → RREC3 → RREL6")
print("=" * 70)

# Check first 50 rows to see if loans are grouped
print("\n1. First 50 rows (should be sorted by Loan → Collateral → Date):")
query1 = """
SELECT 
    RREL3,
    RREC3,
    RREL6,
    RREL1
FROM read_csv('D:/ECB_ESMA_BY_COUNTRY_SORTED/PT.csv', all_varchar=true, header=true)
LIMIT 50
"""
df1 = conn.execute(query1).fetchdf()
print(df1.to_string())

# Check a specific loan to see its complete time series
print("\n\n2. Sample loan-collateral time series (dates should be ascending):")
query2 = """
WITH loan_collateral AS (
    SELECT RREL3, RREC3, COUNT(*) as obs
    FROM read_csv('D:/ECB_ESMA_BY_COUNTRY_SORTED/PT.csv', all_varchar=true, header=true)
    WHERE RREL3 IS NOT NULL AND RREL3 != '' AND RREC3 IS NOT NULL AND RREC3 != ''
    GROUP BY RREL3, RREC3
    HAVING COUNT(*) > 3
    LIMIT 1
)
SELECT t.RREL3, t.RREC3, t.RREL6, t.RREL1
FROM read_csv('D:/ECB_ESMA_BY_COUNTRY_SORTED/PT.csv', all_varchar=true, header=true) t
INNER JOIN loan_collateral lc ON t.RREL3 = lc.RREL3 AND t.RREC3 = lc.RREC3
ORDER BY t.RREL3, t.RREC3, t.RREL6
"""
df2 = conn.execute(query2).fetchdf()
print(df2.to_string())

# Check that the file is actually sorted
print("\n\n3. Verify sort order (RREL3 → RREC3 → RREL6):")
query3 = """
WITH sorted_check AS (
    SELECT 
        RREL3,
        RREC3,
        RREL6,
        LAG(RREL3) OVER () as prev_rrel3,
        LAG(RREC3) OVER () as prev_rrec3,
        LAG(RREL6) OVER () as prev_rrel6
    FROM read_csv('D:/ECB_ESMA_BY_COUNTRY_SORTED/PT.csv', all_varchar=true, header=true)
)
SELECT COUNT(*) as violations
FROM sorted_check
WHERE prev_rrel3 IS NOT NULL 
  AND (
      RREL3 < prev_rrel3 
      OR (RREL3 = prev_rrel3 AND RREC3 < prev_rrec3)
      OR (RREL3 = prev_rrel3 AND RREC3 = prev_rrec3 AND RREL6 < prev_rrel6)
  )
"""
df3 = conn.execute(query3).fetchdf()
violations = df3['violations'].iloc[0]
if violations == 0:
    print(f"✓ SORT VERIFIED: No violations found - file is correctly sorted!")
else:
    print(f"✗ ERROR: {violations} sort order violations found!")

conn.close()

VERIFICATION: PT.csv sorted by RREL3 → RREC3 → RREL6

1. First 50 rows (should be sorted by Loan → Collateral → Date):
          RREL3          RREC3       RREL6                        RREL1
0           100        0074920  2024-07-31  2138004FIUXU3B2MR537N200801
1           100        0074920  2024-08-31  2138004FIUXU3B2MR537N200801
2           100        0074920  2024-09-30  2138004FIUXU3B2MR537N200801
3        100099        0026188  2024-07-31  2138004FIUXU3B2MR537N200801
4        100099        0026188  2024-08-31  2138004FIUXU3B2MR537N200801
5        100099        0026188  2024-09-30  2138004FIUXU3B2MR537N200801
6        100134        0022049  2024-07-31  2138004FIUXU3B2MR537N200801
7        100134        0022049  2024-08-31  2138004FIUXU3B2MR537N200801
8        100134        0022049  2024-09-30  2138004FIUXU3B2MR537N200801
9         10016        0050272  2024-07-31  2138004FIUXU3B2MR537N200801
10        10016        0050272  2024-08-31  2138004FIUXU3B2MR537N200801
11        10016  

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ SORT VERIFIED: No violations found - file is correctly sorted!


In [8]:
# =============================================================================
# Investigate: Why multiple rows per Loan ID per Date?
# =============================================================================
# Check what differentiates rows with same RREL1 + Pool_Cutoff_Date

conn = duckdb.connect(':memory:')

# Look at PT sorted file - check rows for same loan+date
query = """
SELECT 
    RREL1,
    Pool_Cutoff_Date,
    RREL3,  -- Loan Identifier (might be different per exposure)
    RREC6,  -- Collateral ID / Geographic region
    source,
    COUNT(*) as cnt
FROM read_csv('D:\\ECB_ESMA_BY_COUNTRY_SORTED\\PT.csv', all_varchar=true, header=true)
WHERE Pool_Cutoff_Date = '2024-07-31'
GROUP BY RREL1, Pool_Cutoff_Date, RREL3, RREC6, source
ORDER BY RREL1, RREL3
LIMIT 50
"""
print("Checking what makes rows unique within same RREL1 + Date:")
print()
df = conn.execute(query).fetchdf()
print(df.to_string())

print("\n" + "="*60)
print("Let's check RREL3 (individual loan identifier) distribution:")
query2 = """
SELECT 
    RREL1,
    Pool_Cutoff_Date,
    COUNT(DISTINCT RREL3) as unique_loans,
    COUNT(*) as total_rows
FROM read_csv('D:\\ECB_ESMA_BY_COUNTRY_SORTED\\PT.csv', all_varchar=true, header=true)
WHERE Pool_Cutoff_Date = '2024-07-31'
GROUP BY RREL1, Pool_Cutoff_Date
ORDER BY total_rows DESC
LIMIT 10
"""
df2 = conn.execute(query2).fetchdf()
print(df2.to_string())

conn.close()

Checking what makes rows unique within same RREL1 + Date:

                          RREL1 Pool_Cutoff_Date   RREL3  RREC6 source  cnt
0   2138004FIUXU3B2MR537N200801       2024-07-31     100  PT150   ESMA    1
1   2138004FIUXU3B2MR537N200801       2024-07-31  100099  PT116   ESMA    1
2   2138004FIUXU3B2MR537N200801       2024-07-31  100134  PT171   ESMA    1
3   2138004FIUXU3B2MR537N200801       2024-07-31   10016  PT161   ESMA    1
4   2138004FIUXU3B2MR537N200801       2024-07-31   10017  PT16B   ESMA    1
5   2138004FIUXU3B2MR537N200801       2024-07-31  100197  PT114   ESMA    1
6   2138004FIUXU3B2MR537N200801       2024-07-31   10020  PT171   ESMA    1
7   2138004FIUXU3B2MR537N200801       2024-07-31  100246  PT116   ESMA    1
8   2138004FIUXU3B2MR537N200801       2024-07-31  100429  PT16B   ESMA    1
9   2138004FIUXU3B2MR537N200801       2024-07-31  100459  PT16B   ESMA    1
10  2138004FIUXU3B2MR537N200801       2024-07-31  100575  PT112   ESMA    1
11  2138004FIUXU3B2MR537N2008

In [9]:
# =============================================================================
# CELL 6: Sort All Country Files (Production Run)
# =============================================================================
# CAUTION: This may take many hours for large files!
# Recommend running overnight.

RUN_ALL = True  # Set to True to process all files

if RUN_ALL and country_files:
    print("="*60)
    print("SORTING ALL COUNTRY FILES")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*60)
    print()
    
    results = []
    total_start = time.time()
    
    for i, f in enumerate(country_files):
        output_path = os.path.join(OUTPUT_DIR, f['filename'])
        
        # Skip if already sorted
        if os.path.exists(output_path):
            print(f"[{i+1}/{len(country_files)}] {f['country']}: Already exists, skipping")
            results.append({'country': f['country'], 'status': 'skipped'})
            continue
        
        print(f"[{i+1}/{len(country_files)}] {f['country']}: {f['size_gb']:.2f} GB")
        
        result = sort_country_file(
            f['filepath'],
            output_path,
            SORT_COLUMNS
        )
        
        if result['success']:
            print(f"    ✓ {result['row_count']:,} rows in {result['elapsed_seconds']:.1f}s")
            results.append({
                'country': f['country'],
                'status': 'success',
                'rows': result['row_count'],
                'time': result['elapsed_seconds']
            })
        else:
            print(f"    ✗ FAILED: {result['error']}")
            results.append({
                'country': f['country'],
                'status': 'failed',
                'error': result['error']
            })
        
        print()
    
    total_elapsed = time.time() - total_start
    
    print("="*60)
    print("COMPLETE")
    print(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Total time: {total_elapsed/3600:.1f} hours")
    print("="*60)
    
    # Summary
    success_count = sum(1 for r in results if r['status'] == 'success')
    failed_count = sum(1 for r in results if r['status'] == 'failed')
    skipped_count = sum(1 for r in results if r['status'] == 'skipped')
    
    print(f"\nSuccess: {success_count}")
    print(f"Failed: {failed_count}")
    print(f"Skipped: {skipped_count}")
else:
    print("Set RUN_ALL = True to process all country files")
    print(f"Files to process: {len(country_files)}")

SORTING ALL COUNTRY FILES
Started: 2025-12-15 14:59:06

[1/10] UNKNOWN: Already exists, skipping
[2/10] PT: Already exists, skipping
[3/10] IE: 4.31 GB


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  Row count: 4,849,990
  Sorting by ['RREL3', 'RREC3', 'RREL6']...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ 4,849,990 rows in 85.6s

[4/10] IT: 9.24 GB


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  Row count: 11,029,781
  Sorting by ['RREL3', 'RREC3', 'RREL6']...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ 11,029,781 rows in 471.0s

[5/10] ES: 12.19 GB


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  Row count: 13,545,314
  Sorting by ['RREL3', 'RREC3', 'RREL6']...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ 13,545,314 rows in 815.5s

[6/10] NL: 45.37 GB


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  Row count: 39,952,401
  Sorting by ['RREL3', 'RREC3', 'RREL6']...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ 39,952,401 rows in 2818.3s

[7/10] BE: 53.66 GB


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  Row count: 54,977,394
  Sorting by ['RREL3', 'RREC3', 'RREL6']...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ 54,977,394 rows in 4237.0s

[8/10] DE: 53.73 GB


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  Row count: 53,242,506
  Sorting by ['RREL3', 'RREC3', 'RREL6']...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ 53,242,506 rows in 4956.2s

[9/10] UK: 56.84 GB


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  Row count: 67,514,154
  Sorting by ['RREL3', 'RREC3', 'RREL6']...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ 67,514,154 rows in 6804.2s

[10/10] FR: Already exists, skipping
COMPLETE
Finished: 2025-12-15 20:35:34
Total time: 5.6 hours

Success: 7
Failed: 0
Skipped: 3


In [10]:
# =============================================================================
# CELL 7: Summary Report
# =============================================================================
print("="*60)
print("SORTED FILES SUMMARY")
print("="*60)
print()

sorted_files = []
if os.path.exists(OUTPUT_DIR):
    for fname in os.listdir(OUTPUT_DIR):
        if fname.endswith('.csv'):
            filepath = os.path.join(OUTPUT_DIR, fname)
            size_gb = os.path.getsize(filepath) / (1024**3)
            sorted_files.append({
                'country': fname.replace('.csv', ''),
                'size_gb': size_gb
            })

sorted_files.sort(key=lambda x: x['size_gb'], reverse=True)

print(f"Sorted files in {OUTPUT_DIR}:")
print()
total_sorted = 0
for f in sorted_files:
    print(f"  {f['country']}: {f['size_gb']:.2f} GB")
    total_sorted += f['size_gb']

print(f"\nTotal sorted: {len(sorted_files)} files, {total_sorted:.2f} GB")

# Compare to input
print(f"\nOriginal files: {len(country_files)}")
remaining = len(country_files) - len(sorted_files)
if remaining > 0:
    print(f"Remaining to sort: {remaining}")

SORTED FILES SUMMARY

Sorted files in D:\ECB_ESMA_BY_COUNTRY_SORTED:

  FR: 91.59 GB
  UK: 56.77 GB
  DE: 53.68 GB
  BE: 53.61 GB
  NL: 45.34 GB
  ES: 12.18 GB
  IT: 9.23 GB
  IE: 4.31 GB
  PT: 0.85 GB
  UNKNOWN: 0.15 GB

Total sorted: 10 files, 327.71 GB

Original files: 10


In [2]:
# =============================================================================
# VERIFICATION: Check sort order for ALL sorted country files
# =============================================================================
import duckdb
import os

OUTPUT_DIR = r"D:\ECB_ESMA_BY_COUNTRY_SORTED"

print("=" * 80)
print("VERIFICATION: Checking sort order for ALL sorted country files")
print("Sort key: RREL3 (Loan) → RREC3 (Collateral) → RREL6 (Date)")
print("=" * 80)
print()

# Get all sorted files
sorted_files = []
for fname in os.listdir(OUTPUT_DIR):
    if fname.endswith('.csv'):
        filepath = os.path.join(OUTPUT_DIR, fname)
        size_gb = os.path.getsize(filepath) / (1024**3)
        sorted_files.append({
            'country': fname.replace('.csv', ''),
            'filepath': filepath,
            'size_gb': size_gb
        })

# Sort by size (smallest first for faster initial feedback)
sorted_files.sort(key=lambda x: x['size_gb'])

results = []
total_start = time.time()

for i, f in enumerate(sorted_files):
    print(f"[{i+1}/{len(sorted_files)}] {f['country']} ({f['size_gb']:.2f} GB)...", end=" ", flush=True)
    
    start = time.time()
    
    try:
        conn = duckdb.connect(':memory:')
        
        # Count rows
        count_query = f"""
            SELECT COUNT(*) FROM read_csv('{f['filepath']}', all_varchar=true, header=true, quote='"')
        """
        row_count = conn.execute(count_query).fetchone()[0]
        
        # Check sort order violations
        verify_query = f"""
            WITH sorted_check AS (
                SELECT 
                    RREL3, RREC3, RREL6,
                    LAG(RREL3) OVER () as prev_rrel3,
                    LAG(RREC3) OVER () as prev_rrec3,
                    LAG(RREL6) OVER () as prev_rrel6
                FROM read_csv('{f['filepath']}', all_varchar=true, header=true, quote='"')
            )
            SELECT COUNT(*) as violations
            FROM sorted_check
            WHERE prev_rrel3 IS NOT NULL 
              AND (
                  RREL3 < prev_rrel3 
                  OR (RREL3 = prev_rrel3 AND RREC3 < prev_rrec3)
                  OR (RREL3 = prev_rrel3 AND RREC3 = prev_rrec3 AND RREL6 < prev_rrel6)
              )
        """
        violations = conn.execute(verify_query).fetchone()[0]
        
        # Check ASCII order (symbols < digits < letters)
        ascii_query = f"""
            WITH numbered AS (
                SELECT RREL3, ROW_NUMBER() OVER () as rn
                FROM read_csv('{f['filepath']}', all_varchar=true, header=true, quote='"')
                WHERE RREL3 IS NOT NULL AND RREL3 != ''
            ),
            typed AS (
                SELECT 
                    rn,
                    CASE 
                        WHEN SUBSTRING(RREL3, 1, 1) BETWEEN '0' AND '9' THEN 'digit'
                        WHEN SUBSTRING(RREL3, 1, 1) BETWEEN 'A' AND 'Z' THEN 'upper'
                        WHEN SUBSTRING(RREL3, 1, 1) BETWEEN 'a' AND 'z' THEN 'lower'
                        ELSE 'symbol'
                    END as char_type
                FROM numbered
            )
            SELECT char_type, MIN(rn) as first_row, MAX(rn) as last_row, COUNT(*) as cnt
            FROM typed
            GROUP BY char_type
            ORDER BY first_row
        """
        ascii_df = conn.execute(ascii_query).fetchdf()
        
        conn.close()
        
        elapsed = time.time() - start
        
        if violations == 0:
            print(f"✓ PASS ({row_count:,} rows, {elapsed:.1f}s)")
            results.append({
                'country': f['country'],
                'status': 'PASS',
                'rows': row_count,
                'violations': 0,
                'time': elapsed,
                'ascii_order': ascii_df.to_dict('records')
            })
        else:
            print(f"✗ FAIL - {violations:,} violations!")
            results.append({
                'country': f['country'],
                'status': 'FAIL',
                'rows': row_count,
                'violations': violations,
                'time': elapsed
            })
            
    except Exception as e:
        elapsed = time.time() - start
        print(f"✗ ERROR: {str(e)[:50]}")
        results.append({
            'country': f['country'],
            'status': 'ERROR',
            'error': str(e),
            'time': elapsed
        })

total_elapsed = time.time() - total_start

print()
print("=" * 80)
print("SUMMARY")
print("=" * 80)

passed = sum(1 for r in results if r['status'] == 'PASS')
failed = sum(1 for r in results if r['status'] == 'FAIL')
errors = sum(1 for r in results if r['status'] == 'ERROR')
total_rows = sum(r.get('rows', 0) for r in results)

print(f"Total countries: {len(results)}")
print(f"  ✓ Passed: {passed}")
print(f"  ✗ Failed: {failed}")
print(f"  ⚠ Errors: {errors}")
print(f"Total rows verified: {total_rows:,}")
print(f"Total verification time: {total_elapsed/60:.1f} minutes")

if failed > 0:
    print("\nFailed countries:")
    for r in results:
        if r['status'] == 'FAIL':
            print(f"  {r['country']}: {r['violations']:,} violations")

if errors > 0:
    print("\nError details:")
    for r in results:
        if r['status'] == 'ERROR':
            print(f"  {r['country']}: {r['error']}")

print()
print("=" * 80)

VERIFICATION: Checking sort order for ALL sorted country files
Sort key: RREL3 (Loan) → RREC3 (Collateral) → RREL6 (Date)

[1/10] UNKNOWN (0.15 GB)... ✓ PASS (205,991 rows, 1.9s)
[2/10] PT (0.85 GB)... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ PASS (952,510 rows, 5.4s)
[3/10] IE (4.31 GB)... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ PASS (4,849,990 rows, 20.5s)
[4/10] IT (9.23 GB)... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ PASS (11,029,781 rows, 47.4s)
[5/10] ES (12.18 GB)... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ PASS (13,545,314 rows, 62.4s)
[6/10] NL (45.34 GB)... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ PASS (39,952,401 rows, 312.3s)
[7/10] BE (53.61 GB)... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ PASS (54,977,394 rows, 331.6s)
[8/10] DE (53.68 GB)... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ PASS (53,242,506 rows, 303.4s)
[9/10] UK (56.77 GB)... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ PASS (67,514,154 rows, 367.4s)
[10/10] FR (91.59 GB)... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ PASS (92,034,390 rows, 528.7s)

SUMMARY
Total countries: 10
  ✓ Passed: 10
  ✗ Failed: 0
  ⚠ Errors: 0
Total rows verified: 338,304,431
Total verification time: 33.0 minutes



In [None]:
# =============================================================================
# COMPREHENSIVE DATA INTEGRITY CHECK: Input vs Output Comparison
# =============================================================================
# This verifies NO DATA was lost or corrupted during sorting:
#   1. Row count: input == output
#   2. Column count: input == output  
#   3. Unique key count: (RREL3, RREC3, RREL6) preserved
#   4. Distinct values in key columns preserved
#   5. Checksums on sample data columns

import duckdb
import os
import time

INPUT_DIR = r"D:\ECB_ESMA_BY_COUNTRY_ALL"
OUTPUT_DIR = r"D:\ECB_ESMA_BY_COUNTRY_SORTED"

print("=" * 90)
print("COMPREHENSIVE DATA INTEGRITY CHECK: Comparing Input vs Output")
print("=" * 90)
print()

# Get matched input/output pairs
files_to_check = []
for fname in os.listdir(OUTPUT_DIR):
    if fname.endswith('.csv'):
        input_path = os.path.join(INPUT_DIR, fname)
        output_path = os.path.join(OUTPUT_DIR, fname)
        if os.path.exists(input_path):
            size_gb = os.path.getsize(output_path) / (1024**3)
            files_to_check.append({
                'country': fname.replace('.csv', ''),
                'input': input_path,
                'output': output_path,
                'size_gb': size_gb
            })

# Sort smallest first
files_to_check.sort(key=lambda x: x['size_gb'])

print(f"Checking {len(files_to_check)} countries for data integrity...")
print()

results = []
total_start = time.time()

for i, f in enumerate(files_to_check):
    print(f"[{i+1}/{len(files_to_check)}] {f['country']} ({f['size_gb']:.2f} GB)")
    
    start = time.time()
    issues = []
    
    try:
        conn = duckdb.connect(':memory:')
        conn.execute("SET threads = 4")
        
        # 1. ROW COUNT CHECK
        input_rows = conn.execute(f"""
            SELECT COUNT(*) FROM read_csv('{f['input']}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        output_rows = conn.execute(f"""
            SELECT COUNT(*) FROM read_csv('{f['output']}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        if input_rows != output_rows:
            issues.append(f"ROW COUNT MISMATCH: input={input_rows:,}, output={output_rows:,}, diff={input_rows - output_rows:,}")
        
        # 2. COLUMN COUNT CHECK
        input_cols = conn.execute(f"""
            SELECT * FROM read_csv('{f['input']}', all_varchar=true, header=true, quote='"') LIMIT 0
        """).description
        input_col_count = len(input_cols)
        input_col_names = [c[0] for c in input_cols]
        
        output_cols = conn.execute(f"""
            SELECT * FROM read_csv('{f['output']}', all_varchar=true, header=true, quote='"') LIMIT 0
        """).description
        output_col_count = len(output_cols)
        output_col_names = [c[0] for c in output_cols]
        
        if input_col_count != output_col_count:
            issues.append(f"COLUMN COUNT MISMATCH: input={input_col_count}, output={output_col_count}")
        
        if input_col_names != output_col_names:
            issues.append(f"COLUMN NAMES DIFFER")
        
        # 3. UNIQUE KEY COUNT CHECK (RREL3, RREC3, RREL6)
        input_unique_keys = conn.execute(f"""
            SELECT COUNT(DISTINCT RREL3 || '|' || COALESCE(RREC3,'') || '|' || COALESCE(RREL6,''))
            FROM read_csv('{f['input']}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        output_unique_keys = conn.execute(f"""
            SELECT COUNT(DISTINCT RREL3 || '|' || COALESCE(RREC3,'') || '|' || COALESCE(RREL6,''))
            FROM read_csv('{f['output']}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        if input_unique_keys != output_unique_keys:
            issues.append(f"UNIQUE KEY COUNT MISMATCH: input={input_unique_keys:,}, output={output_unique_keys:,}")
        
        # 4. DISTINCT VALUES CHECK for key columns
        for col in ['RREL3', 'RREC3', 'RREL6', 'RREL1', 'source']:
            if col in input_col_names:
                input_distinct = conn.execute(f"""
                    SELECT COUNT(DISTINCT {col}) FROM read_csv('{f['input']}', all_varchar=true, header=true, quote='"')
                """).fetchone()[0]
                
                output_distinct = conn.execute(f"""
                    SELECT COUNT(DISTINCT {col}) FROM read_csv('{f['output']}', all_varchar=true, header=true, quote='"')
                """).fetchone()[0]
                
                if input_distinct != output_distinct:
                    issues.append(f"DISTINCT {col} MISMATCH: input={input_distinct:,}, output={output_distinct:,}")
        
        # 5. CHECKSUM on text columns (hash of all RREL3 values)
        input_hash = conn.execute(f"""
            SELECT MD5(STRING_AGG(COALESCE(RREL3,''), '' ORDER BY RREL3, RREC3, RREL6))
            FROM read_csv('{f['input']}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        output_hash = conn.execute(f"""
            SELECT MD5(STRING_AGG(COALESCE(RREL3,''), '' ORDER BY RREL3, RREC3, RREL6))
            FROM read_csv('{f['output']}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        if input_hash != output_hash:
            issues.append(f"RREL3 CHECKSUM MISMATCH (data may be corrupted)")
        
        # 6. SPOT CHECK: Compare random sample of full rows
        # Get 5 random RREL3 values and compare all columns
        sample_keys = conn.execute(f"""
            SELECT DISTINCT RREL3 FROM read_csv('{f['input']}', all_varchar=true, header=true, quote='"')
            WHERE RREL3 IS NOT NULL AND RREL3 != ''
            USING SAMPLE 5
        """).fetchall()
        
        for (key,) in sample_keys:
            key_escaped = key.replace("'", "''")
            
            input_sample = conn.execute(f"""
                SELECT * FROM read_csv('{f['input']}', all_varchar=true, header=true, quote='"')
                WHERE RREL3 = '{key_escaped}'
                ORDER BY RREC3, RREL6
            """).fetchall()
            
            output_sample = conn.execute(f"""
                SELECT * FROM read_csv('{f['output']}', all_varchar=true, header=true, quote='"')
                WHERE RREL3 = '{key_escaped}'
                ORDER BY RREC3, RREL6
            """).fetchall()
            
            if input_sample != output_sample:
                issues.append(f"SPOT CHECK FAILED: Data differs for RREL3='{key[:20]}...'")
                break
        
        conn.close()
        elapsed = time.time() - start
        
        if len(issues) == 0:
            print(f"    ✓ PASS - {input_rows:,} rows, {input_col_count} cols, {elapsed:.1f}s")
            results.append({
                'country': f['country'],
                'status': 'PASS',
                'rows': input_rows,
                'cols': input_col_count,
                'unique_keys': input_unique_keys,
                'time': elapsed
            })
        else:
            print(f"    ✗ FAIL - {len(issues)} issue(s):")
            for issue in issues:
                print(f"      - {issue}")
            results.append({
                'country': f['country'],
                'status': 'FAIL',
                'issues': issues,
                'time': elapsed
            })
            
    except Exception as e:
        elapsed = time.time() - start
        print(f"    ✗ ERROR: {str(e)[:60]}")
        results.append({
            'country': f['country'],
            'status': 'ERROR',
            'error': str(e),
            'time': elapsed
        })

total_elapsed = time.time() - total_start

print()
print("=" * 90)
print("DATA INTEGRITY SUMMARY")
print("=" * 90)

passed = sum(1 for r in results if r['status'] == 'PASS')
failed = sum(1 for r in results if r['status'] == 'FAIL')
errors = sum(1 for r in results if r['status'] == 'ERROR')
total_rows = sum(r.get('rows', 0) for r in results if r['status'] == 'PASS')

print(f"Total countries: {len(results)}")
print(f"  ✓ Passed: {passed}")
print(f"  ✗ Failed: {failed}")
print(f"  ⚠ Errors: {errors}")
print(f"Total rows verified: {total_rows:,}")
print(f"Total time: {total_elapsed/60:.1f} minutes")

if passed == len(results):
    print()
    print("🎉 ALL DATA INTEGRITY CHECKS PASSED!")
    print("   - Row counts match")
    print("   - Column counts match")
    print("   - Unique key counts match")
    print("   - Distinct value counts match")
    print("   - Checksums match")
    print("   - Spot checks passed")

if failed > 0:
    print()
    print("⚠️  FAILED COUNTRIES - DATA MAY BE CORRUPTED:")
    for r in results:
        if r['status'] == 'FAIL':
            print(f"\n  {r['country']}:")
            for issue in r['issues']:
                print(f"    - {issue}")

print()
print("=" * 90)

COMPREHENSIVE DATA INTEGRITY CHECK: Comparing Input vs Output

Checking 10 countries for data integrity...

[1/10] UNKNOWN (0.15 GB)
    ✓ PASS - 205,991 rows, 192 cols, 7.5s
[2/10] PT (0.85 GB)
    ✓ PASS - 952,510 rows, 328 cols, 24.2s
[3/10] IE (4.31 GB)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✗ FAIL - 1 issue(s):
      - SPOT CHECK FAILED: Data differs for RREL3='Auhr4UxXtz5dPg4nJ2Zc...'
[4/10] IT (9.23 GB)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ PASS - 11,029,781 rows, 216 cols, 295.0s
[5/10] ES (12.18 GB)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ PASS - 13,545,314 rows, 216 cols, 386.7s
[6/10] NL (45.34 GB)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ PASS - 39,952,401 rows, 217 cols, 1533.9s
[7/10] BE (53.61 GB)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ PASS - 54,977,394 rows, 217 cols, 1870.1s
[8/10] DE (53.68 GB)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    ✓ PASS - 53,242,506 rows, 216 cols, 2245.4s
[9/10] UK (56.77 GB)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [2]:
# =============================================================================
# VERIFY REMAINING COUNTRIES: UK and FR
# =============================================================================
import duckdb
import os
import time

INPUT_DIR = r"D:\ECB_ESMA_BY_COUNTRY_ALL"
OUTPUT_DIR = r"D:\ECB_ESMA_BY_COUNTRY_SORTED"

# Only check UK and FR
countries_to_check = ['UK', 'FR']

print("=" * 90)
print("DATA INTEGRITY CHECK: UK and FR")
print("=" * 90)
print()

for country in countries_to_check:
    input_path = os.path.join(INPUT_DIR, f"{country}.csv")
    output_path = os.path.join(OUTPUT_DIR, f"{country}.csv")
    
    if not os.path.exists(input_path) or not os.path.exists(output_path):
        print(f"{country}: Files not found, skipping")
        continue
    
    size_gb = os.path.getsize(output_path) / (1024**3)
    print(f"Checking {country} ({size_gb:.2f} GB)...")
    
    start = time.time()
    issues = []
    
    try:
        conn = duckdb.connect(':memory:')
        conn.execute("SET threads = 4")
        
        # 1. ROW COUNT CHECK
        print("  - Row count...", end=" ", flush=True)
        input_rows = conn.execute(f"""
            SELECT COUNT(*) FROM read_csv('{input_path}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        output_rows = conn.execute(f"""
            SELECT COUNT(*) FROM read_csv('{output_path}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        if input_rows != output_rows:
            issues.append(f"ROW COUNT MISMATCH: input={input_rows:,}, output={output_rows:,}")
            print("✗")
        else:
            print(f"✓ ({input_rows:,})")
        
        # 2. COLUMN COUNT CHECK
        print("  - Column count...", end=" ", flush=True)
        input_cols = conn.execute(f"""
            SELECT * FROM read_csv('{input_path}', all_varchar=true, header=true, quote='"') LIMIT 0
        """).description
        input_col_count = len(input_cols)
        
        output_cols = conn.execute(f"""
            SELECT * FROM read_csv('{output_path}', all_varchar=true, header=true, quote='"') LIMIT 0
        """).description
        output_col_count = len(output_cols)
        
        if input_col_count != output_col_count:
            issues.append(f"COLUMN COUNT MISMATCH: input={input_col_count}, output={output_col_count}")
            print("✗")
        else:
            print(f"✓ ({input_col_count})")
        
        # 3. UNIQUE KEY COUNT CHECK
        print("  - Unique keys...", end=" ", flush=True)
        input_unique_keys = conn.execute(f"""
            SELECT COUNT(DISTINCT RREL3 || '|' || COALESCE(RREC3,'') || '|' || COALESCE(RREL6,''))
            FROM read_csv('{input_path}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        output_unique_keys = conn.execute(f"""
            SELECT COUNT(DISTINCT RREL3 || '|' || COALESCE(RREC3,'') || '|' || COALESCE(RREL6,''))
            FROM read_csv('{output_path}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        if input_unique_keys != output_unique_keys:
            issues.append(f"UNIQUE KEY COUNT MISMATCH: input={input_unique_keys:,}, output={output_unique_keys:,}")
            print("✗")
        else:
            print(f"✓ ({input_unique_keys:,})")
        
        # 4. DISTINCT VALUES CHECK for key columns
        print("  - Distinct values...", end=" ", flush=True)
        for col in ['RREL3', 'RREC3', 'RREL6', 'source']:
            input_distinct = conn.execute(f"""
                SELECT COUNT(DISTINCT {col}) FROM read_csv('{input_path}', all_varchar=true, header=true, quote='"')
            """).fetchone()[0]
            
            output_distinct = conn.execute(f"""
                SELECT COUNT(DISTINCT {col}) FROM read_csv('{output_path}', all_varchar=true, header=true, quote='"')
            """).fetchone()[0]
            
            if input_distinct != output_distinct:
                issues.append(f"DISTINCT {col} MISMATCH: input={input_distinct:,}, output={output_distinct:,}")
        
        if not any('DISTINCT' in i for i in issues):
            print("✓")
        else:
            print("✗")
        
        # 5. CHECKSUM
        print("  - Checksum...", end=" ", flush=True)
        input_hash = conn.execute(f"""
            SELECT MD5(STRING_AGG(COALESCE(RREL3,''), '' ORDER BY RREL3, RREC3, RREL6))
            FROM read_csv('{input_path}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        output_hash = conn.execute(f"""
            SELECT MD5(STRING_AGG(COALESCE(RREL3,''), '' ORDER BY RREL3, RREC3, RREL6))
            FROM read_csv('{output_path}', all_varchar=true, header=true, quote='"')
        """).fetchone()[0]
        
        if input_hash != output_hash:
            issues.append(f"CHECKSUM MISMATCH")
            print("✗")
        else:
            print("✓")
        
        conn.close()
        elapsed = time.time() - start
        
        if len(issues) == 0:
            print(f"\n  ✓ {country} PASSED - {input_rows:,} rows in {elapsed:.1f}s\n")
        else:
            print(f"\n  ✗ {country} FAILED:")
            for issue in issues:
                print(f"    - {issue}")
            print()
            
    except Exception as e:
        elapsed = time.time() - start
        print(f"\n  ✗ {country} ERROR: {str(e)[:80]}\n")

print("=" * 90)
print("DONE")
print("=" * 90)

DATA INTEGRITY CHECK: UK and FR

Checking UK (56.77 GB)...
  - Row count... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ (67,514,154)
  - Column count... ✓ (213)
  - Unique keys... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ (60,060,878)
  - Distinct values... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓
  - Checksum... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓

  ✓ UK PASSED - 67,514,154 rows in 1119.7s

Checking FR (91.59 GB)...
  - Row count... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ (92,034,390)
  - Column count... ✓ (215)
  - Unique keys... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ (92,016,994)
  - Distinct values... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓
  - Checksum... 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓

  ✓ FR PASSED - 92,034,390 rows in 1797.6s

DONE
