In [3]:
from pathlib import Path
import duckdb

# --- CONFIG ---
# Directory that is scanned for the source *.csv files.
DATA_DIR = (
    Path.home()
    / "opensource/sql-server-samples/samples"
    / "databases/adventure-works/data-warehouse-install-script"
)

# Directory that receives the generated Parquet files; created if missing.
PARQUET_DIR = Path("../data")
PARQUET_DIR.mkdir(exist_ok=True)
def _parse_table_spec(spec):
    """Turn a block of ``Name TYPE`` lines into one schema entry.

    Args:
        spec: Multi-line string with one ``<column name> <SQL type>`` pair
            per line, in file order. Blank lines are ignored.

    Returns:
        ``{"columns": [names in order], "types": {name: type}}``. The column
        list is derived from the type mapping, so the two can never disagree.

    Raises:
        ValueError: If a line is not exactly two whitespace-separated tokens
            or a column name is repeated within the table.
    """
    types = {}
    for line in spec.splitlines():
        if not line.strip():
            continue
        parts = line.split()
        if len(parts) != 2:
            raise ValueError(f"Malformed column spec line: {line!r}")
        name, sql_type = parts
        if name in types:
            raise ValueError(f"Duplicate column name: {name!r}")
        types[name] = sql_type
    return {"columns": list(types), "types": types}


# One entry per table, keyed by lower-cased table name. Each value lists the
# columns as "<name> <type>" lines; this is the single source of truth from
# which both the ordered column list and the type mapping are built.
_TABLE_SPECS = {
    "databaselog2": """
        DatabaseLogID INTEGER
        PostTime TIMESTAMP
        DatabaseUser TEXT
        Event TEXT
        Schema TEXT
        Object TEXT
        TSQL TEXT
        XmlEvent TEXT
    """,
    "adventureworksdwbuildversion": """
        DBVersion TEXT
        VersionDate TIMESTAMP
    """,
    "dimaccount": """
        AccountKey INTEGER
        ParentAccountKey INTEGER
        AccountCodeAlternateKey INTEGER
        ParentAccountCodeAlternateKey INTEGER
        AccountDescription TEXT
        AccountType TEXT
        Operator TEXT
        CustomMembers TEXT
        ValueType TEXT
        CustomMemberOptions TEXT
    """,
    "dimcurrency": """
        CurrencyKey INTEGER
        CurrencyAlternateKey TEXT
        CurrencyName TEXT
    """,
    "dimcustomer": """
        CustomerKey INTEGER
        GeographyKey INTEGER
        CustomerAlternateKey TEXT
        Title TEXT
        FirstName TEXT
        MiddleName TEXT
        LastName TEXT
        NameStyle BOOLEAN
        BirthDate TIMESTAMP
        MaritalStatus TEXT
        Suffix TEXT
        Gender TEXT
        EmailAddress TEXT
        YearlyIncome DOUBLE
        TotalChildren INTEGER
        NumberChildrenAtHome INTEGER
        EnglishEducation TEXT
        SpanishEducation TEXT
        FrenchEducation TEXT
        EnglishOccupation TEXT
        SpanishOccupation TEXT
        FrenchOccupation TEXT
        HouseOwnerFlag TEXT
        NumberCarsOwned INTEGER
        AddressLine1 TEXT
        AddressLine2 TEXT
        Phone TEXT
        DateFirstPurchase TIMESTAMP
        CommuteDistance TEXT
    """,
    "dimdate": """
        DateKey INTEGER
        FullDateAlternateKey TIMESTAMP
        DayNumberOfWeek INTEGER
        EnglishDayNameOfWeek TEXT
        SpanishDayNameOfWeek TEXT
        FrenchDayNameOfWeek TEXT
        DayNumberOfMonth INTEGER
        DayNumberOfYear INTEGER
        WeekNumberOfYear INTEGER
        EnglishMonthName TEXT
        SpanishMonthName TEXT
        FrenchMonthName TEXT
        MonthNumberOfYear INTEGER
        CalendarQuarter INTEGER
        CalendarYear INTEGER
        CalendarSemester INTEGER
        FiscalQuarter INTEGER
        FiscalYear INTEGER
        FiscalSemester INTEGER
    """,
    "dimdepartmentgroup": """
        DepartmentGroupKey INTEGER
        ParentDepartmentGroupKey INTEGER
        DepartmentGroupName TEXT
    """,
    "dimemployee": """
        EmployeeKey INTEGER
        ParentEmployeeKey INTEGER
        EmployeeNationalIDAlternateKey TEXT
        ParentEmployeeNationalIDAlternateKey TEXT
        SalesTerritoryKey INTEGER
        FirstName TEXT
        LastName TEXT
        MiddleName TEXT
        NameStyle BOOLEAN
        Title TEXT
        HireDate TIMESTAMP
        BirthDate TIMESTAMP
        LoginID TEXT
        EmailAddress TEXT
        Phone TEXT
        MaritalStatus TEXT
        EmergencyContactName TEXT
        EmergencyContactPhone TEXT
        SalariedFlag BOOLEAN
        Gender TEXT
        PayFrequency INTEGER
        BaseRate DOUBLE
        VacationHours INTEGER
        SickLeaveHours INTEGER
        CurrentFlag BOOLEAN
        SalesPersonFlag BOOLEAN
        DepartmentName TEXT
        StartDate TIMESTAMP
        EndDate TIMESTAMP
        Status TEXT
        EmployeePhoto TEXT
    """,
    "dimgeography": """
        GeographyKey INTEGER
        City TEXT
        StateProvinceCode TEXT
        StateProvinceName TEXT
        CountryRegionCode TEXT
        EnglishCountryRegionName TEXT
        SpanishCountryRegionName TEXT
        FrenchCountryRegionName TEXT
        PostalCode TEXT
        SalesTerritoryKey INTEGER
        IpAddressLocator TEXT
    """,
    "dimorganization": """
        OrganizationKey INTEGER
        ParentOrganizationKey INTEGER
        PercentageOfOwnership TEXT
        OrganizationName TEXT
        CurrencyKey INTEGER
    """,
    "dimproduct": """
        ProductKey INTEGER
        ProductAlternateKey TEXT
        ProductSubcategoryKey INTEGER
        WeightUnitMeasureCode TEXT
        SizeUnitMeasureCode TEXT
        EnglishProductName TEXT
        SpanishProductName TEXT
        FrenchProductName TEXT
        StandardCost DOUBLE
        FinishedGoodsFlag BOOLEAN
        Color TEXT
        SafetyStockLevel INTEGER
        ReorderPoint INTEGER
        ListPrice DOUBLE
        Size TEXT
        SizeRange TEXT
        Weight DOUBLE
        DaysToManufacture INTEGER
        ProductLine TEXT
        DealerPrice DOUBLE
        Class TEXT
        Style TEXT
        ModelName TEXT
        LargePhoto TEXT
        EnglishDescription TEXT
        FrenchDescription TEXT
        ChineseDescription TEXT
        ArabicDescription TEXT
        HebrewDescription TEXT
        ThaiDescription TEXT
        GermanDescription TEXT
        JapaneseDescription TEXT
        TurkishDescription TEXT
        StartDate TIMESTAMP
        EndDate TIMESTAMP
        Status TEXT
    """,
    "dimproductcategory": """
        ProductCategoryKey INTEGER
        ProductCategoryAlternateKey INTEGER
        EnglishProductCategoryName TEXT
        SpanishProductCategoryName TEXT
        FrenchProductCategoryName TEXT
    """,
    "dimproductsubcategory": """
        ProductSubcategoryKey INTEGER
        ProductSubcategoryAlternateKey INTEGER
        EnglishProductSubcategoryName TEXT
        SpanishProductSubcategoryName TEXT
        FrenchProductSubcategoryName TEXT
        ProductCategoryKey INTEGER
    """,
    "dimpromotion": """
        PromotionKey INTEGER
        PromotionAlternateKey INTEGER
        EnglishPromotionName TEXT
        SpanishPromotionName TEXT
        FrenchPromotionName TEXT
        DiscountPct DOUBLE
        EnglishPromotionType TEXT
        SpanishPromotionType TEXT
        FrenchPromotionType TEXT
        EnglishPromotionCategory TEXT
        SpanishPromotionCategory TEXT
        FrenchPromotionCategory TEXT
        StartDate TIMESTAMP
        EndDate TIMESTAMP
        MinQty INTEGER
        MaxQty INTEGER
    """,
    "dimreseller": """
        ResellerKey INTEGER
        GeographyKey INTEGER
        ResellerAlternateKey TEXT
        Phone TEXT
        BusinessType TEXT
        ResellerName TEXT
        NumberEmployees INTEGER
        OrderFrequency TEXT
        OrderMonth INTEGER
        FirstOrderYear INTEGER
        LastOrderYear INTEGER
        ProductLine TEXT
        AddressLine1 TEXT
        AddressLine2 TEXT
        AnnualSales DOUBLE
        BankName TEXT
        MinPaymentType INTEGER
        MinPaymentAmount DOUBLE
        AnnualRevenue DOUBLE
        YearOpened INTEGER
    """,
    "dimsalesreason": """
        SalesReasonKey INTEGER
        SalesReasonAlternateKey INTEGER
        SalesReasonName TEXT
        SalesReasonReasonType TEXT
    """,
    "dimsalesterritory": """
        SalesTerritoryKey INTEGER
        SalesTerritoryAlternateKey INTEGER
        SalesTerritoryRegion TEXT
        SalesTerritoryCountry TEXT
        SalesTerritoryGroup TEXT
        SalesTerritoryImage TEXT
    """,
    "dimscenario": """
        ScenarioKey INTEGER
        ScenarioName TEXT
    """,
    "factadditionalinternationalproductdescription": """
        ProductKey INTEGER
        CultureName TEXT
        ProductDescription TEXT
    """,
    "factcallcenter": """
        FactCallCenterID INTEGER
        DateKey INTEGER
        WageType TEXT
        Shift TEXT
        LevelOneOperators INTEGER
        LevelTwoOperators INTEGER
        TotalOperators INTEGER
        Calls INTEGER
        AutomaticResponses INTEGER
        Orders INTEGER
        IssuesRaised INTEGER
        AverageTimePerIssue INTEGER
        ServiceGrade DOUBLE
        Date TIMESTAMP
    """,
    "factcurrencyrate": """
        CurrencyKey INTEGER
        DateKey INTEGER
        AverageRate DOUBLE
        EndOfDayRate DOUBLE
        Date TIMESTAMP
    """,
    "factfinance": """
        FinanceKey INTEGER
        DateKey INTEGER
        OrganizationKey INTEGER
        DepartmentGroupKey INTEGER
        ScenarioKey INTEGER
        AccountKey INTEGER
        Amount DOUBLE
        Date TIMESTAMP
    """,
    "factinternetsales": """
        ProductKey INTEGER
        OrderDateKey INTEGER
        DueDateKey INTEGER
        ShipDateKey INTEGER
        CustomerKey INTEGER
        PromotionKey INTEGER
        CurrencyKey INTEGER
        SalesTerritoryKey INTEGER
        SalesOrderNumber TEXT
        SalesOrderLineNumber INTEGER
        RevisionNumber INTEGER
        OrderQuantity INTEGER
        UnitPrice DOUBLE
        ExtendedAmount DOUBLE
        UnitPriceDiscountPct DOUBLE
        DiscountAmount DOUBLE
        ProductStandardCost DOUBLE
        TotalProductCost DOUBLE
        SalesAmount DOUBLE
        TaxAmt DOUBLE
        Freight DOUBLE
        CarrierTrackingNumber TEXT
        CustomerPONumber TEXT
        OrderDate TIMESTAMP
        DueDate TIMESTAMP
        ShipDate TIMESTAMP
    """,
    "factinternetsalesreason": """
        SalesOrderNumber TEXT
        SalesOrderLineNumber INTEGER
        SalesReasonKey INTEGER
    """,
    "factproductinventory": """
        ProductKey INTEGER
        DateKey INTEGER
        MovementDate TIMESTAMP
        UnitCost DOUBLE
        UnitsIn INTEGER
        UnitsOut INTEGER
        UnitsBalance INTEGER
    """,
    "factresellersales": """
        ProductKey INTEGER
        OrderDateKey INTEGER
        DueDateKey INTEGER
        ShipDateKey INTEGER
        ResellerKey INTEGER
        EmployeeKey INTEGER
        PromotionKey INTEGER
        CurrencyKey INTEGER
        SalesTerritoryKey INTEGER
        SalesOrderNumber TEXT
        SalesOrderLineNumber INTEGER
        RevisionNumber INTEGER
        OrderQuantity INTEGER
        UnitPrice DOUBLE
        ExtendedAmount DOUBLE
        UnitPriceDiscountPct DOUBLE
        DiscountAmount DOUBLE
        ProductStandardCost DOUBLE
        TotalProductCost DOUBLE
        SalesAmount DOUBLE
        TaxAmt DOUBLE
        Freight DOUBLE
        CarrierTrackingNumber TEXT
        CustomerPONumber TEXT
        OrderDate TIMESTAMP
        DueDate TIMESTAMP
        ShipDate TIMESTAMP
    """,
    "factsalesquota": """
        SalesQuotaKey INTEGER
        EmployeeKey INTEGER
        DateKey INTEGER
        CalendarYear INTEGER
        CalendarQuarter INTEGER
        SalesAmountQuota DOUBLE
        Date TIMESTAMP
    """,
    "factsurveyresponse": """
        SurveyResponseKey INTEGER
        DateKey INTEGER
        CustomerKey INTEGER
        ProductCategoryKey INTEGER
        EnglishProductCategoryName TEXT
        ProductSubcategoryKey INTEGER
        EnglishProductSubcategoryName TEXT
        Date TIMESTAMP
    """,
    "newfactcurrencyrate": """
        AverageRate DOUBLE
        CurrencyID TEXT
        CurrencyDate TIMESTAMP
        EndOfDayRate DOUBLE
        CurrencyKey INTEGER
        DateKey INTEGER
    """,
    "prospectivebuyer": """
        ProspectiveBuyerKey INTEGER
        ProspectAlternateKey TEXT
        FirstName TEXT
        MiddleName TEXT
        LastName TEXT
        BirthDate TIMESTAMP
        MaritalStatus TEXT
        Gender TEXT
        EmailAddress TEXT
        YearlyIncome DOUBLE
        TotalChildren INTEGER
        NumberChildrenAtHome INTEGER
        Education TEXT
        Occupation TEXT
        HouseOwnerFlag TEXT
        NumberCarsOwned INTEGER
        AddressLine1 TEXT
        AddressLine2 TEXT
        City TEXT
        StateProvinceCode TEXT
        PostalCode TEXT
        Phone TEXT
        Salutation TEXT
        Unknown INTEGER
    """,
    "sysdiagrams": """
        name TEXT
        principal_id INTEGER
        diagram_id INTEGER
        version INTEGER
        definition TEXT
    """,
}

# Public schema registry: table name -> {"columns": [...], "types": {...}}.
flattened_tables = {
    table: _parse_table_spec(spec) for table, spec in _TABLE_SPECS.items()
}


def _decode_csv_bytes(raw):
    """Decode the raw bytes of a CSV export into text.

    A UTF-8 BOM at the start selects UTF-8 first; otherwise UTF-16LE is tried
    first and UTF-8 second (the original order). A leading U+FEFF left over
    from decoding is removed so it cannot leak into the first field.

    Args:
        raw: Complete file content as bytes.

    Returns:
        The decoded text without a leading BOM, or ``None`` when neither
        encoding can decode the data.
    """
    if raw.startswith(b"\xef\xbb\xbf"):
        candidates = ("utf-8-sig", "utf-16le")
    else:
        candidates = ("utf-16le", "utf-8-sig")

    for encoding in candidates:
        try:
            text = raw.decode(encoding)
        except UnicodeDecodeError:
            continue
        # "utf-16le" does not consume the BOM; drop it if it is still there.
        return text[1:] if text.startswith("\ufeff") else text
    return None


# Snapshot the directory listing up front: the code below deletes and creates
# files in DATA_DIR, and must not depend on the order the glob yields them.
_csv_paths = sorted(DATA_DIR.glob("*.csv"))

# Pass 1: remove outputs of a previous run so every clean file is regenerated.
for file in _csv_paths:
    if file.name.endswith(".clean.csv"):
        file.unlink()

# Pass 2: re-encode every source CSV as UTF-8 next to the original.
for file in _csv_paths:
    if file.name.endswith(".clean.csv"):
        continue

    text = _decode_csv_bytes(file.read_bytes())
    if text is None:
        print(f"⚠️ Could not decode {file.name}")
        continue

    # Write clean UTF-8; newline="" keeps the original line endings untouched
    # instead of letting text mode translate them.
    clean_file = file.with_suffix(".clean.csv")
    with open(clean_file, "w", encoding="utf-8", newline="") as f:
        f.write(text)

    print(f"✅ Converted {file.name} → {clean_file.name}")

# Persistent, file-backed DuckDB database in the current working directory.
DB_PATH = Path("adventureworks.duckdb")
con = duckdb.connect(DB_PATH.as_posix())

# Every *.csv in the data directory except those whose name starts with "sys";
# the trailing expression displays their file names as the cell result.
csv_files = [
    path for path in DATA_DIR.glob("*.csv") if not path.name.startswith("sys")
]
[path.name for path in csv_files]

✅ Converted DimCustomer.csv → DimCustomer.clean.csv
✅ Converted DimOrganization.csv → DimOrganization.clean.csv
✅ Converted DimPromotion.csv → DimPromotion.clean.csv
✅ Converted DimSalesTerritory.csv → DimSalesTerritory.clean.csv
✅ Converted FactCallCenter.csv → FactCallCenter.clean.csv
✅ Converted DimDate.csv → DimDate.clean.csv
✅ Converted DimSalesReason.csv → DimSalesReason.clean.csv
✅ Converted DimScenario.csv → DimScenario.clean.csv
✅ Converted NewFactCurrencyRate.csv → NewFactCurrencyRate.clean.csv
✅ Converted FactResellerSales.csv → FactResellerSales.clean.csv
✅ Converted FactAdditionalInternationalProductDescription.csv → FactAdditionalInternationalProductDescription.clean.csv
✅ Converted FactInternetSales.csv → FactInternetSales.clean.csv
✅ Converted ProspectiveBuyer.csv → ProspectiveBuyer.clean.csv
✅ Converted DatabaseLog.csv → DatabaseLog.clean.csv
✅ Converted FactFinance.csv → FactFinance.clean.csv
✅ Converted DimGeography.csv → DimGeography.clean.csv
✅ Converted FactCurren

['FactSurveyResponse.clean.csv',
 'DimCustomer.csv',
 'FactCallCenter.clean.csv',
 'DimOrganization.csv',
 'DimPromotion.csv',
 'DimSalesTerritory.csv',
 'FactCallCenter.csv',
 'DimDate.csv',
 'FactProductInventory.clean.csv',
 'DimProduct.clean.csv',
 'DimSalesReason.clean.csv',
 'ProspectiveBuyer.clean.csv',
 'FactInternetSalesReason.clean.csv',
 'DimSalesReason.csv',
 'FactAdditionalInternationalProductDescription.clean.csv',
 'NewFactCurrencyRate.clean.csv',
 'DimAccount.clean.csv',
 'DimProductSubcategory.clean.csv',
 'DimDepartmentGroup.clean.csv',
 'FactSalesQuota.clean.csv',
 'DimScenario.csv',
 'NewFactCurrencyRate.csv',
 'FactResellerSales.csv',
 'DimPromotion.clean.csv',
 'FactAdditionalInternationalProductDescription.csv',
 'DimOrganization.clean.csv',
 'DimDate.clean.csv',
 'FactInternetSales.clean.csv',
 'FactFinance.clean.csv',
 'DimCurrency.clean.csv',
 'FactInternetSales.csv',
 'ProspectiveBuyer.csv',
 'DatabaseLog.csv',
 'FactFinance.csv',
 'DimGeography.csv',
 'FactC

In [4]:
# Display the name of every entry in DATA_DIR that contains a dot.
[entry.name for entry in DATA_DIR.glob("*.*")]

['FactSurveyResponse.clean.csv',
 'DimCustomer.csv',
 'FactCallCenter.clean.csv',
 'DimOrganization.csv',
 'DimPromotion.csv',
 'DimSalesTerritory.csv',
 'FactCallCenter.csv',
 'DimDate.csv',
 'FactProductInventory.clean.csv',
 'DimProduct.clean.csv',
 'DimSalesReason.clean.csv',
 'ProspectiveBuyer.clean.csv',
 'FactInternetSalesReason.clean.csv',
 'DimSalesReason.csv',
 'FactAdditionalInternationalProductDescription.clean.csv',
 'NewFactCurrencyRate.clean.csv',
 'DimAccount.clean.csv',
 'DimProductSubcategory.clean.csv',
 'DimDepartmentGroup.clean.csv',
 'FactSalesQuota.clean.csv',
 'DimScenario.csv',
 'NewFactCurrencyRate.csv',
 'FactResellerSales.csv',
 'DimPromotion.clean.csv',
 'FactAdditionalInternationalProductDescription.csv',
 'DimOrganization.clean.csv',
 'DimDate.clean.csv',
 'FactInternetSales.clean.csv',
 'FactFinance.clean.csv',
 'DimCurrency.clean.csv',
 'FactInternetSales.csv',
 'ProspectiveBuyer.csv',
 'DatabaseLog.csv',
 'sysdiagrams.clean.csv',
 'FactFinance.csv',
 '

In [3]:
# Print the names of all tables that have a schema entry, then display the
# "dimaccount" entry as a sample of the {"columns": ..., "types": ...} layout.
print(flattened_tables.keys())
flattened_tables["dimaccount"]

dict_keys(['databaselog2', 'adventureworksdwbuildversion', 'dimaccount', 'dimcurrency', 'dimcustomer', 'dimdate', 'dimdepartmentgroup', 'dimemployee', 'dimgeography', 'dimorganization', 'dimproduct', 'dimproductcategory', 'dimproductsubcategory', 'dimpromotion', 'dimreseller', 'dimsalesreason', 'dimsalesterritory', 'dimscenario', 'factadditionalinternationalproductdescription', 'factcallcenter', 'factcurrencyrate', 'factfinance', 'factinternetsales', 'factinternetsalesreason', 'factproductinventory', 'factresellersales', 'factsalesquota', 'factsurveyresponse', 'newfactcurrencyrate', 'prospectivebuyer', 'sysdiagrams'])


{'columns': ['AccountKey',
  'ParentAccountKey',
  'AccountCodeAlternateKey',
  'ParentAccountCodeAlternateKey',
  'AccountDescription',
  'AccountType',
  'Operator',
  'CustomMembers',
  'ValueType',
  'CustomMemberOptions'],
 'types': {'AccountKey': 'INTEGER',
  'ParentAccountKey': 'INTEGER',
  'AccountCodeAlternateKey': 'INTEGER',
  'ParentAccountCodeAlternateKey': 'INTEGER',
  'AccountDescription': 'TEXT',
  'AccountType': 'TEXT',
  'Operator': 'TEXT',
  'CustomMembers': 'TEXT',
  'ValueType': 'TEXT',
  'CustomMemberOptions': 'TEXT'}}

In [6]:
# Load every cleaned CSV into DuckDB using its declared schema, then export
# the resulting table to Parquet.
for csv_file in DATA_DIR.glob("*.clean.csv"):
    table_name = csv_file.name.replace(".clean.csv", "").lower()

    # sysdiagrams has a schema entry but is deliberately not loaded.
    if table_name == "sysdiagrams":
        print(f"⚠️  Skipping {csv_file.name} — system table, not loaded.")
        continue

    schema = flattened_tables.get(table_name)
    if schema is None:
        print(f"⚠️  Skipping {csv_file.name} — no schema found.")
        continue

    columns = schema["columns"]
    types = schema["types"]

    # 1. (Re)create the target table from the declared column types.
    col_defs = ",\n  ".join(f'"{k}" {v}' for k, v in types.items())
    con.execute(f'CREATE OR REPLACE TABLE "{table_name}" ({col_defs})')

    # 2. Load the file and insert rows.
    col_list = ", ".join(f'"{c}"' for c in columns)
    # read_csv expects a struct of string literals: {'name': 'TYPE', ...}.
    type_list = ", ".join(f"'{k}': '{v}'" for k, v in types.items())
    # Double any single quote so the path stays a valid SQL string literal.
    csv_literal = csv_file.as_posix().replace("'", "''")

    # Columns are named on both sides of the INSERT so rows are matched by
    # name rather than by position.
    # NOTE: ignore_errors=True makes DuckDB skip rows it cannot parse, so a
    # loaded table may hold fewer rows than its CSV has lines.
    con.execute(
        f"""
        INSERT INTO "{table_name}" ({col_list})
        SELECT {col_list}
        FROM read_csv(
            '{csv_literal}',
            delim='|',
            header=False,
            quote='',
            escape='',
            ignore_errors=True,
            null_padding=True,
            columns={{ {type_list} }}
        )
    """
    )

    # 3. Export the loaded table to Parquet.
    parquet_path = PARQUET_DIR / f"{table_name}.parquet"
    parquet_literal = parquet_path.as_posix().replace("'", "''")
    con.execute(f"COPY \"{table_name}\" TO '{parquet_literal}' (FORMAT PARQUET)")

    print(f"✅ Wrote {parquet_path.name}")

print("🎯 All tables loaded. Current DuckDB tables:")
print(con.execute("SHOW TABLES").fetchdf())

✅ Wrote factsurveyresponse.parquet
✅ Wrote factcallcenter.parquet
✅ Wrote factproductinventory.parquet
✅ Wrote dimproduct.parquet
✅ Wrote dimsalesreason.parquet
✅ Wrote prospectivebuyer.parquet
✅ Wrote factinternetsalesreason.parquet
✅ Wrote factadditionalinternationalproductdescription.parquet
✅ Wrote newfactcurrencyrate.parquet
✅ Wrote dimaccount.parquet
✅ Wrote dimproductsubcategory.parquet
✅ Wrote dimdepartmentgroup.parquet
✅ Wrote factsalesquota.parquet
✅ Wrote dimpromotion.parquet
✅ Wrote dimorganization.parquet
✅ Wrote dimdate.parquet
✅ Wrote factinternetsales.parquet
✅ Wrote factfinance.parquet
✅ Wrote dimcurrency.parquet
⚠️  Skipping sysdiagrams.clean.csv — no schema found.
⚠️  Skipping DatabaseLog.clean.csv — no schema found.
✅ Wrote dimreseller.parquet
✅ Wrote factcurrencyrate.parquet
✅ Wrote dimproductcategory.parquet
✅ Wrote dimscenario.parquet
✅ Wrote dimgeography.parquet
✅ Wrote dimsalesterritory.parquet
✅ Wrote dimemployee.parquet
✅ Wrote factresellersales.parquet
✅ Wro