In [None]:
# %% [markdown]
# # Adventure Works ETL
#
# **Key Requirements**:
# 1. DimProduct: fill missing "Color" with "NA".
# 2. DimSalesTerritory: drop entire row if any column is missing.
# 3. FactInternetSales & FactResellerSales: drop entire row if any column is missing.
# 4. Remove "CarrierTrackingNumber" from FactInternetSales.
# 5. Preserve data types by specifying dtype when reading Excel, parse date columns in fact tables.
# 6. Validate foreign keys in fact tables; drop only the rows whose key has no match in the dimension table.
# 7. Load final tables into MySQL.

# %% [code]
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# -------------------------------
# 1. MySQL connection parameters
# -------------------------------
# Every setting can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook. The fallbacks are the
# original local-development values, so an unconfigured run behaves as before.
username = os.environ.get('MYSQL_USER', 'root')
password = os.environ.get('MYSQL_PASSWORD', '12345')
host = os.environ.get('MYSQL_HOST', 'localhost')
port = os.environ.get('MYSQL_PORT', '3306')
database = os.environ.get('MYSQL_DATABASE', 'case7')

# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise be read as URL delimiters.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{database}'
)

# -------------------------------
# 2. Helper functions
# -------------------------------
def check_uniqueness(df, key_column, table_name):
    """
    Ensures the primary key column is unique.
    Drops duplicates based on the PK and prints how many were dropped.
    """
    before = df.shape[0]
    df_clean = df.drop_duplicates(subset=[key_column])
    after = df_clean.shape[0]
    dropped = before - after
    if dropped > 0:
        print(f"{dropped} duplicate row(s) dropped from {table_name} based on primary key '{key_column}'.")
    else:
        print(f"All rows in {table_name} have a unique '{key_column}'.")
    return df_clean

def validate_fk(fact_df, dim_df, fact_fk, dim_pk, fact_table_name, dim_table_name):
    """
    Keep only the fact rows whose `fact_fk` value occurs in `dim_df[dim_pk]`.

    Prints how many rows were rejected (or that every row was valid), using
    the two table names in the message. Returns the filtered selection of
    `fact_df`; `fact_df` itself is not modified.
    """
    known_keys = set(dim_df[dim_pk].unique())
    has_match = fact_df[fact_fk].isin(known_keys)
    matched_rows = fact_df[has_match]

    dropped = len(fact_df) - len(matched_rows)
    if dropped == 0:
        print(f"All rows in {fact_table_name} have a valid foreign key '{fact_fk}'.")
    else:
        print(f"{dropped} row(s) dropped from {fact_table_name} due to invalid '{fact_fk}' not found in {dim_table_name}.")
    return matched_rows

# -------------------------------
# 3. Read Dimension Tables
# -------------------------------
print("Loading dimension tables...")

# All six dimensions live in one workbook. Open it once and parse each sheet
# from the shared handle instead of re-opening the file for every sheet.
# Explicit dtypes keep keys as nullable integers ('Int64') and text as pandas
# 'string'; date-like dimension columns are read as strings here.
with pd.ExcelFile('DimTables.xlsx') as dim_workbook:
    dim_customer_df = pd.read_excel(
        dim_workbook,
        sheet_name='DimCustomer',
        dtype={
            'CustomerKey': 'Int64',
            'GeographyKey': 'Int64',
            'CustomerName': 'string',
            'BirthDate': 'string',
            'MaritalStatus': 'string',
            'Gender': 'string',
            'EmailAddress': 'string',
            'YearlyIncome': 'float',
            'Education': 'string',
            'Occupation': 'string',
            'HouseOwnerFlag': 'string',
            'Address': 'string',
            'FirstPurchaseDate': 'string'
        }
    )

    dim_employee_df = pd.read_excel(
        dim_workbook,
        sheet_name='DimEmployee',
        dtype={
            'EmployeeKey': 'Int64',
            'ParentEmployeeKey': 'Int64',
            'SalesTerritoryKey': 'Int64',
            'EmployeeName': 'string',
            'Title': 'string',
            'EmailAddress': 'string',
            'DepartmentName': 'string',
            'HireDate': 'string',
            'BirthDate': 'string'
        }
    )

    dim_geography_df = pd.read_excel(
        dim_workbook,
        sheet_name='DimGeography',
        dtype={
            'GeographyKey': 'Int64',
            'City': 'string',
            'State': 'string',
            'Country': 'string',
            'PostalCode': 'string',
            'SalesTerritoryKey': 'Int64'
        }
    )

    dim_product_df = pd.read_excel(
        dim_workbook,
        sheet_name='DimProduct',
        dtype={
            'ProductKey': 'Int64',
            'ProductSubcategoryKey': 'Int64',
            'Product': 'string',
            'Color': 'string',
            'Model': 'string',
            'Subcategory': 'string',
            'Category': 'string'
        }
    )

    dim_reseller_df = pd.read_excel(
        dim_workbook,
        sheet_name='DimReseller',
        dtype={
            'ResellerKey': 'Int64',
            'GeographyKey': 'Int64',
            'BusinessType': 'string',
            'ResellerName': 'string'
        }
    )

    dim_salesterritory_df = pd.read_excel(
        dim_workbook,
        sheet_name='DimSalesTerritory',
        dtype={
            'SalesTerritoryKey': 'Int64',
            'SalesTerritoryRegion': 'string',
            'SalesTerritoryCountry': 'string',
            'SalesTerritoryGroup': 'string'
        }
    )

# -------------------------------
# 4. Read Fact Tables
# -------------------------------
print("Loading fact tables...")

# Column types shared by both fact workbooks, declared once and extended per
# table below with that table's own key columns.
fact_shared_dtypes = {
    'ProductKey': 'Int64',
    'SalesTerritoryKey': 'Int64',
    'SalesOrderNumber': 'string',
    'SalesOrderLineNumber': 'Int64',
    'DiscountAmount': 'float',
    'TotalProductCost': 'float',
    'SalesAmount': 'float',
    'Freight': 'float',
    'CarrierTrackingNumber': 'string',
}
# The three date columns are parsed to datetimes in both fact tables.
fact_date_columns = ('OrderDate', 'DueDate', 'ShipDate')

# Internet sales: keyed by customer. 'CarrierTrackingNumber' is read here and
# removed from this table in the next step.
fact_internet_sales_df = pd.read_excel(
    'FactInternetSales.xlsx',
    dtype={**fact_shared_dtypes, 'CustomerKey': 'Int64'},
    parse_dates=list(fact_date_columns),
)

# Reseller sales: keyed by reseller and employee.
fact_reseller_sales_df = pd.read_excel(
    'FactResellerSales.xlsx',
    dtype={**fact_shared_dtypes, 'ResellerKey': 'Int64', 'EmployeeKey': 'Int64'},
    parse_dates=list(fact_date_columns),
)

# -------------------------------
# 5. Remove 'CarrierTrackingNumber' from FactInternetSales
# -------------------------------
# The membership check makes this step safe to re-run once the column is gone.
if 'CarrierTrackingNumber' in fact_internet_sales_df.columns:
    fact_internet_sales_df = fact_internet_sales_df.drop(columns=['CarrierTrackingNumber'])
    print("Removed 'CarrierTrackingNumber' column from FactInternetSales.")

# -------------------------------
# 6. Check Primary Key Uniqueness
# -------------------------------
print("\n-- Checking PK uniqueness in dimension tables --")

# Each dimension is de-duplicated on its primary key, in the order listed, and
# the cleaned frame is bound back to the same name.
(
    dim_customer_df,
    dim_employee_df,
    dim_geography_df,
    dim_product_df,
    dim_reseller_df,
    dim_salesterritory_df,
) = [
    check_uniqueness(frame, primary_key, table_label)
    for frame, primary_key, table_label in [
        (dim_customer_df, 'CustomerKey', 'DimCustomer'),
        (dim_employee_df, 'EmployeeKey', 'DimEmployee'),
        (dim_geography_df, 'GeographyKey', 'DimGeography'),
        (dim_product_df, 'ProductKey', 'DimProduct'),
        (dim_reseller_df, 'ResellerKey', 'DimReseller'),
        (dim_salesterritory_df, 'SalesTerritoryKey', 'DimSalesTerritory'),
    ]
]

# -------------------------------
# 7. Handle Missing Data
# -------------------------------
#
# Requirements:
# - DimProduct: fill missing "Color" with "NA"
# - DimSalesTerritory: drop entire row if there's any missing data
# - FactInternetSales & FactResellerSales: drop entire row if there's any missing data

def drop_incomplete_rows(df, table_name):
    """
    Return `df` without the rows that contain at least one missing value.

    Prints how many rows were removed, naming `table_name`; prints nothing
    when no row was removed. `df` itself is not modified.
    """
    complete_rows = df.dropna()
    dropped = len(df) - len(complete_rows)
    if dropped > 0:
        print(f"Dropped {dropped} row(s) from {table_name} due to missing data.")
    return complete_rows

# 7a. DimProduct -> fill missing color with "NA"
if 'Color' in dim_product_df.columns:
    missing_color = dim_product_df['Color'].isnull().sum()
    if missing_color > 0:
        print(f"Filling {missing_color} missing 'Color' cells in DimProduct with 'NA'.")
        dim_product_df['Color'] = dim_product_df['Color'].fillna('NA')

# 7b-7d. DimSalesTerritory and both fact tables -> drop any row with missing data
dim_salesterritory_df = drop_incomplete_rows(dim_salesterritory_df, 'DimSalesTerritory')
fact_internet_sales_df = drop_incomplete_rows(fact_internet_sales_df, 'FactInternetSales')
fact_reseller_sales_df = drop_incomplete_rows(fact_reseller_sales_df, 'FactResellerSales')

# -------------------------------
# 8. Validate Foreign Keys
# -------------------------------
# Each fact table is filtered against its dimensions one after another, so a
# row survives only if every one of its foreign keys has a match. The key
# column carries the same name in the fact table and in the dimension.
print("\n-- Validating foreign keys in FactInternetSales --")
for dim_frame, key_name, dim_label in (
    (dim_customer_df, 'CustomerKey', 'DimCustomer'),
    (dim_product_df, 'ProductKey', 'DimProduct'),
    (dim_salesterritory_df, 'SalesTerritoryKey', 'DimSalesTerritory'),
):
    fact_internet_sales_df = validate_fk(
        fact_internet_sales_df, dim_frame, key_name, key_name, 'FactInternetSales', dim_label
    )

print("\n-- Validating foreign keys in FactResellerSales --")
for dim_frame, key_name, dim_label in (
    (dim_reseller_df, 'ResellerKey', 'DimReseller'),
    (dim_employee_df, 'EmployeeKey', 'DimEmployee'),
    (dim_product_df, 'ProductKey', 'DimProduct'),
    (dim_salesterritory_df, 'SalesTerritoryKey', 'DimSalesTerritory'),
):
    fact_reseller_sales_df = validate_fk(
        fact_reseller_sales_df, dim_frame, key_name, key_name, 'FactResellerSales', dim_label
    )

# -------------------------------
# 9. Load to MySQL
# -------------------------------
print("\n-- Loading data into MySQL --")

# Dimensions are written first, then the facts. An existing table of the same
# name is replaced, and the DataFrame index is not written as a column.
for target_table, frame in (
    ('dimcustomer', dim_customer_df),
    ('dimemployee', dim_employee_df),
    ('dimgeography', dim_geography_df),
    ('dimproduct', dim_product_df),
    ('dimreseller', dim_reseller_df),
    ('dimsalesterritory', dim_salesterritory_df),
    ('fact_internetsales', fact_internet_sales_df),
    ('fact_resellersales', fact_reseller_sales_df),
):
    frame.to_sql(target_table, con=engine, if_exists='replace', index=False)

print("\nETL process completed successfully!")
print("Your star schema now has 6 dimension tables and 2 fact tables.")


Loading dimension tables...
Loading fact tables...
Removed 'CarrierTrackingNumber' column from FactInternetSales.

-- Checking PK uniqueness in dimension tables --
All rows in DimCustomer have a unique 'CustomerKey'.
All rows in DimEmployee have a unique 'EmployeeKey'.
All rows in DimGeography have a unique 'GeographyKey'.
All rows in DimProduct have a unique 'ProductKey'.
All rows in DimReseller have a unique 'ResellerKey'.
All rows in DimSalesTerritory have a unique 'SalesTerritoryKey'.
Filling 56 missing 'Color' cells in DimProduct with 'NA'.
Dropped 1 row(s) from DimSalesTerritory due to missing data.
Dropped 44 row(s) from FactResellerSales due to missing data.

-- Validating foreign keys in FactInternetSales --
All rows in FactInternetSales have a valid foreign key 'CustomerKey'.
All rows in FactInternetSales have a valid foreign key 'ProductKey'.
All rows in FactInternetSales have a valid foreign key 'SalesTerritoryKey'.

-- Validating foreign keys in FactResellerSales --
All ro

In [18]:
def debug_missing(df, table_name):
    """
    Print which columns of `df` still contain missing values.

    When no column has a null, a single confirmation line is printed.
    Otherwise the affected column names are listed, followed by one line per
    column giving its null count. Nothing is returned and `df` is not changed.
    """
    has_null = df.isnull().any()
    missing_cols = df.columns[has_null].tolist()

    if not missing_cols:
        print(f"No missing values remain in {table_name}")
        return

    print(f"Columns still missing in {table_name}: {missing_cols}")
    for col in missing_cols:
        count = df[col].isnull().sum()
        print(f"  -> {col} has {count} missing values")

# Run after the missing-data handling (section 7) to confirm that no nulls
# are left in the tables that were filled or filtered.
for checked_frame, checked_label in (
    (dim_product_df, "DimProduct"),
    (dim_salesterritory_df, "DimSalesTerritory"),
    (fact_reseller_sales_df, "FactResellerSales"),
):
    debug_missing(checked_frame, checked_label)

No missing values remain in DimProduct
No missing values remain in DimSalesTerritory
No missing values remain in FactResellerSales
