In [None]:
# ==============================================================
# fix_data.py  ←  SELF-HEALING SCRIPT (FINAL, CLEAN, SAFE)
# ==============================================================

import sqlite3
import pandas as pd
import logging


# Path of the SQLite database file that run_fixes() connects to.
DB = r"E:\work\DEPI\graduation promax\data\warehouse_final\ivf_star_schema.db"
# Path of the text file that receives the log records written by this script.
LOG = r"E:\work\DEPI\graduation promax\src\ETL\logs\repair_log.txt"

# Configure the root logger once, at import time: records at INFO level and
# above go to LOG, formatted as "timestamp | level | message".
logging.basicConfig(
    filename=LOG,
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)

# run_fixes() skips all repairs when this equals 1 and sets it to 1 after a
# successful repair run; any other value (such as the default -1) triggers
# the repair steps.
STATUS_FLAG = -1   # validation result passed from outside


# ==============================================================
# STEP 1 → FIX NULL VALUES (ONLY FOR REQUIRED COLUMNS)
# ==============================================================

# Columns that must never be NULL, keyed by table name. Rows violating this
# are deleted by fix_required_nulls().
REQUIRED_COLS = {
    "fact_ivf_cycle": ["case_id", "female_id", "male_id"],
    "fact_transfer": ["case_id"],
    "fact_transfer_embryo": ["transfer_sk"]   # embryo_id allowed NULL
}

def fix_required_nulls(conn: sqlite3.Connection) -> None:
    """Delete every row that has NULL in one of its table's required columns.

    The tables and columns checked come from ``REQUIRED_COLS``. Rows are
    removed with an in-place ``DELETE`` so the table definition (primary key,
    NOT NULL constraints, declared column types) and its indexes survive;
    rewriting the table through ``DataFrame.to_sql(if_exists="replace")``
    would drop and recreate it without any of them.

    Columns are processed in order, so the count reported for a column only
    includes rows that were not already removed for an earlier column.

    Args:
        conn: Open ``sqlite3`` connection to the database to repair.

    Raises:
        sqlite3.OperationalError: If a listed table or column does not exist.
    """
    print("\nüõ† Fixing NULL in required columns...")

    for table, cols in REQUIRED_COLS.items():
        # Quote identifiers so unusual names cannot break the statement.
        quoted_table = '"' + table.replace('"', '""') + '"'
        changed = False

        for col in cols:
            quoted_col = '"' + col.replace('"', '""') + '"'
            null_count = conn.execute(
                f"SELECT COUNT(*) FROM {quoted_table} WHERE {quoted_col} IS NULL"
            ).fetchone()[0]

            if null_count > 0:
                changed = True
                logging.warning(f"{table}: fixing {null_count} NULL in {col}")
                print(f"   - {table}.{col}: fixing {null_count} NULL ‚Üí removed rows")

                # safest fix: remove rows with a missing required key
                conn.execute(
                    f"DELETE FROM {quoted_table} WHERE {quoted_col} IS NULL"
                )

        if changed:
            conn.commit()


# ==============================================================
# STEP 2 → REMOVE DUPLICATES
# ==============================================================

def fix_duplicates(conn: sqlite3.Connection) -> None:
    """Delete fully duplicated rows from every user table, keeping the first.

    Two rows are duplicates when all of their columns compare equal under
    SQLite's ``GROUP BY`` rules (NULLs group together; a column's declared
    collation applies). Of each group, the row with the lowest rowid is kept.

    Rows are removed with an in-place ``DELETE`` so each table keeps its
    definition and indexes; rewriting a table through
    ``DataFrame.to_sql(if_exists="replace")`` would drop and recreate it
    without constraints or indexes.

    Args:
        conn: Open ``sqlite3`` connection to the database to repair.
    """
    print("\nüõ† Fixing duplicate rows...")
    # Fetch the whole table list up front so no cursor over sqlite_master is
    # still open while the tables are being modified.
    tables = conn.execute(
        "SELECT name, sql FROM sqlite_master WHERE type='table'"
    ).fetchall()

    for tbl, create_sql in tables:
        if tbl.startswith("sqlite_"):
            continue
        # WITHOUT ROWID tables have no rowid to address and always carry a
        # PRIMARY KEY, so they cannot contain two identical rows.
        if create_sql and "WITHOUT ROWID" in " ".join(create_sql.upper().split()):
            continue

        # Quote identifiers so unusual names cannot break the statement.
        quoted_tbl = '"' + tbl.replace('"', '""') + '"'
        columns = [
            row[1] for row in conn.execute(f"PRAGMA table_info({quoted_tbl})")
        ]
        if not columns:
            continue
        group_cols = ", ".join(
            '"' + name.replace('"', '""') + '"' for name in columns
        )

        dup = conn.execute(
            f"DELETE FROM {quoted_tbl} WHERE rowid NOT IN "
            f"(SELECT MIN(rowid) FROM {quoted_tbl} GROUP BY {group_cols})"
        ).rowcount
        # Commit even when nothing was deleted so the implicit transaction
        # opened for the DELETE does not stay open.
        conn.commit()

        if dup > 0:
            print(f"   - {tbl}: removed {dup} duplicates")
            logging.warning(f"{tbl}: removed {dup} duplicates")


# ==============================================================
# STEP 3 → FIX MISSING TIME FK
# ==============================================================

def fix_missing_time_fk(conn: sqlite3.Connection) -> None:
    """Set time foreign keys to NULL when they match no ``dim_time`` row.

    Updates ``fact_transfer.transfer_time_fk`` and
    ``fact_ivf_cycle.cycle_start_time_id`` wherever the stored value is not
    found in ``dim_time.time_id``. Each UPDATE is committed on its own.

    Args:
        conn: Open ``sqlite3`` connection to the database to repair.
    """
    print("\nüõ† Fixing missing time foreign keys...")

    # NULL time_id values are excluded from both lookups below: if the list
    # contains a NULL, ``x NOT IN (...)`` evaluates to NULL rather than TRUE
    # for every unmatched x, and the UPDATE would silently repair nothing.

    # -------------------- fact_transfer --------------------
    conn.execute("""
        UPDATE fact_transfer
        SET transfer_time_fk = NULL
        WHERE transfer_time_fk IS NOT NULL
          AND transfer_time_fk NOT IN (
              SELECT time_id FROM dim_time WHERE time_id IS NOT NULL
          )
    """)
    conn.commit()

    # -------------------- fact_ivf_cycle --------------------
    conn.execute("""
        UPDATE fact_ivf_cycle
        SET cycle_start_time_id = NULL
        WHERE cycle_start_time_id IS NOT NULL
          AND cycle_start_time_id NOT IN (
              SELECT time_id FROM dim_time WHERE time_id IS NOT NULL
          )
    """)
    conn.commit()

    print("   ‚úî Missing time FK fixed")


# ==============================================================
# STEP 4 → FIX MISSING DOCTOR FK
# ==============================================================

def fix_missing_doctor_fk(conn: sqlite3.Connection) -> None:
    """Set doctor foreign keys to NULL when they match no ``dim_doctor`` row.

    Updates ``doctor_id`` in ``fact_transfer`` and ``fact_ivf_cycle`` wherever
    the stored value is not found in ``dim_doctor.doctor_id``. Each UPDATE is
    committed on its own.

    Args:
        conn: Open ``sqlite3`` connection to the database to repair.
    """
    print("\nüõ† Fixing missing doctor foreign keys...")

    # Any doctor_id that is not present in dim_doctor is set to NULL.
    # NULL doctor_id values are excluded from the lookup: if the list
    # contains a NULL, ``x NOT IN (...)`` evaluates to NULL rather than TRUE
    # for every unmatched x, and the UPDATE would silently repair nothing.
    conn.execute("""
        UPDATE fact_transfer
        SET doctor_id = NULL
        WHERE doctor_id IS NOT NULL
          AND doctor_id NOT IN (
              SELECT doctor_id FROM dim_doctor WHERE doctor_id IS NOT NULL
          )
    """)
    conn.commit()

    conn.execute("""
        UPDATE fact_ivf_cycle
        SET doctor_id = NULL
        WHERE doctor_id IS NOT NULL
          AND doctor_id NOT IN (
              SELECT doctor_id FROM dim_doctor WHERE doctor_id IS NOT NULL
          )
    """)
    conn.commit()

    print("   ‚úî Missing doctor FK fixed")


# ==============================================================
# STEP 5 → FIX fact_transfer_embryo
# ==============================================================

def fix_fact_transfer_embryo(conn: sqlite3.Connection) -> None:
    """Delete ``fact_transfer_embryo`` rows that reference missing parents.

    Two passes, each committed on its own:

    1. Remove rows whose ``transfer_sk`` is not found in
       ``fact_transfer.transfer_sk``.
    2. Remove rows whose non-NULL ``embryo_id`` is not found in
       ``dim_embryo.embryo_id``. Rows with a NULL ``embryo_id`` are kept.

    Args:
        conn: Open ``sqlite3`` connection to the database to repair.
    """
    print("\nüõ† Cleaning orphan records in fact_transfer_embryo...")

    # NULL keys are excluded from both lookups below: if the list contains a
    # NULL, ``x NOT IN (...)`` evaluates to NULL rather than TRUE for every
    # unmatched x, and the DELETE would silently remove nothing.

    # Delete rows whose transfer_sk does not exist in fact_transfer.
    conn.execute("""
        DELETE FROM fact_transfer_embryo
        WHERE transfer_sk NOT IN (
            SELECT transfer_sk FROM fact_transfer WHERE transfer_sk IS NOT NULL
        )
    """)
    conn.commit()

    # Delete rows whose embryo_id does not exist in dim_embryo
    # (relevant when real embryo IDs are in use).
    conn.execute("""
        DELETE FROM fact_transfer_embryo
        WHERE embryo_id IS NOT NULL
          AND embryo_id NOT IN (
              SELECT embryo_id FROM dim_embryo WHERE embryo_id IS NOT NULL
          )
    """)
    conn.commit()

    print("   ‚úî fact_transfer_embryo cleaned")


# ==============================================================
# MAIN FIX CONTROLLER
# ==============================================================

def run_fixes() -> int:
    """Run every repair step against the database at ``DB``.

    Does nothing when the module-level ``STATUS_FLAG`` is already 1.
    Otherwise opens the database, runs the five repair steps in order, and
    sets ``STATUS_FLAG`` to 1 once all of them have completed.

    The connection is always closed, including when a step raises. A failure
    is written to the log and re-raised, and ``STATUS_FLAG`` is left
    unchanged.

    Returns:
        1, meaning the database is considered clean.
    """
    global STATUS_FLAG

    if STATUS_FLAG == 1:
        print("üíö Database is already CLEAN ‚Äî No fixes needed.")
        return 1

    print("\n‚ö† Problems detected ‚Äî APPLYING FIXES...\n")
    conn = sqlite3.connect(DB)
    try:
        # 1) fix required NULLs
        fix_required_nulls(conn)

        # 2) remove duplicates
        fix_duplicates(conn)

        # 3) fix invalid time fk
        fix_missing_time_fk(conn)

        # 4) fix doctor fk
        fix_missing_doctor_fk(conn)

        # 5) fix embryo table relations
        fix_fact_transfer_embryo(conn)
    except Exception:
        # Record the failure in the repair log before propagating it.
        logging.exception("Repair run failed.")
        raise
    finally:
        # Release the database file even when a step fails.
        conn.close()

    print("\n‚ú® All fixes applied successfully!")
    logging.info("All fixes applied successfully.")

    STATUS_FLAG = 1
    print("üìå FINAL STATUS = 1 (CLEAN)")
    return 1


if __name__ == "__main__":
    run_fixes()



⚠ Problems detected — APPLYING FIXES...

🛠 Fixing 1 orphan records in fact_transfer...
🛠 Updating fact_transfer_embryo based on new transfer_sk...
🛠 Setting 1 invalid time_fk to NULL in fact_transfer...
✔ Invalid time_fk set to NULL
🛠 Setting 1 invalid cycle_start_time_id to NULL in fact_ivf_cycle...
✔ Invalid cycle_start_time_id set to NULL

✨ Fixing DONE — Database is NOW CLEAN!
