In [None]:
import os
import logging
import sqlite3
from datetime import datetime
import pandas as pd
import numpy as np

# ===========================================================
#                 CONFIGURATION
# ===========================================================
# NOTE(review): absolute, machine-specific Windows path — every path below is
# derived from it, so the notebook only runs where this folder exists.
BASE_PATH = r"E:\work\github graduation project local\Graduation-Project\graduation project final"
# SQLite file that holds the raw `ivf_patients` table read by load_raw_df().
RAW_DB = fr"{BASE_PATH}\data\raw\ivf_patients_test.db"
# SQLite file that receives the star schema (dimension and fact tables).
STAR_DB = fr"{BASE_PATH}\data\warehouse_final\ivf_star_schema.db"
# SQL script executed by run_schema_sql() on a full refresh.
SCHEMA_SQL = fr"{BASE_PATH}\src\ETL\create_star_schema.sql"
# Text file that receives the ETL log records.
LOG_FILE = fr"{BASE_PATH}\src\ETL\logs\etl_log_ivf.txt"

# Create the log and warehouse folders up front so logging and sqlite3 can
# create their files on first use.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
os.makedirs(os.path.dirname(STAR_DB), exist_ok=True)

# Root-logger configuration: INFO and above go to LOG_FILE as
# "timestamp | level | message".
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)

# ===========================================================
#                SCHEMA SQL (FULL REFRESH)
# ===========================================================
def run_schema_sql():
    """Execute the star-schema SQL script against the warehouse database.

    Reads the script at ``SCHEMA_SQL`` and runs it with ``executescript`` on a
    connection to ``STAR_DB``.

    Raises:
        OSError: if the schema file cannot be read.
        sqlite3.Error: if the script fails to execute.
    """
    # Read the script first so a missing schema file fails before a database
    # connection is opened.
    with open(SCHEMA_SQL, "r", encoding="utf-8") as f:
        script = f.read()

    conn = sqlite3.connect(STAR_DB)
    try:
        conn.executescript(script)
        conn.commit()
    finally:
        # Always release the connection, even when the script fails.
        conn.close()
    logging.info("Schema (fresh) created successfully.")


# ===========================================================
#                   RAW LOADING
# ===========================================================
def load_raw_df():
    """Load the whole ``ivf_patients`` table from ``RAW_DB``.

    Returns:
        pandas.DataFrame: one row per record of the raw table.

    Raises:
        Exception: whatever ``sqlite3`` / ``pandas.read_sql`` raises when the
            database or table cannot be read.
    """
    conn = sqlite3.connect(RAW_DB)
    try:
        df = pd.read_sql("SELECT * FROM ivf_patients", conn)
    finally:
        # Close the connection even if the query fails (e.g. table missing).
        conn.close()
    logging.info(f"Loaded {len(df)} raw rows.")
    return df


# ===========================================================
#                   CLEAN DATA
# ===========================================================
def clean_data(df):
    """Normalise column names and drop fully duplicated rows.

    Column labels are stripped, lower-cased and have spaces replaced by
    underscores. The rename is applied to the frame that was passed in; the
    de-duplicated result is returned as a new frame.

    Args:
        df: raw DataFrame.

    Returns:
        pandas.DataFrame: ``df`` without exact duplicate rows.
    """
    labels = df.columns.str.strip()
    labels = labels.str.lower()
    labels = labels.str.replace(" ", "_")
    df.columns = labels  # in-place rename on the caller's frame
    deduplicated = df.drop_duplicates()
    return deduplicated


# ===========================================================
#    CHECK REQUIRED COLUMNS (ONLY 3)
# ===========================================================
def check_required(df):
    """Verify that the three mandatory key columns are present.

    Args:
        df: DataFrame whose columns are inspected.

    Raises:
        ValueError: if any of ``case_id``, ``female_id`` or ``male_id`` is not
            a column of ``df``; the message lists the missing names.
    """
    missing = []
    for column in ("case_id", "female_id", "male_id"):
        if column not in df.columns:
            missing.append(column)
    if not missing:
        return
    raise ValueError(f"Missing required columns: {missing}")


# ===========================================================
#   SET NON-CRITICAL IDs TO NULL
# ===========================================================
def handle_ids(df):
    """Ensure optional ID/attribute columns exist and derive ``transfer_time_id``.

    Missing ``protocol_id``, ``outcome_id``, ``embryo_id``, ``fresh_et_stage``
    and ``grading`` columns are added filled with ``None``.
    ``transfer_time_id`` is the ``et_date`` value formatted as ``YYYY-MM-DD``;
    unparseable or missing dates become missing values.

    The frame is modified in place and also returned.

    Args:
        df: cleaned DataFrame.

    Returns:
        pandas.DataFrame: the same frame with the extra columns.
    """
    for col in ("protocol_id", "outcome_id", "embryo_id", "fresh_et_stage", "grading"):
        if col not in df.columns:
            df[col] = None

    if "et_date" in df.columns:
        parsed = pd.to_datetime(df["et_date"], errors="coerce")
        df["transfer_time_id"] = parsed.dt.strftime("%Y-%m-%d")
    else:
        # Without the column pd.to_datetime() receives None and returns a scalar,
        # on which `.dt` raises AttributeError; fall back to an all-missing column.
        df["transfer_time_id"] = None

    return df


# ===========================================================
#     MAP DOCTOR NAME → AUTO INCREMENT doctor_id
# ===========================================================
def map_doctor_ids(df, conn):
    """Attach a ``doctor_id`` column by looking names up in ``dim_doctor``.

    Names that are not yet in ``dim_doctor`` are inserted and the generated row
    id is used. Rows with a missing name get a missing ``doctor_id``. When the
    frame has no ``doctor_name`` column it is created with the value
    ``"Unknown"``. The inserts are not committed here.

    Args:
        df: DataFrame to annotate (modified in place).
        conn: open sqlite3 connection to the warehouse.

    Returns:
        pandas.DataFrame: ``df`` with ``doctor_id`` added.
    """
    if "doctor_name" not in df.columns:
        df["doctor_name"] = "Unknown"

    # Doctors already stored in the dimension.
    known = pd.read_sql("SELECT doctor_id, doctor_name FROM dim_doctor;", conn)
    name_to_id = dict(zip(known["doctor_name"], known["doctor_id"]))

    # Register each unseen name and remember the key SQLite assigned to it.
    for name in df["doctor_name"].dropna().unique():
        if name in name_to_id:
            continue
        inserted = conn.execute(
            "INSERT INTO dim_doctor (doctor_name) VALUES (?)",
            (name,)
        )
        name_to_id[name] = inserted.lastrowid

    df["doctor_id"] = df["doctor_name"].map(name_to_id)
    return df


# ===========================================================
#   SAFE INSERT → NO DUPLICATION
# ===========================================================
def insert_or_ignore(table, df_subset, conn):
    """Bulk ``INSERT OR IGNORE`` every row of ``df_subset`` into ``table``.

    The frame's column names are used as the target column list, and the
    transaction is committed afterwards.

    Args:
        table: name of the destination table.
        df_subset: DataFrame whose columns match columns of ``table``.
        conn: open sqlite3 connection.
    """
    column_names = list(df_subset.columns)
    column_sql = ",".join(column_names)
    marks = ",".join(["?"] * len(column_names))
    statement = f"INSERT OR IGNORE INTO {table} ({column_sql}) VALUES ({marks})"
    rows = df_subset.values.tolist()
    conn.executemany(statement, rows)
    conn.commit()


# ===========================================================
#       DIMENSIONS LOADING
# ===========================================================
def load_dimensions(df, conn, refresh=True):
    """Write the five patient/protocol/outcome/embryo dimension tables.

    For each dimension the listed columns are selected from ``df`` and
    de-duplicated. A column that ``df`` does not have is written as all-NULL
    instead of raising ``KeyError``.

    Args:
        df: prepared DataFrame (not modified).
        conn: open sqlite3 connection to the warehouse.
        refresh: when True each table is dropped and recreated from the data
            via ``to_sql(if_exists="replace")``; when False rows are added with
            ``insert_or_ignore``.
    """
    dim_tables = {
        "dim_female":  ["female_id","female_age","female_bmi","amh_level","fsh_level","afc"],
        "dim_male":    ["male_id","male_age","male_factor","semen_count_mill_per_ml",
                        "motility_percent","morphology_percent"],
        "dim_protocol":["protocol_id","protocol_type","stimulation_days",
                        "total_fsh_dose","trigger_type","recommended_protocol"],
        "dim_outcome": ["outcome_id","risk_level","response_type",
                        "suggested_waiting_period_days","failure_reason"],
        "dim_embryo":  ["embryo_id","fresh_et_stage","grading","class_a_rate"]
    }

    for table, cols in dim_tables.items():
        # reindex() keeps existing columns untouched and creates any absent
        # ones filled with missing values.
        subset = df.reindex(columns=cols).drop_duplicates()

        if refresh:
            subset.to_sql(table, conn, if_exists="replace", index=False)
        else:
            insert_or_ignore(table, subset, conn)

        logging.info(f"{table}: {len(subset)} processed.")


# ===========================================================
#       DIM_TIME
# ===========================================================
def build_dim_time(df, conn):
    """Insert one ``dim_time`` row per distinct, parseable ``et_date``.

    Each row carries the date as ``YYYY-MM-DD`` text plus day, month, month
    name, quarter, year and ISO week. Rows are written with
    ``INSERT OR IGNORE`` and the transaction is committed.

    Args:
        df: DataFrame that may contain an ``et_date`` column.
        conn: open sqlite3 connection to the warehouse.
    """
    if "et_date" not in df.columns:
        # pd.to_datetime(None) returns a scalar, so the chained .dropna() used to
        # raise AttributeError; with no dates there is nothing to insert.
        conn.commit()
        return

    tmp = pd.to_datetime(df["et_date"], errors="coerce").dropna().drop_duplicates()

    time_dim = pd.DataFrame({
        "full_date": tmp.dt.strftime("%Y-%m-%d"),
        "day": tmp.dt.day,
        "month": tmp.dt.month,
        "month_name": tmp.dt.month_name(),
        "quarter": tmp.dt.quarter,
        "year": tmp.dt.year,
        "week": tmp.dt.isocalendar().week.astype(int)
    })

    # Cast to object so every cell is a plain Python value that sqlite3 can
    # bind, then insert all rows in one call.
    conn.executemany("""
        INSERT OR IGNORE INTO dim_time
        (full_date, day, month, month_name, quarter, year, week)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """, time_dim.astype(object).values.tolist())
    conn.commit()


# ===========================================================
#              FACT TABLES
# ===========================================================
def load_fact_tables(df, conn):
    """Populate ``fact_ivf_cycle``, ``fact_transfer`` and ``fact_transfer_embryo``.

    One row per ``case_id`` is inserted into the two main fact tables with
    ``insert_or_ignore``. The embryo bridge table links each stored
    ``transfer_sk`` to the case's ``embryo_id``. ``df`` is modified in place
    (default measure columns and ``cycle_start_time_id`` are added).

    Args:
        df: prepared DataFrame; must already contain ``transfer_time_id`` and
            ``doctor_id``.
        conn: open sqlite3 connection to the warehouse.

    Raises:
        KeyError: if a selected column that has no default here (for example
            ``retrieved_oocytes`` or ``m2_count``) is missing from ``df``.
    """
    # Lookup from the "YYYY-MM-DD" text to the dim_time surrogate key.
    time_df = pd.read_sql("SELECT time_id, full_date FROM dim_time;", conn)
    date_to_id = dict(zip(time_df["full_date"], time_df["time_id"]))

    # Fill missing fact numeric columns with 0
    fact_needed = [
        "e2_on_trigger","endometrium_thickness","follicles_18mm",
        "gv_count","injected_m2","fertilized_oocytes",
        "cleavage_d3","blastocyst_d5","good_embryos"
    ]
    for col in fact_needed:
        if col not in df.columns:
            df[col] = 0

    # NOTE(review): the cycle start key is taken from the transfer date
    # (transfer_time_id) — confirm that this is intended.
    df["cycle_start_time_id"] = df["transfer_time_id"].map(date_to_id)

    fact_cycle = df.drop_duplicates(subset=["case_id"])[[
        "case_id","female_id","male_id","protocol_id","doctor_id","outcome_id",
        "cycle_start_time_id","e2_on_trigger","endometrium_thickness",
        "follicles_18mm","retrieved_oocytes","m2_count","gv_count",
        "injected_m2","fertilized_oocytes","fertilization_rate",
        "cleavage_d3","blastocyst_d5","good_embryos"
    ]]
    insert_or_ignore("fact_ivf_cycle", fact_cycle, conn)

    # ---------------- FACT TRANSFER ----------------
    if all(col in df.columns for col in [
        "case_id","transfer_time_id","doctor_id","embryos_transferred"
    ]):
        tmp = df.drop_duplicates(subset=["case_id"]).copy()
        tmp["transfer_time_fk"] = tmp["transfer_time_id"].map(date_to_id)
        fact_transfer = tmp[[
            "case_id","transfer_time_fk","doctor_id",
            "embryos_transferred","pregnancy_test_result",
            "clinical_pregnancy","live_birth",
            "outcome_id","success_probability_score"
        ]]
        insert_or_ignore("fact_transfer", fact_transfer, conn)

    # ---------------- FACT TRANSFER EMBRYO ----------------
    try:
        existing_transfer = pd.read_sql("SELECT transfer_sk, case_id FROM fact_transfer;", conn)
        if not existing_transfer.empty and "embryo_id" in df.columns:
            df_merge = df.merge(existing_transfer, on="case_id", how="inner")
            fact_embryo = df_merge[["transfer_sk","embryo_id"]].drop_duplicates()
            insert_or_ignore("fact_transfer_embryo", fact_embryo, conn)
    except Exception:
        # Best-effort step: keep going, but record why it was skipped. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        logging.warning("fact_transfer_embryo skipped.", exc_info=True)


# ===========================================================
#                    MAIN ETL
# ===========================================================
def run_full_etl(refresh=True):
    """Run the whole raw → star-schema pipeline.

    Steps: optional schema rebuild, raw load, cleaning, required-column check,
    MII→M2 column renaming, ID handling, doctor mapping, dimension load,
    ``dim_time`` build and fact load.

    Args:
        refresh: when True the schema script is executed first and the
            dimension tables are replaced; when False rows are only added.

    Raises:
        ValueError: if a required column is missing from the raw data.
        Exception: any error raised by the individual steps is propagated.
    """
    logging.info("===== ETL STARTED =====")

    if refresh:
        run_schema_sql()

    df = load_raw_df()
    df = clean_data(df)

    # Required columns check
    check_required(df)

    # Normalize MII/M2 naming
    rename_map = {"mii_count": "m2_count", "injected_mii": "injected_m2"}
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    df = handle_ids(df)

    conn = sqlite3.connect(STAR_DB)
    try:
        # Doctor mapping
        df = map_doctor_ids(df, conn)

        # Dimensions
        load_dimensions(df, conn, refresh=refresh)
        build_dim_time(df, conn)
        load_fact_tables(df, conn)
    finally:
        # Release the warehouse file even when a load step fails.
        conn.close()

    logging.info("ETL COMPLETED SUCCESSFULLY.")
    print("ETL Done ‚úî")


# Entry point: run the pipeline in incremental mode. With refresh=False,
# run_full_etl() skips run_schema_sql() and load_dimensions() inserts with
# insert_or_ignore() instead of replacing the tables.
if __name__ == "__main__":
    run_full_etl(refresh=False)


In [None]:
import sqlite3
import pandas as pd

# Quick inspection of the warehouse: table list, row counts and a 3-row sample.
# NOTE(review): absolute, machine-specific path.
DB = r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\warehouse_final\ivf_star_schema.db"
conn = sqlite3.connect(DB)

try:
    # 1) List every table that actually exists in the database.
    print("\n--- ALL TABLES IN DB ---")
    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
    print(tables)

    # 2) Show the row count of each table.
    print("\n--- ROW COUNTS ---")
    for t in tables['name']:
        count = pd.read_sql(f"SELECT COUNT(*) as rows FROM {t};", conn)
        print(f"{t:<25} ‚Üí {count['rows'][0]} rows")

    # 3) Show the first 3 rows of each table (if it has data).
    print("\n--- SAMPLE DATA (LIMIT 3) ---")
    for t in tables['name']:
        try:
            sample = pd.read_sql(f"SELECT * FROM {t} LIMIT 3;", conn)
            print(f"\nTABLE: {t}")
            print(sample)
        except Exception as exc:
            # Keep going with the remaining tables, but show what went wrong.
            # Catching Exception (not a bare except) lets Ctrl-C interrupt the cell.
            print(f"\nTABLE: {t} ‚Üí Error or no rows")
            print(f"  ({type(exc).__name__}: {exc})")
finally:
    # Close the connection even if one of the queries above fails.
    conn.close()



--- ALL TABLES IN DB ---
                    name
0        sqlite_sequence
1             dim_doctor
2               dim_time
3         fact_ivf_cycle
4          fact_transfer
5   fact_transfer_embryo
6             dim_female
7               dim_male
8           dim_protocol
9            dim_outcome
10            dim_embryo

--- ROW COUNTS ---
sqlite_sequence           ‚Üí 4 rows
dim_doctor                ‚Üí 6 rows
dim_time                  ‚Üí 3376 rows
fact_ivf_cycle            ‚Üí 10000 rows
fact_transfer             ‚Üí 10000 rows
fact_transfer_embryo      ‚Üí 10000 rows
dim_female                ‚Üí 10000 rows
dim_male                  ‚Üí 10000 rows
dim_protocol              ‚Üí 10000 rows
dim_outcome               ‚Üí 10000 rows
dim_embryo                ‚Üí 10000 rows

--- SAMPLE DATA (LIMIT 3) ---

TABLE: sqlite_sequence
             name    seq
0      dim_doctor      6
1        dim_time   3376
2  fact_ivf_cycle  10000

TABLE: dim_doctor
   doctor_id doctor_name              

In [4]:
import sqlite3, pandas as pd
# NOTE(review): STAR_DB is not defined in this cell; it must already exist in
# the kernel from one of the ETL configuration cells.
conn = sqlite3.connect(STAR_DB)

# Column definitions of dim_time as reported by SQLite.
df = pd.read_sql("PRAGMA table_info(dim_time);", conn)
print(df)

# First five rows of dim_time.
df = pd.read_sql("SELECT * FROM dim_time LIMIT 5;", conn)
print(df)


conn.close()

   cid        name     type  notnull dflt_value  pk
0    0     time_id  INTEGER        0       None   1
1    1   full_date     TEXT        0       None   0
2    2         day  INTEGER        0       None   0
3    3       month  INTEGER        0       None   0
4    4  month_name     TEXT        0       None   0
5    5     quarter  INTEGER        0       None   0
6    6        year  INTEGER        0       None   0
7    7        week  INTEGER        0       None   0
   time_id   full_date  day  month month_name  quarter  year  week
0        1  2022-03-26   26      3      March        1  2022    12
1        2  2016-03-30   30      3      March        1  2016    13
2        3  2018-09-06    6      9  September        3  2018    36
3        4  2018-03-05    5      3      March        1  2018    10
4        5  2023-03-07    7      3      March        1  2023    10


In [None]:
import sqlite3
import pandas as pd

# NOTE(review): absolute, machine-specific path to the warehouse database.
DB_PATH = r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\warehouse_final\ivf_star_schema.db"

# Connect
conn = sqlite3.connect(DB_PATH)

# Report title -> SQL text. Each query is run and printed in order below.
queries = {
    "Total Female Patients": "SELECT COUNT(*) FROM dim_female;",
    "Total Male Patients": "SELECT COUNT(*) FROM dim_male;",
    "Protocol Distribution": """
        SELECT protocol_type, COUNT(*) 
        FROM dim_protocol
        GROUP BY protocol_type;
    """,
    "Success Outcome Counts": """
        SELECT outcome_id, COUNT(*) 
        FROM fact_transfer
        GROUP BY outcome_id;
    """,
    "Sample Dates (dim_time)": """
        SELECT * FROM dim_time LIMIT 5;
    """
}

# Run every query; a failing query prints its error and the loop continues.
for title, q in queries.items():
    print(f"\n--- {title} ---")
    try:
        df = pd.read_sql(q, conn)
        print(df)
    except Exception as e:
        print("Error:", e)

conn.close()
print("\nConnection closed.")



--- Total Female Patients ---
   COUNT(*)
0     10000

--- Total Male Patients ---
   COUNT(*)
0     10000

--- Protocol Distribution ---
  protocol_type  COUNT(*)
0    Antagonist      5307
1          Long      2519
2          Mild       783
3         Short      1158

--- Success Outcome Counts ---
  outcome_id  COUNT(*)
0       None     10000

--- Sample Dates (dim_time) ---
   time_id   full_date  day  month month_name  quarter  year  week
0        1  2022-03-26   26      3      March        1  2022    12
1        2  2016-03-30   30      3      March        1  2016    13
2        3  2018-09-06    6      9  September        3  2018    36
3        4  2018-03-05    5      3      March        1  2018    10
4        5  2023-03-07    7      3      March        1  2023    10

Connection closed.


In [9]:
# Debug cell: reload and clean the raw data, then inspect the transfer columns.
# NOTE(review): apply_placeholder_and_ids is not defined in any cell visible in
# this notebook; the cell only runs if the name is still in the kernel from a
# deleted or external definition — confirm, or replace with the current helper.
df = load_raw_df()
df = clean_data(df)
df = apply_placeholder_and_ids(df)
print(df.columns)   # confirm that the expected columns are present

print(df[["case_id", "transfer_time_id", "embryos_transferred"]].head())  # look at a few sample rows

print(df["embryos_transferred"].value_counts())  # check whether the values are all zero or empty



Index(['case_id', 'et_date', 'female_age', 'female_bmi', 'amh_level',
       'fsh_level', 'afc', 'male_age', 'male_factor',
       'semen_count_mill_per_ml', 'motility_percent', 'morphology_percent',
       'protocol_type', 'stimulation_days', 'total_fsh_dose', 'trigger_type',
       'e2_on_trigger', 'endometrium_thickness', 'follicles_18mm',
       'retrieved_oocytes', 'mii_count', 'mi_count', 'gv_count',
       'injected_mii', 'fertilized_oocytes', 'fertilization_rate',
       'cleavage_d3', 'blastocyst_d5', 'good_embryos', 'class_a_rate',
       'fresh_et_stage', 'embryos_transferred', 'grading',
       'pregnancy_test_result', 'clinical_pregnancy', 'live_birth',
       'success_probability_score', 'response_type', 'risk_level',
       'recommended_protocol', 'suggested_waiting_period_days',
       'failure_reason', 'doctor_recommendation', 'female_id', 'male_id',
       'transfer_time_id', 'protocol_id', 'doctor_id', 'outcome_id',
       'embryo_id'],
      dtype='object')
      ca

In [None]:
import sqlite3
import pandas as pd

# Create (or open) the small test database used as the ETL's raw input.
# NOTE(review): absolute, machine-specific path.
conn = sqlite3.connect(r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\raw\ivf_patients_test.db")

# Complete rows in the shape the ETL expects; values are positional and follow
# the order of `columns` below.
# NOTE(review): `columns` has no female_id / male_id, which check_required()
# demands — confirm how the ETL is expected to run against this fixture.
data = [
    # Duplicated row (to exercise de-duplication)
    ["CASE_TEST_001", "2023-03-07", 32, 24.8, 1.2, 5.0, 10, 38, "Normal", 55.0, 20.0, 5.0, 4, 4, 2, 0.5, 1, 1.0, "D5", 2, "B", "Negative", 0, 0, 0.76, "Poor", "High", "Antagonist", 90, "Unknown", "Good response"],
    ["CASE_TEST_001", "2023-03-07", 32, 24.8, 1.2, 5.0, 10, 38, "Normal", 55.0, 20.0, 5.0, 4, 4, 2, 0.5, 1, 1.0, "D5", 2, "B", "Negative", 0, 0, 0.76, "Poor", "High", "Antagonist", 90, "Unknown", "Good response"],

    # New row
    ["CASE_TEST_002", "2022-11-15", 30, 22.5, 0.9, 4.3, 12, 41, "OAT", 40.0, 18.0, 6.0, 5, 5, 3, 0.6, 2, 0.8, "D3", 1, "C", "Positive", 1, 0, 0.88, "Normal", "Medium", "Mild", 45, "Implantation Failure", "Monitor closely"],

    # New row with most values missing (null-handling test)
    ["CASE_TEST_003", None, 29, None, None, 3.1, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 0.00, None, None, None, None, None, None]
]

columns = [
    "case_id","et_date","female_age","female_bmi","amh_level","fsh_level","afc",
    "male_age","male_factor","semen_count_mill_per_ml","motility_percent","morphology_percent",
    "retrieved_oocytes","mii_count","num_embryos_generated","fertilization_rate",
    "good_embryos","class_a_rate","fresh_et_stage","embryos_transferred","grading",
    "pregnancy_test_result","clinical_pregnancy","live_birth","success_probability_score",
    "response_type","risk_level","recommended_protocol","suggested_waiting_period_days",
    "failure_reason","doctor_recommendation"
]

# Overwrite the ivf_patients table with the fixture rows.
df = pd.DataFrame(data, columns=columns)
df.to_sql("ivf_patients", conn, if_exists="replace", index=False)

conn.close()
print("‚úî Test DB created successfully: ivf_patients_test.db")


‚úî Test DB created successfully: ivf_patients_test.db


In [None]:
# Load the database and display full table
import sqlite3
import pandas as pd

# NOTE(review): absolute, machine-specific path to the raw test database.
db_path = r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\raw\ivf_patients_test.db"
conn = sqlite3.connect(db_path)

# Read the whole ivf_patients table, then release the connection.
df = pd.read_sql("SELECT * FROM ivf_patients", conn)
conn.close()

# Last expression of the cell: rendered as the cell's output.
df


Unnamed: 0,case_id,et_date,female_age,female_bmi,amh_level,fsh_level,afc,male_age,male_factor,semen_count_mill_per_ml,...,pregnancy_test_result,clinical_pregnancy,live_birth,success_probability_score,response_type,risk_level,recommended_protocol,suggested_waiting_period_days,failure_reason,doctor_recommendation
0,CASE_TEST_001,2023-03-07,32,24.8,1.2,5.0,10.0,38.0,Normal,55.0,...,Negative,0.0,0.0,0.76,Poor,High,Antagonist,90.0,Unknown,Good response
1,CASE_TEST_001,2023-03-07,32,24.8,1.2,5.0,10.0,38.0,Normal,55.0,...,Negative,0.0,0.0,0.76,Poor,High,Antagonist,90.0,Unknown,Good response
2,CASE_TEST_002,2022-11-15,30,22.5,0.9,4.3,12.0,41.0,OAT,40.0,...,Positive,1.0,0.0,0.88,Normal,Medium,Mild,45.0,Implantation Failure,Monitor closely
3,CASE_TEST_003,,29,,,3.1,,,,,...,,,,0.0,,,,,,


In [None]:
import sqlite3
import pandas as pd

# ================================
# 1) PATHS
# ================================
# NOTE(review): absolute, machine-specific paths.
db_path = r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\raw\ivf_database_updated.db"
excel_output = r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\raw\raw_database_dump.xlsx"

# ================================
# 2) CONNECT TO DB
# ================================
conn = sqlite3.connect(db_path)

try:
    # ================================
    # 3) READ TABLE NAMES
    # ================================
    # User tables only; SQLite's internal sqlite_* tables are excluded.
    tables = pd.read_sql(
        "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'",
        conn
    )

    table_list = tables["name"].tolist()

    # ================================
    # 4) EXPORT TO EXCEL
    # ================================
    # One worksheet per table.
    with pd.ExcelWriter(excel_output, engine="openpyxl") as writer:
        for tbl in table_list:
            # Quote the identifier so names containing spaces or keywords work.
            df = pd.read_sql('SELECT * FROM "{}"'.format(tbl.replace('"', '""')), conn)
            df.to_excel(writer, sheet_name=tbl[:31], index=False)  # Excel sheet name limit
finally:
    # ================================
    # 5) CLOSE CONNECTION
    # ================================
    # Runs even when a query or the Excel export fails.
    conn.close()

print("Done! Excel saved at:", excel_output)


Done! Excel saved at: E:\work\DEPI\graduation promax\data\raw\raw_database_dump.xlsx


In [None]:
import os
import logging
import sqlite3
import uuid
from datetime import datetime
import pandas as pd
import numpy as np

# ===========================================================
#                 CONFIGURATION
# ===========================================================
# NOTE(review): absolute, machine-specific Windows path — every path below is
# derived from it.
BASE_PATH = r"E:\work\github graduation project local\Graduation-Project\graduation project final"
# SQLite file that holds the raw `ivf_patients` table read by load_raw_df().
RAW_DB = fr"{BASE_PATH}\data\raw\ivf_database_updated.db"
# SQLite file that receives the star schema (dimension and fact tables).
STAR_DB = fr"{BASE_PATH}\data\warehouse_final\ivf_star_schema.db"
# SQL script executed by run_schema_sql() on a full refresh.
SCHEMA_SQL = fr"{BASE_PATH}\src\ETL\create_star_schema.sql"
# Text file that receives the ETL log records.
LOG_FILE = fr"{BASE_PATH}\src\ETL\logs\etl_log_ivf.txt"

# Create the log and warehouse folders up front so logging and sqlite3 can
# create their files on first use.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
os.makedirs(os.path.dirname(STAR_DB), exist_ok=True)

# Root-logger configuration: INFO and above go to LOG_FILE as
# "timestamp | level | message".
# NOTE(review): basicConfig does nothing if the root logger already has
# handlers, e.g. when an earlier cell configured logging in the same kernel.
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)
# NOTE(review): the functions visible in this cell log through the `logging`
# module directly; `logger` does not appear to be used by them.
logger = logging.getLogger("ivf_etl")

# ===========================================================
#                SCHEMA SQL (FULL REFRESH)
# ===========================================================
def run_schema_sql():
    """Execute the star-schema SQL script against the warehouse database.

    Reads the script at ``SCHEMA_SQL`` and runs it with ``executescript`` on a
    connection to ``STAR_DB``. Failures are logged with a traceback and
    re-raised.

    Raises:
        Exception: any error raised while reading the file or running the
            script.
    """
    conn = None
    try:
        with open(SCHEMA_SQL, "r", encoding="utf-8") as f:
            script = f.read()
        conn = sqlite3.connect(STAR_DB)
        conn.executescript(script)
        conn.commit()
        logging.info("Schema (fresh) created successfully.")
    except Exception:
        logging.exception("run_schema_sql FAILED")
        raise
    finally:
        # Release the connection on success and on failure alike.
        if conn is not None:
            conn.close()

# ===========================================================
#                   RAW LOADING
# ===========================================================
def load_raw_df():
    """Load the whole ``ivf_patients`` table from ``RAW_DB``.

    Failures are logged with a traceback and re-raised.

    Returns:
        pandas.DataFrame: one row per record of the raw table.

    Raises:
        Exception: any error raised while connecting or querying.
    """
    conn = None
    try:
        conn = sqlite3.connect(RAW_DB)
        df = pd.read_sql("SELECT * FROM ivf_patients", conn)
        logging.info(f"Loaded {len(df)} raw rows.")
        return df
    except Exception:
        logging.exception("load_raw_df FAILED")
        raise
    finally:
        # Release the connection on success and on failure alike.
        if conn is not None:
            conn.close()

# ===========================================================
#                   CLEAN DATA
# ===========================================================
def clean_data(df):
    """Normalise column names and drop fully duplicated rows.

    Column labels are stripped, lower-cased and have spaces replaced by
    underscores. The rename is applied to the frame that was passed in; the
    de-duplicated result is returned as a new frame. Failures are logged with
    a traceback and re-raised.

    Args:
        df: raw DataFrame.

    Returns:
        pandas.DataFrame: ``df`` without exact duplicate rows.
    """
    try:
        labels = df.columns.str.strip()
        labels = labels.str.lower()
        labels = labels.str.replace(" ", "_")
        df.columns = labels  # in-place rename on the caller's frame
        deduplicated = df.drop_duplicates()
    except Exception:
        logging.exception("clean_data FAILED")
        raise
    return deduplicated

# ===========================================================
#    CHECK REQUIRED COLUMNS (ONLY 3)
# ===========================================================
def check_required(df):
    """Verify that the three mandatory key columns are present.

    Args:
        df: DataFrame whose columns are inspected.

    Raises:
        ValueError: if any of ``case_id``, ``female_id`` or ``male_id`` is not
            a column of ``df``; the problem is logged as an error first.
    """
    missing = []
    for column in ("case_id", "female_id", "male_id"):
        if column not in df.columns:
            missing.append(column)
    if not missing:
        return
    logging.error(f"Missing required columns: {missing}")
    raise ValueError(f"Missing required columns: {missing}")

# ===========================================================
#   GENERATE ID IF MISSING (preserve existing IDs)
# ===========================================================
def gen_if_missing_series(series, prefix):
    """Replace missing or blank entries with a generated ``<prefix>_<hex>`` ID.

    An entry counts as missing when ``pd.isna`` is true for it or when its
    string form is empty after stripping whitespace. Other entries are kept
    unchanged. Generated IDs use the first 8 hex digits of a random UUID4.

    Args:
        series: pandas Series of existing IDs.
        prefix: text placed before the underscore in generated IDs.

    Returns:
        pandas.Series: same index, with the gaps filled.
    """
    def _is_blank(value):
        return pd.isna(value) or str(value).strip() == ""

    return series.apply(
        lambda value: f"{prefix}_{uuid.uuid4().hex[:8]}" if _is_blank(value) else value
    )

# ===========================================================
#   SET NON-CRITICAL IDs TO NULL + GEN FOR protocol/outcome/embryo
#   IMPORTANT: generate only when refresh=True (full refresh).
# ===========================================================
def handle_ids(df, refresh):
    """Ensure optional ID/attribute columns exist and derive ``transfer_time_id``.

    Missing ``protocol_id``, ``outcome_id`` and ``embryo_id`` columns are added
    filled with ``None``. On a full refresh, missing or blank values in those
    three columns are replaced by generated IDs; in incremental mode existing
    values are kept and gaps stay empty. ``fresh_et_stage`` and ``grading`` are
    added when absent. ``transfer_time_id`` is ``et_date`` formatted as
    ``YYYY-MM-DD``; unparseable or missing dates become missing values.

    The frame is modified in place and also returned. Failures are logged with
    a traceback and re-raised.

    Args:
        df: cleaned DataFrame.
        refresh: True for a full refresh (generate IDs), False for incremental.

    Returns:
        pandas.DataFrame: the same frame with the extra columns.
    """
    try:
        # ensure columns exist
        for c in ["protocol_id", "outcome_id", "embryo_id"]:
            if c not in df.columns:
                df[c] = None

        # generate IDs only if refresh == True (full refresh); incremental mode
        # keeps any IDs present in the raw data and leaves gaps as they are
        if refresh:
            df["protocol_id"] = gen_if_missing_series(df["protocol_id"], "prot")
            df["outcome_id"] = gen_if_missing_series(df["outcome_id"], "out")
            df["embryo_id"] = gen_if_missing_series(df["embryo_id"], "emb")

        # optional columns
        for c in ("fresh_et_stage", "grading"):
            if c not in df.columns:
                df[c] = None

        # transfer_time_id from et_date (string YYYY-MM-DD) or None
        if "et_date" in df.columns:
            parsed = pd.to_datetime(df["et_date"], errors="coerce")
            df["transfer_time_id"] = parsed.dt.strftime("%Y-%m-%d")
        else:
            # Without the column pd.to_datetime() receives None and returns a
            # scalar, on which `.dt` raises AttributeError.
            df["transfer_time_id"] = None

        return df
    except Exception:
        logging.exception("handle_ids FAILED")
        raise

# ===========================================================
#     MAP DOCTOR NAME → AUTO INCREMENT doctor_id + recommendation
# ===========================================================
def map_doctor_ids(df, conn):
    """Resolve each (doctor_name, doctor_recommendation) pair to a ``doctor_id``.

    ``dim_doctor`` is created if it does not exist. Pairs that are not stored
    yet are inserted, and every row of ``df`` receives the matching
    ``doctor_id``. A helper column ``nat_key`` ("name|recommendation") is added
    to ``df`` and left in place. Missing values (None / NaN) are treated as SQL
    NULL and are matched against stored NULLs, so repeated runs reuse the same
    dimension row instead of inserting a new one each time.

    The frame is modified in place and also returned. Failures are logged with
    a traceback and re-raised.

    Args:
        df: DataFrame to annotate.
        conn: open sqlite3 connection to the warehouse.

    Returns:
        pandas.DataFrame: ``df`` with ``nat_key`` and ``doctor_id`` columns.
    """

    def _key_part(value):
        # Every kind of missing value collapses to the token "None" so that a
        # NaN coming from the frame and a NULL read back from SQLite build the
        # same natural key.
        return "None" if pd.isna(value) else str(value).strip()

    def _nat_keys(frame):
        # One "name|recommendation" key per row of `frame`.
        return [
            f"{_key_part(name)}|{_key_part(rec)}"
            for name, rec in zip(frame["doctor_name"], frame["doctor_recommendation"])
        ]

    def _sql_value(value):
        # Bind missing values explicitly as NULL.
        return None if pd.isna(value) else value

    try:
        # Ensure both columns exist
        if "doctor_name" not in df.columns:
            df["doctor_name"] = "Unknown"

        if "doctor_recommendation" not in df.columns:
            df["doctor_recommendation"] = None

        # Ensure table exists
        conn.execute("""
            CREATE TABLE IF NOT EXISTS dim_doctor (
                doctor_id INTEGER PRIMARY KEY AUTOINCREMENT,
                doctor_name TEXT,
                doctor_recommendation TEXT,
                UNIQUE (doctor_name, doctor_recommendation)
            )
        """)
        conn.commit()

        # Read existing doctor combinations
        existing = pd.read_sql(
            "SELECT doctor_id, doctor_name, doctor_recommendation FROM dim_doctor",
            conn
        )

        # Build natural key: name + recommendation
        existing["nat_key"] = _nat_keys(existing)
        df["nat_key"] = _nat_keys(df)

        # Map existing nat_keys -> doctor_id
        nat_to_id = dict(zip(existing["nat_key"], existing["doctor_id"]))

        # Identify new doctor rows (one per natural key)
        new_rows = df.loc[
            ~df["nat_key"].isin(list(nat_to_id)),
            ["doctor_name", "doctor_recommendation", "nat_key"]
        ].drop_duplicates(subset="nat_key")

        # Insert new rows
        for name, rec, key in new_rows.itertuples(index=False, name=None):
            params = (_sql_value(name), _sql_value(rec))
            conn.execute(
                "INSERT OR IGNORE INTO dim_doctor (doctor_name, doctor_recommendation) VALUES (?, ?)",
                params
            )

            # Retrieve id after insert. `IS` also matches NULL, whereas
            # `= NULL` is never true and left such rows without a doctor_id.
            result = conn.execute(
                "SELECT doctor_id FROM dim_doctor WHERE doctor_name IS ? AND doctor_recommendation IS ?",
                params
            ).fetchone()
            if result is None:
                logging.warning(f"dim_doctor: no row found for key {key!r}; doctor_id left empty.")
                continue

            nat_to_id[key] = result[0]

        # One commit for the whole batch instead of one per row.
        conn.commit()

        # Assign final doctor_ids
        df["doctor_id"] = df["nat_key"].map(nat_to_id)

        logging.info(f"Mapped {len(nat_to_id)} unique doctor entries.")
        return df

    except Exception:
        logging.exception("map_doctor_ids FAILED")
        raise


# ===========================================================
#   SAFE INSERT → NO DUPLICATION (helper)
# ===========================================================
def insert_or_ignore(table, df_subset, conn):
    """Bulk ``INSERT OR IGNORE`` every row of ``df_subset`` into ``table``.

    Does nothing when ``df_subset`` is ``None`` or has no rows. Otherwise the
    frame's column names are used as the target column list and the
    transaction is committed. Failures are logged with a traceback and
    re-raised.

    Args:
        table: name of the destination table.
        df_subset: DataFrame whose columns match columns of ``table``, or None.
        conn: open sqlite3 connection.
    """
    nothing_to_insert = df_subset is None or len(df_subset) == 0
    if nothing_to_insert:
        return

    column_names = list(df_subset.columns)
    column_sql = ",".join(column_names)
    marks = ",".join(["?"] * len(column_names))
    statement = f"INSERT OR IGNORE INTO {table} ({column_sql}) VALUES ({marks})"
    try:
        rows = df_subset.values.tolist()
        conn.executemany(statement, rows)
        conn.commit()
    except Exception:
        logging.exception(f"insert_or_ignore FAILED for {table}")
        raise

# ===========================================================
#       DIMENSIONS LOADING (kept behavior; fallback on error)
# ===========================================================
def load_dimensions(df, conn, refresh=True):
    """Write the five patient/protocol/outcome/embryo dimension tables.

    Any dimension column missing from ``df`` is first added to ``df`` filled
    with ``None``. Each dimension is then the de-duplicated selection of its
    columns.

    Args:
        df: prepared DataFrame (missing dimension columns are added in place).
        conn: open sqlite3 connection to the warehouse.
        refresh: when True each table is replaced via
            ``to_sql(if_exists="replace")``; when False rows are added with
            ``insert_or_ignore``.

    Raises:
        Exception: any error other than ``sqlite3.IntegrityError`` is logged
            and re-raised. An ``IntegrityError`` triggers a row-by-row
            ``INSERT OR IGNORE`` in which failing rows are logged and skipped.
    """
    dim_tables = {
        "dim_female":  ["female_id","female_age","female_bmi","amh_level","fsh_level","afc"],
        "dim_male":    ["male_id","male_age","male_factor","semen_count_mill_per_ml",
                        "motility_percent","morphology_percent"],
        "dim_protocol":["protocol_id","protocol_type","stimulation_days",
                        "total_fsh_dose","trigger_type","recommended_protocol"],
        "dim_outcome": ["outcome_id","risk_level","response_type",
                        "suggested_waiting_period_days","failure_reason"],
        "dim_embryo":  ["embryo_id","fresh_et_stage","grading","class_a_rate"]
    }

    for table, cols in dim_tables.items():

        # Guarantee that every required column exists, even when the raw data
        # does not provide it.
        absent_cols = [c for c in cols if c not in df.columns]
        for c in absent_cols:
            df[c] = None

        subset = df[cols].drop_duplicates()

        try:
            if not refresh:
                # incremental: safe insert-or-ignore
                insert_or_ignore(table, subset, conn)
            else:
                # full refresh: replace the table with the current data
                subset.to_sql(table, conn, if_exists="replace", index=False)

            logging.info(f"{table}: {len(subset)} processed.")

        except sqlite3.IntegrityError:
            # Bulk write hit a constraint: retry one row at a time so a single
            # bad row does not block the rest.
            logging.warning(f"{table}: IntegrityError on bulk insert ‚Äî falling back to row-by-row INSERT OR IGNORE")
            column_sql = ",".join(cols)
            marks = ",".join(["?"] * len(cols))
            sql = f"INSERT OR IGNORE INTO {table} ({column_sql}) VALUES ({marks})"

            cursor = conn.cursor()
            for row in subset.values.tolist():
                try:
                    cursor.execute(sql, row)
                except Exception:
                    logging.exception(f"{table}: failed to insert row {row}")
            conn.commit()

        except Exception:
            logging.exception(f"load_dimensions failed for {table}")
            raise


# ===========================================================
#       DIM_TIME
# ===========================================================
def build_dim_time(df, conn):
    """Insert one ``dim_time`` row per distinct, parseable ``et_date``.

    Each row carries the date as ``YYYY-MM-DD`` text plus day, month, month
    name, quarter, year and ISO week. Rows are written with
    ``INSERT OR IGNORE`` and the transaction is committed. Failures are logged
    with a traceback and re-raised.

    Args:
        df: DataFrame that may contain an ``et_date`` column.
        conn: open sqlite3 connection to the warehouse.
    """
    try:
        if "et_date" not in df.columns:
            # pd.to_datetime(None) returns a scalar, so the chained .dropna()
            # used to raise AttributeError; with no dates nothing is inserted.
            conn.commit()
            logging.info("build_dim_time done.")
            return

        tmp = pd.to_datetime(df["et_date"], errors="coerce").dropna().drop_duplicates()

        time_dim = pd.DataFrame({
            "full_date": tmp.dt.strftime("%Y-%m-%d"),
            "day": tmp.dt.day,
            "month": tmp.dt.month,
            "month_name": tmp.dt.month_name(),
            "quarter": tmp.dt.quarter,
            "year": tmp.dt.year,
            "week": tmp.dt.isocalendar().week.astype(int)
        })

        # Cast to object so every cell is a plain Python value that sqlite3
        # can bind, then insert all rows in one call.
        conn.executemany("""
            INSERT OR IGNORE INTO dim_time
            (full_date, day, month, month_name, quarter, year, week)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, time_dim.astype(object).values.tolist())
        conn.commit()
        logging.info("build_dim_time done.")
    except Exception:
        logging.exception("build_dim_time FAILED")
        raise

# ===========================================================
#              FACT TABLES
# ===========================================================
def load_fact_tables(df, conn):
    """Load ``fact_ivf_cycle``, ``fact_transfer`` and ``fact_transfer_embryo``.

    The date-to-``time_id`` lookup is read back from ``dim_time`` and applied to
    ``transfer_time_id``. Rows are de-duplicated on ``case_id`` before the cycle
    and transfer facts are handed to ``insert_or_ignore``. The embryo bridge
    table is best-effort: a failure there is logged and does not abort the load.

    The caller's DataFrame is not modified; all derived columns are added to a
    private copy.

    Args:
        df: Prepared DataFrame. Must contain ``case_id``, ``transfer_time_id``
            and the non-defaulted fact columns selected below.
        conn: Open ``sqlite3`` connection to the star-schema database.

    Raises:
        Exception: Any failure outside the embryo bridge step is logged and
            re-raised.
    """
    try:
        # Work on a copy so the columns added below do not leak into the
        # caller's frame.
        df = df.copy()

        time_df = pd.read_sql("SELECT time_id, full_date FROM dim_time;", conn)
        date_to_id = dict(zip(time_df["full_date"], time_df["time_id"]))

        # Fill missing fact numeric columns with 0
        fact_needed = [
            "e2_on_trigger","endometrium_thickness","follicles_18mm",
            "gv_count","injected_m2","fertilized_oocytes",
            "cleavage_d3","blastocyst_d5","good_embryos"
        ]
        for col in fact_needed:
            if col not in df.columns:
                df[col] = 0

        # NOTE(review): the cycle start key is derived from transfer_time_id,
        # exactly as before - confirm this is the intended date for the cycle.
        df["cycle_start_time_id"] = df["transfer_time_id"].map(date_to_id)

        # One row per case; shared by the cycle and transfer facts.
        cases = df.drop_duplicates(subset=["case_id"])

        fact_cycle = cases[[
            "case_id","female_id","male_id","protocol_id","doctor_id","outcome_id",
            "cycle_start_time_id","e2_on_trigger","endometrium_thickness",
            "follicles_18mm","retrieved_oocytes","m2_count","gv_count",
            "injected_m2","fertilized_oocytes","fertilization_rate",
            "cleavage_d3","blastocyst_d5","good_embryos"
        ]]
        insert_or_ignore("fact_ivf_cycle", fact_cycle, conn)

        # ---------------- FACT TRANSFER ----------------
        transfer_required = ["case_id","transfer_time_id","doctor_id","embryos_transferred"]
        if all(col in df.columns for col in transfer_required):
            # Same lookup as cycle_start_time_id, stored under the transfer
            # fact's own column name.
            fact_transfer = cases.assign(
                transfer_time_fk=cases["cycle_start_time_id"]
            )[[
                "case_id","transfer_time_fk","doctor_id",
                "embryos_transferred","pregnancy_test_result",
                "clinical_pregnancy","live_birth",
                "outcome_id","success_probability_score"
            ]]
            insert_or_ignore("fact_transfer", fact_transfer, conn)

        # ---------------- FACT TRANSFER EMBRYO ----------------
        try:
            # Read back the surrogate keys assigned to the transfers so each
            # embryo can be linked to its transfer row.
            existing_transfer = pd.read_sql("SELECT transfer_sk, case_id FROM fact_transfer;", conn)
            if not existing_transfer.empty and "embryo_id" in df.columns:
                df_merge = df.merge(existing_transfer, on="case_id", how="inner")
                fact_embryo = df_merge[["transfer_sk","embryo_id"]].drop_duplicates()
                insert_or_ignore("fact_transfer_embryo", fact_embryo, conn)
        except Exception:
            # Deliberately best-effort: the bridge table is optional.
            logging.exception("fact_transfer_embryo skipped.")
    except Exception:
        logging.exception("load_fact_tables FAILED")
        raise

# ===========================================================
#                    MAIN ETL
# ===========================================================
def run_full_etl(refresh=True):
    """Run the whole ETL pipeline from the raw database into the star schema.

    Steps, in order: optionally recreate the schema, load and clean the raw
    rows, generate ``female_id`` / ``male_id`` when absent, validate required
    columns, normalise MII/M2 column names, resolve IDs, map doctors, then load
    dimensions, the time dimension and the fact tables.

    Args:
        refresh: When true, ``run_schema_sql()`` is executed first. The flag is
            also forwarded to ``handle_ids`` and ``load_dimensions``.

    Returns:
        ``True`` when every stage completed, ``False`` when any stage raised.
        Failures are logged with a traceback and reported on stdout; they are
        never propagated to the caller.
    """
    # Local import: the notebook's import cell does not bring in uuid, so the
    # ID-generation branch below would otherwise fail with NameError.
    import uuid

    logging.info("===== ETL STARTED =====")
    conn = None
    try:
        if refresh:
            run_schema_sql()

        df = load_raw_df()
        df = clean_data(df)

        # === AUTO-GENERATE female_id / male_id IF MISSING ===
        # NOTE(review): IDs are random, so they differ on every run, and 8 hex
        # characters give a small but non-zero collision chance on large loads.
        if "female_id" not in df.columns:
            df["female_id"] = ["F_" + uuid.uuid4().hex[:8] for _ in range(len(df))]

        if "male_id" not in df.columns:
            df["male_id"] = ["M_" + uuid.uuid4().hex[:8] for _ in range(len(df))]

        # Required columns check
        check_required(df)

        # Normalize MII/M2 naming (safeguard)
        rename_map = {"mii_count": "m2_count", "injected_mii": "injected_m2"}
        df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

        # IDs and optional fields - refresh is forwarded to handle_ids
        df = handle_ids(df, refresh)

        conn = sqlite3.connect(STAR_DB)

        # Map doctor (auto inc) and recommendation handling
        df = map_doctor_ids(df, conn)

        # Dimensions
        load_dimensions(df, conn, refresh=refresh)

        # Time dimension
        build_dim_time(df, conn)

        # Fact tables
        load_fact_tables(df, conn)

        logging.info("ETL COMPLETED SUCCESSFULLY.")
        print("ETL Done ‚úî")
        return True
    except Exception as e:
        logging.exception("run_full_etl FAILED")
        print(f"ETL FAILED: {e}")
        return False
    finally:
        # Close the warehouse connection on every path, including failures,
        # so the database file is not left locked by a dangling handle.
        if conn is not None:
            conn.close()

# ===========================================================
#                      RUN
# ===========================================================
# Entry point: run the ETL with refresh=True, which makes run_full_etl call
# run_schema_sql() before loading any data.
if __name__ == "__main__":
    run_full_etl(refresh=True)


ETL Done ‚úî


In [None]:
import sqlite3
import pandas as pd

DB = r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\warehouse_final\ivf_star_schema.db"
conn = sqlite3.connect(DB)

try:
    # 1) List every table that actually exists in the database.
    print("\n--- ALL TABLES IN DB ---")
    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
    print(tables)

    # 2) Show the row count of each table.
    print("\n--- ROW COUNTS ---")
    for t in tables['name']:
        # Quote the identifier so unusual table names cannot break the query.
        quoted = '"' + t.replace('"', '""') + '"'
        count = pd.read_sql(f"SELECT COUNT(*) as rows FROM {quoted};", conn)
        print(f"{t:<25} ‚Üí {count['rows'][0]} rows")

    # 3) Show the first 3 rows of each table (if it has any data).
    print("\n--- SAMPLE DATA (LIMIT 3) ---")
    for t in tables['name']:
        try:
            quoted = '"' + t.replace('"', '""') + '"'
            sample = pd.read_sql(f"SELECT * FROM {quoted} LIMIT 3;", conn)
            print(f"\nTABLE: {t}")
            print(sample)
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C still
            # interrupts the cell, and report why the table could not be read.
            print(f"\nTABLE: {t} ‚Üí Error or no rows")
            print(f"  ({type(exc).__name__}: {exc})")
finally:
    # Always release the database handle, even when a query above fails.
    conn.close()



--- ALL TABLES IN DB ---
                    name
0        sqlite_sequence
1             dim_doctor
2               dim_time
3         fact_ivf_cycle
4          fact_transfer
5   fact_transfer_embryo
6             dim_female
7               dim_male
8           dim_protocol
9            dim_outcome
10            dim_embryo

--- ROW COUNTS ---
sqlite_sequence           ‚Üí 4 rows
dim_doctor                ‚Üí 6 rows
dim_time                  ‚Üí 3376 rows
fact_ivf_cycle            ‚Üí 10000 rows
fact_transfer             ‚Üí 10000 rows
fact_transfer_embryo      ‚Üí 10000 rows
dim_female                ‚Üí 10000 rows
dim_male                  ‚Üí 10000 rows
dim_protocol              ‚Üí 10000 rows
dim_outcome               ‚Üí 10000 rows
dim_embryo                ‚Üí 10000 rows

--- SAMPLE DATA (LIMIT 3) ---

TABLE: sqlite_sequence
             name    seq
0      dim_doctor      6
1        dim_time   3376
2  fact_ivf_cycle  10000

TABLE: dim_doctor
   doctor_id doctor_name              

In [None]:
import sqlite3
import pandas as pd

# ================================
# 1) PATHS
# ================================
db_path = r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\warehouse_final\ivf_star_schema.db"
excel_output = r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\warehouse_final\database_dump.xlsx"

# ================================
# 2) CONNECT TO DB
# ================================
conn = sqlite3.connect(db_path)

try:
    # ================================
    # 3) READ TABLE NAMES
    # ================================
    tables = pd.read_sql(
        "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'",
        conn
    )

    table_list = tables["name"].tolist()

    # ================================
    # 4) EXPORT TO EXCEL
    # ================================
    if table_list:
        with pd.ExcelWriter(excel_output, engine="openpyxl") as writer:
            for tbl in table_list:
                # Quote the identifier so unusual table names cannot break the query.
                quoted = '"' + tbl.replace('"', '""') + '"'
                df = pd.read_sql(f"SELECT * FROM {quoted}", conn)
                # NOTE(review): names sharing the same first 31 characters
                # would map to the same sheet name.
                df.to_excel(writer, sheet_name=tbl[:31], index=False)  # Excel sheet name limit

        print("Done! Excel saved at:", excel_output)
    else:
        # A workbook with no sheets cannot be saved, so skip the export
        # instead of failing inside the writer.
        print("No user tables found; nothing exported.")
finally:
    # ================================
    # 5) CLOSE CONNECTION
    # ================================
    # Runs on success and on failure so the database handle is never leaked.
    conn.close()


Done! Excel saved at: E:\work\DEPI\graduation promax\data\warehouse_final\database_dump.xlsx


In [None]:
import sqlite3
import pandas as pd

# NOTE(review): this rebinds RAW_DB, which the ETL configuration cell also
# defines (pointing at ivf_patients_test.db) and which load_raw_df() reads at
# call time. Re-running the ETL after this cell will therefore read from this
# database instead - confirm that is intended.
RAW_DB = r"E:\work\github graduation project local\Graduation-Project\graduation project final\data\raw\ivf_database_updated.db"

conn = sqlite3.connect(RAW_DB)

# Distinct, non-null doctor recommendations present in the raw table.
query = """
SELECT DISTINCT
    doctor_recommendation
FROM ivf_patients
WHERE doctor_recommendation IS NOT NULL;
"""

try:
    df = pd.read_sql(query, conn)
finally:
    # Close the handle even if the query fails (e.g. missing table or column).
    conn.close()

print(df)


                               doctor_recommendation
0  Increase gonadotropin dose in next cycle for b...
1  Proceed with embryo freezing for future transfer.
2  Monitor progesterone closely during luteal phase.
3  Consider switching to mild stimulation protoco...
4            Good prognosis, continue same protocol.
5  Optimize sperm selection for ICSI in next atte...
