In [2]:
import os
import logging
import sqlite3
from datetime import datetime
import pandas as pd
import numpy as np

# ===========================================================
#                 CONFIGURATION
# ===========================================================
BASE_PATH = r"E:\work\DEPI\graduation promax"
RAW_DB = fr"{BASE_PATH}\data\raw\ivf_patients_test.db"
STAR_DB = fr"{BASE_PATH}\data\warehouse_final\ivf_star_schema.db"  # Fixed DB name
SCHEMA_SQL = fr"{BASE_PATH}\src\ETL\create_star_schema.sql"
LOG_FILE = fr"{BASE_PATH}\src\ETL\logs\etl_log_ivf.txt"

os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
os.makedirs(os.path.dirname(STAR_DB), exist_ok=True)

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)

# ===========================================================
#                SCHEMA SQL (FULL REFRESH)
# ===========================================================
def run_schema_sql():
    conn = sqlite3.connect(STAR_DB)
    with open(SCHEMA_SQL, "r", encoding="utf-8") as f:
        conn.executescript(f.read())
    conn.commit()
    conn.close()
    logging.info("Schema (fresh) created successfully.")


# ===========================================================
#                   RAW LOADING
# ===========================================================
def load_raw_df():
    try:
        conn = sqlite3.connect(RAW_DB)
        df = pd.read_sql("SELECT * FROM ivf_patients", conn)
        conn.close()
        logging.info(f"Loaded {len(df)} raw rows.")
        return df
    except Exception as e:
        logging.exception(f"Raw load FAILED: {e}")
        raise


# ===========================================================
#                   CLEAN DATA
# ===========================================================
def clean_data(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
    df = df.drop_duplicates()
    return df


# ===========================================================
#    SAFE PLACEHOLDER & ID GENERATION FOR MISSING DATA
# ===========================================================
def apply_placeholder_and_ids(df):
    df["female_id"] = [f"f_{i}" for i in df.index]
    df["male_id"]   = [f"m_{i}" for i in df.index]

    required_cols_protocol = ["protocol_type", "stimulation_days", "total_fsh_dose", "trigger_type"]
    for col in required_cols_protocol:
        if col not in df.columns:
            df[col] = "Unknown"

    df["protocol_id"] = (
        "prot_" + df["protocol_type"].astype(str).str.lower().str.replace(" ","_") +
        "_d" + df["stimulation_days"].astype(str) +
        "_dose" + df["total_fsh_dose"].astype(str) +
        "_trg_" + df["trigger_type"].astype(str).str.lower().str.replace(" ","_")
    )

    df["doctor_id"] = "dr_unknown"

    if "risk_level" not in df.columns: df["risk_level"] = "Unknown"
    if "response_type" not in df.columns: df["response_type"] = "Unknown"

    df["outcome_id"] = (
        "out_" + df["risk_level"].astype(str).str.lower().str.replace(" ","_") +
        "_" + df["response_type"].astype(str).str.lower().str.replace(" ","_")
    )

    if "fresh_et_stage" not in df.columns: df["fresh_et_stage"] = "NA"
    if "grading" not in df.columns: df["grading"] = "NA"

    df["embryo_id"] = [
        f"emb_{row.fresh_et_stage}_{row.grading}_{i}"
        for i, row in df.iterrows()
    ]

    df["transfer_time_id"] = pd.to_datetime(df.get("et_date", None), errors="coerce").dt.strftime("%Y-%m-%d")

    return df


# ===========================================================
#   SAFE INSERT → NO DUPLICATION (APPEND MODE)
# ===========================================================
def insert_or_ignore(table, df_subset, conn):
    cols = df_subset.columns.tolist()
    placeholders = ",".join("?" * len(cols))
    sql = f"INSERT OR IGNORE INTO {table} ({','.join(cols)}) VALUES ({placeholders})"
    conn.executemany(sql, df_subset.values.tolist())
    conn.commit()


# ===========================================================
#       DIMENSIONS LOADING (BOTH MODES SUPPORTED)
# ===========================================================
def load_dimensions(df, conn, refresh=True):
    dim_tables = {
        "dim_female":  ["female_id","female_age","female_bmi","amh_level","fsh_level","afc"],
        "dim_male":    ["male_id","male_age","male_factor","semen_count_mill_per_ml","motility_percent","morphology_percent"],
        "dim_protocol":["protocol_id","protocol_type","stimulation_days","total_fsh_dose","trigger_type","recommended_protocol"],
        "dim_doctor":  ["doctor_id","doctor_recommendation"],
        "dim_outcome": ["outcome_id","risk_level","response_type","suggested_waiting_period_days","failure_reason"],
        "dim_embryo":  ["embryo_id","fresh_et_stage","grading","class_a_rate"]
    }

    for table, cols in dim_tables.items():
        subset = df[cols].drop_duplicates()

        if refresh:
            subset.to_sql(table, conn, if_exists="replace", index=False)
        else:
            insert_or_ignore(table, subset, conn)

        logging.info(f"{table}: {len(subset)} processed.")


# ===========================================================
#       DIM_TIME → ALWAYS APPEND (NO DUPLICATES)
# ===========================================================
def build_dim_time(df, conn):
    tmp = pd.to_datetime(df.get("et_date", None), errors="coerce").dropna().drop_duplicates()

    time_dim = pd.DataFrame({
        "full_date": tmp.dt.strftime("%Y-%m-%d"),
        "day": tmp.dt.day,
        "month": tmp.dt.month,
        "month_name": tmp.dt.month_name(),
        "quarter": tmp.dt.quarter,
        "year": tmp.dt.year,
        "week": tmp.dt.isocalendar().week.astype(int)
    })

    for _, row in time_dim.iterrows():
        conn.execute("""
            INSERT OR IGNORE INTO dim_time
            (full_date, day, month, month_name, quarter, year, week)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, tuple(row))
    conn.commit()
    logging.info("dim_time DONE.")


# ===========================================================
#              FACT TABLES (APPEND SAFE)
# ===========================================================
def load_fact_tables(df, conn):
    time_df = pd.read_sql("SELECT time_id, full_date FROM dim_time;", conn)
    date_to_id = dict(zip(time_df["full_date"], time_df["time_id"]))

    fact_needed_cols = [
        "e2_on_trigger","endometrium_thickness","follicles_18mm",
        "gv_count","injected_m2","fertilized_oocytes",
        "cleavage_d3","blastocyst_d5","good_embryos"
    ]
    for col in fact_needed_cols:
        if col not in df.columns:
            df[col] = 0

    df["cycle_start_time_id"] = df["transfer_time_id"].map(date_to_id)

    fact_cycle = df.drop_duplicates(subset=["case_id"])[[
        "case_id","female_id","male_id","protocol_id","doctor_id","outcome_id",
        "cycle_start_time_id","e2_on_trigger","endometrium_thickness",
        "follicles_18mm","retrieved_oocytes","m2_count","gv_count",
        "injected_m2","fertilized_oocytes","fertilization_rate",
        "cleavage_d3","blastocyst_d5","good_embryos"
    ]]
    insert_or_ignore("fact_ivf_cycle", fact_cycle, conn)

    # ---------- FACT 2 ----------
    if all(col in df.columns for col in [
        "case_id","transfer_time_id","doctor_id","embryos_transferred"
    ]):
        tmp = df.drop_duplicates(subset=["case_id"]).copy()
        tmp["transfer_time_fk"] = tmp["transfer_time_id"].map(date_to_id)
        fact_transfer = tmp[[
            "case_id","transfer_time_fk","doctor_id","embryos_transferred",
            "pregnancy_test_result","clinical_pregnancy",
            "live_birth","outcome_id","success_probability_score"
        ]]
        insert_or_ignore("fact_transfer", fact_transfer, conn)

    # ---------- FACT 3 ----------
    try:
        existing_transfer = pd.read_sql("SELECT transfer_sk, case_id FROM fact_transfer;", conn)
        if not existing_transfer.empty and "embryo_id" in df.columns:
            df_merge = df.merge(existing_transfer, on="case_id", how="inner")
            fact_embryo = df_merge[["transfer_sk","embryo_id"]].drop_duplicates()
            insert_or_ignore("fact_transfer_embryo", fact_embryo, conn)
    except:
        logging.warning("fact_transfer_embryo skipped.")


# ===========================================================
#                    MAIN ETL
# ===========================================================
def run_full_etl(refresh=True):
    logging.info("===== ETL STARTED =====")
    if refresh:
        run_schema_sql()

    df = load_raw_df()
    df = clean_data(df)

    rename_map = {"mii_count": "m2_count", "injected_mii": "injected_m2"}
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    df = apply_placeholder_and_ids(df)

    conn = sqlite3.connect(STAR_DB)
    load_dimensions(df, conn, refresh=refresh)
    build_dim_time(df, conn)
    load_fact_tables(df, conn)
    conn.close()

    logging.info("ETL COMPLETED SUCCESSFULLY.")
    print("ETL Done ✔")


if __name__ == "__main__":
    run_full_etl(refresh=False)   # First time = True, then False


ETL Done ✔


In [4]:
import sqlite3
import pandas as pd

DB = r"E:\work\DEPI\graduation promax\data\warehouse_final\ivf_star_schema.db"
conn = sqlite3.connect(DB)

# 1️⃣  عرض كل الجداول الموجودة فعليًا:
print("\n--- ALL TABLES IN DB ---")
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
print(tables)

# 2️⃣  عرض عدد الصفوف لكل جدول:
print("\n--- ROW COUNTS ---")
for t in tables['name']:
    count = pd.read_sql(f"SELECT COUNT(*) as rows FROM {t};", conn)
    print(f"{t:<25} → {count['rows'][0]} rows")

# 3️⃣  عرض أول 3 صفوف من كل جدول (لو فيه بيانات):
print("\n--- SAMPLE DATA (LIMIT 3) ---")
for t in tables['name']:
    try:
        sample = pd.read_sql(f"SELECT * FROM {t} LIMIT 3;", conn)
        print(f"\nTABLE: {t}")
        print(sample)
    except:
        print(f"\nTABLE: {t} → Error or no rows")

conn.close()



--- ALL TABLES IN DB ---
                    name
0               dim_time
1        sqlite_sequence
2         fact_ivf_cycle
3          fact_transfer
4   fact_transfer_embryo
5             dim_female
6               dim_male
7           dim_protocol
8             dim_doctor
9            dim_outcome
10            dim_embryo

--- ROW COUNTS ---
dim_time                  → 3376 rows
sqlite_sequence           → 3 rows
fact_ivf_cycle            → 10003 rows
fact_transfer             → 10003 rows
fact_transfer_embryo      → 10003 rows
dim_female                → 10003 rows
dim_male                  → 10003 rows
dim_protocol              → 9770 rows
dim_doctor                → 9 rows
dim_outcome               → 1753 rows
dim_embryo                → 10003 rows

--- SAMPLE DATA (LIMIT 3) ---

TABLE: dim_time
   time_id   full_date  day  month month_name  quarter  year  week
0        1  2022-03-26   26      3      March        1  2022    12
1        2  2016-03-30   30      3      March        1

In [5]:
import sqlite3, pandas as pd
conn = sqlite3.connect(STAR_DB)

df = pd.read_sql("PRAGMA table_info(dim_time);", conn)
print(df)

conn.close()

   cid        name     type  notnull dflt_value  pk
0    0     time_id  INTEGER        0       None   1
1    1   full_date     TEXT        0       None   0
2    2         day  INTEGER        0       None   0
3    3       month  INTEGER        0       None   0
4    4  month_name     TEXT        0       None   0
5    5     quarter  INTEGER        0       None   0
6    6        year  INTEGER        0       None   0
7    7        week  INTEGER        0       None   0


In [3]:
import sqlite3
import pandas as pd

DB_PATH = r"E:\work\DEPI\graduation promax\data\warehouse_final\ivf_star_schema.db"

# Connect
conn = sqlite3.connect(DB_PATH)

queries = {
    "Total Female Patients": "SELECT COUNT(*) FROM dim_female;",
    "Total Male Patients": "SELECT COUNT(*) FROM dim_male;",
    "Protocol Distribution": """
        SELECT protocol_type, COUNT(*) 
        FROM dim_protocol
        GROUP BY protocol_type;
    """,
    "Success Outcome Counts": """
        SELECT outcome_id, COUNT(*) 
        FROM fact_transfer
        GROUP BY outcome_id;
    """,
    "Sample Dates (dim_time)": """
        SELECT * FROM dim_time LIMIT 5;
    """
}

for title, q in queries.items():
    print(f"\n--- {title} ---")
    try:
        df = pd.read_sql(q, conn)
        print(df)
    except Exception as e:
        print("Error:", e)

conn.close()
print("\nConnection closed.")



--- Total Female Patients ---
   COUNT(*)
0     10003

--- Total Male Patients ---
   COUNT(*)
0     10003

--- Protocol Distribution ---
  protocol_type  COUNT(*)
0    Antagonist      5307
1          Long      2519
2          Mild       783
3         Short      1158
4       Unknown         3

--- Success Outcome Counts ---
             outcome_id  COUNT(*)
0         out_high_high        25
1       out_high_normal      2277
2         out_high_poor       141
3          out_low_high        22
4        out_low_normal       494
5          out_low_poor         6
6     out_medium_normal         1
7     out_moderate_high       116
8   out_moderate_normal      6616
9     out_moderate_poor       304
10        out_none_none         1

--- Sample Dates (dim_time) ---
   time_id   full_date  day  month month_name  quarter  year  week
0        1  2022-03-26   26      3      March        1  2022    12
1        2  2016-03-30   30      3      March        1  2016    13
2        3  2018-09-06    6    

In [9]:
df = load_raw_df()
df = clean_data(df)
df = apply_placeholder_and_ids(df)
print(df.columns)   # عشان نتأكد الأعمدة موجودة

print(df[["case_id", "transfer_time_id", "embryos_transferred"]].head())  # نشوف عينات

print(df["embryos_transferred"].value_counts())  # نعرف لو كلها صفر أو فاضية



Index(['case_id', 'et_date', 'female_age', 'female_bmi', 'amh_level',
       'fsh_level', 'afc', 'male_age', 'male_factor',
       'semen_count_mill_per_ml', 'motility_percent', 'morphology_percent',
       'protocol_type', 'stimulation_days', 'total_fsh_dose', 'trigger_type',
       'e2_on_trigger', 'endometrium_thickness', 'follicles_18mm',
       'retrieved_oocytes', 'mii_count', 'mi_count', 'gv_count',
       'injected_mii', 'fertilized_oocytes', 'fertilization_rate',
       'cleavage_d3', 'blastocyst_d5', 'good_embryos', 'class_a_rate',
       'fresh_et_stage', 'embryos_transferred', 'grading',
       'pregnancy_test_result', 'clinical_pregnancy', 'live_birth',
       'success_probability_score', 'response_type', 'risk_level',
       'recommended_protocol', 'suggested_waiting_period_days',
       'failure_reason', 'doctor_recommendation', 'female_id', 'male_id',
       'transfer_time_id', 'protocol_id', 'doctor_id', 'outcome_id',
       'embryo_id'],
      dtype='object')
      ca

In [7]:
import sqlite3
import pandas as pd

# إنشاء DB جديدة
conn = sqlite3.connect(r"E:\work\DEPI\graduation promax\data\raw\ivf_patients_test.db")

# الداتا كاملة بنفس الشكل المطلوب للـ ETL
data = [
    # صف مكرر (للاختبار)
    ["CASE_TEST_001", "2023-03-07", 32, 24.8, 1.2, 5.0, 10, 38, "Normal", 55.0, 20.0, 5.0, 4, 4, 2, 0.5, 1, 1.0, "D5", 2, "B", "Negative", 0, 0, 0.76, "Poor", "High", "Antagonist", 90, "Unknown", "Good response"],
    ["CASE_TEST_001", "2023-03-07", 32, 24.8, 1.2, 5.0, 10, 38, "Normal", 55.0, 20.0, 5.0, 4, 4, 2, 0.5, 1, 1.0, "D5", 2, "B", "Negative", 0, 0, 0.76, "Poor", "High", "Antagonist", 90, "Unknown", "Good response"],

    # صف جديد
    ["CASE_TEST_002", "2022-11-15", 30, 22.5, 0.9, 4.3, 12, 41, "OAT", 40.0, 18.0, 6.0, 5, 5, 3, 0.6, 2, 0.8, "D3", 1, "C", "Positive", 1, 0, 0.88, "Normal", "Medium", "Mild", 45, "Implantation Failure", "Monitor closely"],

    # صف جديد (نصف البيانات ناقص → اختبار null handling)
    ["CASE_TEST_003", None, 29, None, None, 3.1, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 0.00, None, None, None, None, None, None]
]

columns = [
    "case_id","et_date","female_age","female_bmi","amh_level","fsh_level","afc",
    "male_age","male_factor","semen_count_mill_per_ml","motility_percent","morphology_percent",
    "retrieved_oocytes","mii_count","num_embryos_generated","fertilization_rate",
    "good_embryos","class_a_rate","fresh_et_stage","embryos_transferred","grading",
    "pregnancy_test_result","clinical_pregnancy","live_birth","success_probability_score",
    "response_type","risk_level","recommended_protocol","suggested_waiting_period_days",
    "failure_reason","doctor_recommendation"
]

df = pd.DataFrame(data, columns=columns)
df.to_sql("ivf_patients", conn, if_exists="replace", index=False)

conn.close()
print("✔ Test DB created successfully: ivf_patients_test.db")


✔ Test DB created successfully: ivf_patients_test.db


In [9]:
# Load the database and display full table
import sqlite3
import pandas as pd

db_path = r"E:\work\DEPI\graduation promax\data\raw\ivf_patients_test.db"
conn = sqlite3.connect(db_path)

df = pd.read_sql("SELECT * FROM ivf_patients", conn)
conn.close()

df


Unnamed: 0,case_id,et_date,female_age,female_bmi,amh_level,fsh_level,afc,male_age,male_factor,semen_count_mill_per_ml,...,pregnancy_test_result,clinical_pregnancy,live_birth,success_probability_score,response_type,risk_level,recommended_protocol,suggested_waiting_period_days,failure_reason,doctor_recommendation
0,CASE_TEST_001,2023-03-07,32,24.8,1.2,5.0,10.0,38.0,Normal,55.0,...,Negative,0.0,0.0,0.76,Poor,High,Antagonist,90.0,Unknown,Good response
1,CASE_TEST_001,2023-03-07,32,24.8,1.2,5.0,10.0,38.0,Normal,55.0,...,Negative,0.0,0.0,0.76,Poor,High,Antagonist,90.0,Unknown,Good response
2,CASE_TEST_002,2022-11-15,30,22.5,0.9,4.3,12.0,41.0,OAT,40.0,...,Positive,1.0,0.0,0.88,Normal,Medium,Mild,45.0,Implantation Failure,Monitor closely
3,CASE_TEST_003,,29,,,3.1,,,,,...,,,,0.0,,,,,,
