# SAE_Dashboard â€” Aggregation & Qualification (Phase 1)

## Goal
Load SAE Dashboard data across all studies (DM and Safety views),
normalize schemas, perform initial structural checks, and prepare
for canonical aggregation to subject/site level.


In [1]:
# imports
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

DATA_DIR = Path("../data")
INTERMEDIATE_DIR = DATA_DIR / "intermediate"

inventory = pd.read_csv(INTERMEDIATE_DIR / "file_inventory.csv")


In [2]:
# Load SAE Dashboard files
sae_files = inventory[
    inventory.file_type == "sae"
]["file_path"]

assert len(sae_files) > 0, "No SAE Dashboard files found"

dfs = []

for f in sae_files:
    f = Path(f)
    assert f.exists(), f"File not found: {f}"

    df = pd.read_excel(f)
    df["source_file"] = f.name
    dfs.append(df)

sae_master = pd.concat(dfs, ignore_index=True)

assert sae_master.shape[0] > 0, "SAE master dataframe is empty"

sae_master.head()


  sae_master = pd.concat(dfs, ignore_index=True)


Unnamed: 0,Discrepancy ID,Study ID,Country,Site,Patient ID,Form Name,Discrepancy Created Timestamp in Dashboard,Review Status,Action Status,source_file,Case Status,Record Position,Report Update Required
0,643026,Study 5,ESP,Site 300,Subject 2487,Form 1,2025-04-25 20:53:10,Review Completed,No action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,
1,643027,Study 5,ESP,Site 302,Subject 2518,Form 1,2025-09-15 20:53:11,Review Completed,No action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,
2,643028,Study 5,USA,Site 337,Subject 2745,Form 1,2025-09-02 20:52:34,Review Completed,No action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,
3,682028,Study 5,ESP,Site 300,Subject 2487,Form 1,2025-04-25 20:56:49,Review Completed,No action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,
4,682029,Study 5,ESP,Site 302,Subject 2518,Form 1,2025-09-15 20:56:45,Review Completed,No action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,


In [3]:
sae_master.shape

(17098, 13)

In [4]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace("%", "pct", regex=False)
        .str.replace("#", "num_", regex=False)
        .str.replace(r"[^\w]+", "_", regex=True)
        .str.replace(r"_+", "_", regex=True)
        .str.strip("_")
    )
    return df

sae_master = normalize_columns(sae_master)


In [5]:
sae_master.columns.tolist()

['discrepancy_id',
 'study_id',
 'country',
 'site',
 'patient_id',
 'form_name',
 'discrepancy_created_timestamp_in_dashboard',
 'review_status',
 'action_status',
 'source_file',
 'case_status',
 'record_position',
 'report_update_required']

In [6]:
# drop unnamed columns
sae_master = sae_master.loc[
    :, ~sae_master.columns.str.startswith("unnamed")
]

# drop all-null columns
sae_master = sae_master.dropna(axis=1, how="all")

# ensure no duplicate columns
assert sae_master.columns.duplicated().sum() == 0, \
    "Duplicate columns detected after normalization"


In [7]:
sae_master.columns.tolist()


['discrepancy_id',
 'study_id',
 'country',
 'site',
 'patient_id',
 'form_name',
 'discrepancy_created_timestamp_in_dashboard',
 'review_status',
 'action_status',
 'source_file',
 'case_status',
 'record_position',
 'report_update_required']

In [8]:
# --- create canonical identifiers ---

# subject / patient
sae_master["subject_id"] = (
    sae_master["patient_id"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)

# study_id (already present, normalize)
sae_master["study_id"] = (
    sae_master["study_id"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)

# site_id
sae_master["site_id"] = (
    sae_master["site"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)


In [9]:
CANONICAL_KEYS = ["subject_id", "study_id", "site_id"]

sae_master[CANONICAL_KEYS].isna().sum()


subject_id    0
study_id      0
site_id       0
dtype: int64

In [10]:
for col in ["review_status", "action_status", "case_status"]:
    if col in sae_master.columns:
        sae_master[col] = (
            sae_master[col]
            .astype(str)
            .str.lower()
            .str.strip()
        )


In [11]:
sae_master["is_review_pending"] = ~sae_master["review_status"].str.contains(
    "completed|done|review completed", na=False
)

sae_master["is_action_pending"] = ~sae_master["action_status"].str.contains(
    "no action|required completed|completed", na=False
)

sae_master["is_case_open"] = ~sae_master["case_status"].str.contains(
    "closed|locked", na=False
)


In [12]:
sae_master["is_open_sae"] = (
    sae_master["is_review_pending"]
    | sae_master["is_action_pending"]
    | sae_master["is_case_open"]
)


In [13]:
sae_master.shape

(17098, 19)

In [14]:
sae_master.head()

Unnamed: 0,discrepancy_id,study_id,country,site,patient_id,form_name,discrepancy_created_timestamp_in_dashboard,review_status,action_status,source_file,case_status,record_position,report_update_required,subject_id,site_id,is_review_pending,is_action_pending,is_case_open,is_open_sae
0,643026,Study 5,ESP,Site 300,Subject 2487,Form 1,2025-04-25 20:53:10,review completed,no action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,,Subject 2487,Site 300,False,False,True,True
1,643027,Study 5,ESP,Site 302,Subject 2518,Form 1,2025-09-15 20:53:11,review completed,no action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,,Subject 2518,Site 302,False,False,True,True
2,643028,Study 5,USA,Site 337,Subject 2745,Form 1,2025-09-02 20:52:34,review completed,no action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,,Subject 2745,Site 337,False,False,True,True
3,682028,Study 5,ESP,Site 300,Subject 2487,Form 1,2025-04-25 20:56:49,review completed,no action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,,Subject 2487,Site 300,False,False,True,True
4,682029,Study 5,ESP,Site 302,Subject 2518,Form 1,2025-09-15 20:56:45,review completed,no action required,Study 5_eSAE_Dashboard_DM_Safety_updated.xlsx,,,,Subject 2518,Site 302,False,False,True,True


In [15]:
agg_dict = {
    "discrepancy_id": "count",        # total SAE discrepancies
    "is_open_sae": "sum",             # unresolved SAE count
    "is_review_pending": "sum",
    "is_action_pending": "sum",
    "is_case_open": "sum",
}


In [16]:
DESCRIPTIVE_COLS = []

for col in ["country", "report_update_required"]:
    if col in sae_master.columns:
        DESCRIPTIVE_COLS.append(col)

for col in DESCRIPTIVE_COLS:
    agg_dict[col] = "first"


In [17]:
sae_agg = (
    sae_master
    .groupby(CANONICAL_KEYS, dropna=False)
    .agg(agg_dict)
    .reset_index()
)


In [18]:
sae_agg = sae_agg.rename(columns={
    "discrepancy_id": "num_sae_discrepancies",
    "is_open_sae": "num_open_sae",
    "is_review_pending": "num_review_pending",
    "is_action_pending": "num_action_pending",
    "is_case_open": "num_case_open",
})


In [19]:
assert sae_agg.shape[0] > 0
assert sae_agg.duplicated(CANONICAL_KEYS).sum() == 0

for col in [
    "num_sae_discrepancies",
    "num_open_sae",
    "num_review_pending",
    "num_action_pending",
    "num_case_open",
]:
    assert (sae_agg[col] >= 0).all(), f"Negative values in {col}"


In [20]:
assert (sae_agg["num_open_sae"] <= sae_agg["num_sae_discrepancies"]).all()


In [21]:
for col in CANONICAL_KEYS:
    sae_agg[col] = sae_agg[col].astype(str).str.strip()

for col in DESCRIPTIVE_COLS:
    sae_agg[col] = sae_agg[col].astype(str).str.strip()


In [22]:
out_parquet = INTERMEDIATE_DIR / "sae_dashboard_agg.parquet"
out_csv = INTERMEDIATE_DIR / "sae_dashboard_agg.csv"

sae_agg.to_parquet(out_parquet, index=False)
sae_agg.to_csv(out_csv, index=False)

(out_parquet, out_csv)


(PosixPath('../data/intermediate/sae_dashboard_agg.parquet'),
 PosixPath('../data/intermediate/sae_dashboard_agg.csv'))

In [23]:
sae_agg.columns.tolist()

['subject_id',
 'study_id',
 'site_id',
 'num_sae_discrepancies',
 'num_open_sae',
 'num_review_pending',
 'num_action_pending',
 'num_case_open',
 'country',
 'report_update_required']