# Compiled_EDRR — Aggregation & Qualification (Phase 1)

## Goal
Load Compiled EDRR data across all studies, normalize schemas,
perform initial structural checks, and prepare for canonical aggregation
to subject/site level.


In [1]:
# imports
import pandas as pd
import numpy as np
from pathlib import Path

# Widen the notebook display so wide frames are not truncated when rendered.
pd.options.display.max_columns = 200
pd.options.display.width = 140

# Folder layout: the file inventory is read from the intermediate folder.
DATA_DIR = Path("../data")
INTERMEDIATE_DIR = DATA_DIR.joinpath("intermediate")

# File inventory table (one row per known input file).
inventory_path = INTERMEDIATE_DIR.joinpath("file_inventory.csv")
inventory = pd.read_csv(inventory_path)


In [2]:
# Load Compiled EDRR files
# Inventory rows tagged "edrr" give the workbook paths to combine.
is_edrr = inventory["file_type"] == "edrr"
edrr_files = inventory.loc[is_edrr, "file_path"]

assert len(edrr_files) > 0, "No Compiled_EDRR files found"

dfs = []
for raw_path in edrr_files:
    workbook = Path(raw_path)
    assert workbook.exists(), f"File not found: {workbook}"

    # Tag every row with the workbook it came from so provenance survives the concat.
    dfs.append(pd.read_excel(workbook).assign(source_file=workbook.name))

# Stack all workbooks into one frame with a fresh 0..n-1 index.
edrr_master = pd.concat(dfs, ignore_index=True)

assert len(edrr_master) > 0, "Compiled EDRR master dataframe is empty"

edrr_master.head()


Unnamed: 0,Study,Subject,Total Open issue Count per subject,.,source_file
0,Study 21,Subject 11913,1,,Compiled_EDRR_2025_Nov_12_12_17_01_updated.xlsx
1,Study 21,Subject 6754,1,,Compiled_EDRR_2025_Nov_12_12_17_01_updated.xlsx
2,Study 21,Subject 7446,1,,Compiled_EDRR_2025_Nov_12_12_17_01_updated.xlsx
3,Study 21,Subject 27371,1,,Compiled_EDRR_2025_Nov_12_12_17_01_updated.xlsx
4,Study 21,Subject 11187,1,,Compiled_EDRR_2025_Nov_12_12_17_01_updated.xlsx


In [3]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are snake_case-style names.

    Each label is converted to text, trimmed, and lower-cased; ``%`` becomes
    ``pct`` and ``#`` becomes ``num_``; every run of non-word characters is
    replaced by a single underscore; repeated underscores are collapsed and
    leading/trailing underscores removed.

    A label made only of punctuation (for example ``"."``) ends up as the
    empty string, and distinct labels may normalize to the same name; this
    function does not de-duplicate.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column labels should be normalized. It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the normalized column labels.
    """
    names = df.columns.astype(str).str.strip().str.lower()

    # Spell out symbols that would otherwise be lost in the punctuation pass.
    for symbol, spelled in (("%", "pct"), ("#", "num_")):
        names = names.str.replace(symbol, spelled, regex=False)

    # Any run of non-word characters becomes one underscore ...
    names = names.str.replace(r"[^\w]+", "_", regex=True)
    # ... then underscore runs are collapsed and the edges trimmed.
    names = names.str.replace(r"_+", "_", regex=True).str.strip("_")

    renamed = df.copy()
    renamed.columns = names
    return renamed

# Normalize the combined frame's column labels; the cells below refer to the normalized names.
edrr_master = normalize_columns(edrr_master)


In [6]:
# Row and column counts of the combined frame.
edrr_master.shape

(583, 5)

In [7]:
# List the current column names.
edrr_master.columns.tolist()

['study', 'subject', 'total_open_issue_count_per_subject', '', 'source_file']

In [8]:
# Remove columns that carry no information:
#   1. columns whose normalized name starts with "unnamed";
#   2. columns that are null in every row.
is_unnamed = edrr_master.columns.str.startswith("unnamed")
edrr_master = edrr_master.loc[:, ~is_unnamed].dropna(axis=1, how="all")

# Distinct raw headers can normalize to the same name; stop here if that happened.
has_duplicate_names = edrr_master.columns.duplicated().any()
assert not has_duplicate_names, \
    "Duplicate columns detected after normalization"


In [9]:
# List the column names again to confirm the cleanup.
edrr_master.columns.tolist()

['study', 'subject', 'total_open_issue_count_per_subject', 'source_file']

In [10]:
def _clean_key(raw: pd.Series) -> pd.Series:
    """Return ``raw`` as stripped text, with missing or blank entries set to NaN.

    Missingness is taken from the raw values *before* they are converted to
    text, so ``None`` / ``pd.NA`` / ``NaT`` do not turn into the bogus ids
    ``"None"`` / ``"<NA>"`` / ``"NaT"``. Blank strings and the literal text
    ``"nan"`` (after stripping) are also treated as missing.

    Parameters
    ----------
    raw : pd.Series
        Source identifier column.

    Returns
    -------
    pd.Series
        Stripped string values aligned to ``raw``'s index, NaN where missing.
    """
    text = raw.astype(str).str.strip()
    is_missing = raw.isna() | text.isin(["", "nan"])
    return text.mask(is_missing, np.nan)


# subject_id (primary) and study_id (secondary, required) are cleaned the same
# way. ``assign`` builds a new frame instead of writing into the existing one,
# so re-running this cell gives the same result.
edrr_master = edrr_master.assign(
    subject_id=_clean_key(edrr_master["subject"]),
    study_id=_clean_key(edrr_master["study"]),
)


In [12]:
# Count missing values in each column.
edrr_master.isna().sum()

study                                 0
subject                               2
total_open_issue_count_per_subject    0
source_file                           0
subject_id                            2
study_id                              0
dtype: int64

In [13]:
# Drop null subject (strict): a row without a subject_id is removed.
rows_before = len(edrr_master)

# Compute the mask once; it drives both the reported count and the filter.
has_subject = edrr_master["subject_id"].notna()
null_subject_rows = (~has_subject).sum()

edrr_master = edrr_master[has_subject].reset_index(drop=True)
rows_after = len(edrr_master)

print(f"Dropped {null_subject_rows} rows with null subject_id")


Dropped 2 rows with null subject_id


In [14]:
# Both key columns must be fully populated from this point on.
assert not edrr_master["subject_id"].isna().any(), "Null subject_id remains"
assert not edrr_master["study_id"].isna().any(), "Null study_id remains"


In [15]:
METRIC_COL = "total_open_issue_count_per_subject"

# Coerce to numeric; anything unparseable becomes NaN and is caught by the
# first assertion below rather than raising inside to_numeric.
metric_values = pd.to_numeric(edrr_master[METRIC_COL], errors="coerce")
edrr_master[METRIC_COL] = metric_values

assert not metric_values.isna().any(), \
    "Null values in total_open_issue_count_per_subject"

assert metric_values.ge(0).all(), \
    "Negative open issue counts detected"


In [16]:
# Key columns that identify one row of the aggregated output.
CANONICAL_KEYS = ["subject_id", "study_id"]


In [17]:
# One row per key combination, keeping the largest metric value seen for it.
# NOTE(review): "max" presumably collapses repeated rows for the same subject
# across workbooks — confirm that summing is not the intended behavior.
grouped_metric = edrr_master.groupby(CANONICAL_KEYS, dropna=False)[METRIC_COL]
edrr_agg = grouped_metric.max().reset_index()


In [18]:
# Re-normalize the key columns of the aggregate to stripped plain strings.
edrr_agg = edrr_agg.assign(
    **{key: edrr_agg[key].astype(str).str.strip() for key in CANONICAL_KEYS}
)


In [19]:
# Persist the aggregate in two formats in the intermediate folder.
out_parquet = INTERMEDIATE_DIR.joinpath("compiled_edrr_agg.parquet")
out_csv = INTERMEDIATE_DIR.joinpath("compiled_edrr_agg.csv")

# index=False: the positional index carries no information.
edrr_agg.to_parquet(out_parquet, index=False)
edrr_agg.to_csv(out_csv, index=False)

# Show where the files were written.
(out_parquet, out_csv)


(PosixPath('../data/intermediate/compiled_edrr_agg.parquet'),
 PosixPath('../data/intermediate/compiled_edrr_agg.csv'))