# Global_Missing_Pages_Report â€” Aggregation & Qualification (Phase 1)

## Goal
Load Global Missing Pages data across all studies, normalize schemas,
perform initial structural checks, and prepare for canonical aggregation
to subject/site level.


In [17]:
# imports
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

DATA_DIR = Path("../data")
INTERMEDIATE_DIR = DATA_DIR / "intermediate"

inventory = pd.read_csv(INTERMEDIATE_DIR / "file_inventory.csv")


In [18]:
# Load Global Missing Pages files
missing_pages_files = inventory[
    inventory.file_type == "missing_pages"
]["file_path"]

assert len(missing_pages_files) > 0, "No Global_Missing_Pages_Report files found"

dfs = []

for f in missing_pages_files:
    f = Path(f)
    assert f.exists(), f"File not found: {f}"

    df = pd.read_excel(f)
    df["source_file"] = f.name
    dfs.append(df)

missing_pages_master = pd.concat(dfs, ignore_index=True)

assert missing_pages_master.shape[0] > 0, "Missing Pages master dataframe is empty"

missing_pages_master.head()


  missing_pages_master = pd.concat(dfs, ignore_index=True)


Unnamed: 0,Study Name,SiteGroupName(CountryName),SiteNumber,SubjectName,Overall Subject Status,Visit Level Subject Status,FolderName,Visit date,Form Type (Summary or Visit),FormName,No. #Days Page Missing,source_file,Form Details,Country,Site Number,Subject Name,Visit Name,Page Name,Subject Status,# of Days Missing,Form 1 Subject Status,Study,Visit,Visit Date,Screening Visit Date
0,Study 21,,Site 456,Subject 5701,Subject continuing,Subject continuing,Month33,Missing visit date,Visit Level,Form 1,,Study 21_Missing_Pages_Report_06Nov2025_update...,,,,,,,,,,,,,
1,Study 21,,Site 456,Subject 5701,Subject continuing,Subject continuing,Month33,Missing visit date,Visit Level,Form 2,,Study 21_Missing_Pages_Report_06Nov2025_update...,,,,,,,,,,,,,
2,Study 21,,Site 456,Subject 5701,Subject continuing,Subject continuing,Month33,Missing visit date,Visit Level,Form 3,,Study 21_Missing_Pages_Report_06Nov2025_update...,,,,,,,,,,,,,
3,Study 21,,Site 456,Subject 5721,Subject continuing,Subject continuing,Month27,26 SEP 2025,Visit Level,Form 1,40.0,Study 21_Missing_Pages_Report_06Nov2025_update...,,,,,,,,,,,,,
4,Study 21,,Site 789,Subject 5844,,,EOS,Missing visit date,Visit Level,Form 1,,Study 21_Missing_Pages_Report_06Nov2025_update...,,,,,,,,,,,,,


In [19]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace("%", "pct", regex=False)
        .str.replace("#", "num_", regex=False)
        .str.replace(r"[^\w]+", "_", regex=True)
        .str.replace(r"_+", "_", regex=True)
        .str.strip("_")
    )
    return df

missing_pages_master = normalize_columns(missing_pages_master)


In [20]:
missing_pages_master.columns.tolist()

['study_name',
 'sitegroupname_countryname',
 'sitenumber',
 'subjectname',
 'overall_subject_status',
 'visit_level_subject_status',
 'foldername',
 'visit_date',
 'form_type_summary_or_visit',
 'formname',
 'no_num_days_page_missing',
 'source_file',
 'form_details',
 'country',
 'site_number',
 'subject_name',
 'visit_name',
 'page_name',
 'subject_status',
 'num_of_days_missing',
 'form_1_subject_status',
 'study',
 'visit',
 'visit_date',
 'screening_visit_date']

In [21]:
# drop unnamed columns
missing_pages_master = missing_pages_master.loc[
    :, ~missing_pages_master.columns.str.startswith("unnamed")
]

# drop all-null columns
missing_pages_master = missing_pages_master.dropna(axis=1, how="all")

# ensure no duplicate columns
assert missing_pages_master.columns.duplicated().sum() == 0, \
    "Duplicate columns detected after normalization"


In [22]:
missing_pages_master.columns.tolist()

['study_name',
 'sitegroupname_countryname',
 'sitenumber',
 'subjectname',
 'overall_subject_status',
 'visit_level_subject_status',
 'foldername',
 'visit_date',
 'form_type_summary_or_visit',
 'formname',
 'no_num_days_page_missing',
 'source_file',
 'form_details',
 'country',
 'site_number',
 'subject_name',
 'visit_name',
 'page_name',
 'subject_status',
 'num_of_days_missing',
 'form_1_subject_status']

In [23]:
missing_pages_master.shape

(2747, 21)

In [24]:
# --- create canonical identifiers ---

# study_id
missing_pages_master["study_id"] = (
    missing_pages_master["study_name"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)

# site_id
missing_pages_master["site_id"] = (
    missing_pages_master["site_number"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)

# subject_id
missing_pages_master["subject_id"] = (
    missing_pages_master["subject_name"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)


In [25]:
CANONICAL_KEYS = ["study_id", "site_id", "subject_id"]

missing_pages_master[CANONICAL_KEYS].isna().sum()


study_id       193
site_id       2554
subject_id    2528
dtype: int64

In [26]:
# recover study_id from filename where missing
missing_pages_master.loc[
    missing_pages_master["study_id"].isna(),
    "study_id"
] = (
    missing_pages_master.loc[
        missing_pages_master["study_id"].isna(),
        "source_file"
    ]
    .str.extract(r"(Study[\s_]*\d+)", expand=False)
)

# assert recovery
assert missing_pages_master["study_id"].notna().all(), \
    "Unrecoverable study_id values remain"


In [27]:
rows_before = len(missing_pages_master)
null_subject_rows = missing_pages_master["subject_id"].isna().sum()

print(f"Dropping {null_subject_rows} rows without subject_id "
      f"({null_subject_rows/rows_before:.1%} of dataset)")

missing_pages_master = missing_pages_master[
    missing_pages_master["subject_id"].notna()
]

rows_after = len(missing_pages_master)
rows_before, rows_after


Dropping 2528 rows without subject_id (92.0% of dataset)


(2747, 219)

In [31]:
CANONICAL_KEYS = ["study_id", "site_id", "subject_id"]

# Drop rows with unrecoverable site_id
remaining_null_sites = missing_pages_master["site_id"].isna().sum()
if remaining_null_sites > 0:
    print(f"Dropping {remaining_null_sites} rows with unrecoverable site_id")
    missing_pages_master = missing_pages_master[
        missing_pages_master["site_id"].notna()
    ]



In [32]:
assert missing_pages_master[CANONICAL_KEYS].isna().sum().sum() == 0, \
    "Null values still present in canonical keys"



In [33]:
missing_pages_master[CANONICAL_KEYS].isna().sum()

study_id      0
site_id       0
subject_id    0
dtype: int64

In [34]:
METRIC_COL = "num_of_days_missing"

# coerce to numeric
before_non_null = missing_pages_master[METRIC_COL].notna().sum()

missing_pages_master[METRIC_COL] = pd.to_numeric(
    missing_pages_master[METRIC_COL],
    errors="coerce"
)

after_non_null = missing_pages_master[METRIC_COL].notna().sum()

print(f"Coercion dropped {before_non_null - after_non_null} non-numeric values")

# sanity
assert (missing_pages_master[METRIC_COL].dropna() >= 0).all(), \
    "Negative days missing detected"


Coercion dropped 0 non-numeric values


In [38]:
# Building aggregate
agg_dict = {
    METRIC_COL: "max"
}

DESCRIPTIVE_COLS = ["country"]

for col in DESCRIPTIVE_COLS:
    if col in missing_pages_master.columns:
        agg_dict[col] = "first"

missing_pages_agg = (
    missing_pages_master
    .groupby(CANONICAL_KEYS, dropna=False)
    .agg(agg_dict)
    .reset_index()
)

In [40]:
missing_pages_agg[METRIC_COL].describe(percentiles=[0.95, 0.99, 0.999])


count     21.000000
mean      66.333333
std       43.439997
min        2.000000
50%       58.000000
95%      115.000000
99%      197.400000
99.9%    215.940000
max      218.000000
Name: num_of_days_missing, dtype: float64

In [41]:
missing_pages_agg.sort_values(METRIC_COL, ascending=False).head(10)


Unnamed: 0,study_id,site_id,subject_id,num_of_days_missing,country
32,Study 1,Site 7,Subject 14,218.0,ISR
7,Study 1,Site 15,Subject 47,115.0,ESP
16,Study 1,Site 19,Subject 78,107.0,USA
6,Study 1,Site 15,Subject 45,99.0,ESP
27,Study 1,Site 4,Subject 10,80.0,FRA
17,Study 1,Site 19,Subject 80,67.0,USA
19,Study 1,Site 21,Subject 83,63.0,USA
0,Study 1,Site 11,Subject 21,61.0,SGP
10,Study 1,Site 17,Subject 60,60.0,ESP
28,Study 1,Site 4,Subject 9,59.0,FRA


In [43]:
# Dropping negative days in METRIC_COL
missing_pages_agg = missing_pages_agg[
    missing_pages_agg[METRIC_COL] >= 0
]

In [44]:
# non-negativity is the only hard rule
assert (missing_pages_agg[METRIC_COL] >= 0).all(), \
    "Negative days missing detected"


In [45]:
assert missing_pages_agg.shape[0] > 0, "Aggregation produced empty dataframe"
assert missing_pages_agg.duplicated(CANONICAL_KEYS).sum() == 0, \
    "Duplicate rows after aggregation"

# metric sanity
assert missing_pages_agg[METRIC_COL].between(0, 10_000).all(), \
    "Unrealistic days missing values detected"


In [47]:
for col in CANONICAL_KEYS:
    missing_pages_agg[col] = (
        missing_pages_agg[col]
        .astype(str)
        .str.strip()
    )


In [48]:
out_parquet = INTERMEDIATE_DIR / "global_missing_pages_agg.parquet"
out_csv = INTERMEDIATE_DIR / "global_missing_pages_agg.csv"

missing_pages_agg.to_parquet(out_parquet, index=False)
missing_pages_agg.to_csv(out_csv, index=False)

(out_parquet, out_csv)


(PosixPath('../data/intermediate/global_missing_pages_agg.parquet'),
 PosixPath('../data/intermediate/global_missing_pages_agg.csv'))