# Missing_Lab_Name_and_Missing_Ranges — Aggregation & Qualification (Phase 1)

## Goal
Load Missing Lab Name and Missing Ranges data across all studies,
normalize schemas, perform initial structural checks, and prepare
for canonical aggregation to subject/site level.


In [1]:
# imports
import pandas as pd
import numpy as np
from pathlib import Path

# Notebook display settings: allow up to 200 columns and a 140-character width.
for _option, _value in (("display.max_columns", 200), ("display.width", 140)):
    pd.set_option(_option, _value)

# Data locations, given relative to the notebook's working directory.
DATA_DIR = Path("../data")
INTERMEDIATE_DIR = DATA_DIR.joinpath("intermediate")

# File inventory; the loading cell reads its `file_type` and `file_path` columns.
inventory = pd.read_csv(INTERMEDIATE_DIR.joinpath("file_inventory.csv"))


In [3]:
# Load Missing Lab Name and Missing Ranges files
# Inventory rows tagged "lab_missing" supply the workbook paths.
_is_lab_missing = inventory["file_type"] == "lab_missing"
missing_lab_files = inventory.loc[_is_lab_missing, "file_path"]

assert len(missing_lab_files) > 0, "No Missing_Lab_Name_and_Missing_Ranges files found"

# Read every workbook and tag its rows with the name of the file they came from.
dfs = []
for f in map(Path, missing_lab_files):
    assert f.exists(), f"File not found: {f}"
    dfs.append(pd.read_excel(f).assign(source_file=f.name))

# Stack all files into a single frame with a fresh 0..n-1 index.
missing_lab_master = pd.concat(dfs, ignore_index=True)

assert len(missing_lab_master) > 0, "Missing Lab master dataframe is empty"

missing_lab_master.head()


Unnamed: 0,Country,Site number,Subject,Visit,Form Name,Lab category,Lab Date,Test Name,Test description,Issue,source_file,Comments,Study,Form Name.1
0,ARG,Site 165,Subject 2590,Screening,Form 1,HEMATOLOGY,15-SEP-2025,INR,Prothrombin intl.\nnormalized ratio,Ranges/ Units not entered,Study 5_Missing_Lab_Name_And_Missing_Ranges_up...,,,
1,ARG,Site 165,Subject 2590,Screening,Form 2,URINALYSIS,15-SEP-2025,UBLOST,Urine blood dipstick,Missing Lab name,Study 5_Missing_Lab_Name_And_Missing_Ranges_up...,,,
2,ARG,Site 165,Subject 2590,Screening,Form 2,URINALYSIS,15-SEP-2025,UPHST,Urine pH dipstick,Missing Lab name,Study 5_Missing_Lab_Name_And_Missing_Ranges_up...,,,
3,ARG,Site 165,Subject 2590,Screening,Form 2,URINALYSIS,15-SEP-2025,USPGRST,Urine specific\ngravity dipstick,Missing Lab name,Study 5_Missing_Lab_Name_And_Missing_Ranges_up...,,,
4,ARG,Site 165,Subject 2590,Screening,Form 3,URINALYSIS,15-SEP-2025,CYURIAC,Uric Acid Crystals,Ranges/ Units not entered,Study 5_Missing_Lab_Name_And_Missing_Ranges_up...,,,


In [4]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are snake_case identifiers.

    Each label is converted to ``str``, stripped and lower-cased; ``%`` is
    spelled out as ``pct`` and ``#`` as ``num_``; every run of non-word
    characters becomes a single underscore; leading and trailing
    underscores are removed. The input frame itself is not modified.

    Different source labels can end up with the same normalized name; this
    function does not de-duplicate them.
    """
    labels = df.columns.astype(str).str.strip().str.lower()

    # Literal symbol substitutions first, so the symbols survive as words.
    for symbol, word in (("%", "pct"), ("#", "num_")):
        labels = labels.str.replace(symbol, word, regex=False)

    # Then squash non-word runs, and any resulting underscore runs, to one "_".
    for pattern in (r"[^\w]+", r"_+"):
        labels = labels.str.replace(pattern, "_", regex=True)

    renamed = df.copy()
    renamed.columns = labels.str.strip("_")
    return renamed

# Replace the combined frame with a copy that has normalized column names.
missing_lab_master = normalize_columns(missing_lab_master)


In [7]:
# Row and column count of the combined frame.
missing_lab_master.shape

(20416, 14)

In [6]:
# List the current column names.
missing_lab_master.columns.tolist()

['country',
 'site_number',
 'subject',
 'visit',
 'form_name',
 'lab_category',
 'lab_date',
 'test_name',
 'test_description',
 'issue',
 'source_file',
 'comments',
 'study',
 'form_name']

In [9]:
# Drop columns whose normalized name starts with "unnamed".
missing_lab_master = missing_lab_master.loc[
    :, ~missing_lab_master.columns.str.startswith("unnamed")
]

# Drop columns that contain no data at all.
missing_lab_master = missing_lab_master.dropna(axis=1, how="all")

# Normalization can map several source headers onto the same name. Keeping
# only the first such column would discard values that exist solely in a
# later duplicate, so the duplicates are merged row by row (first non-null
# value wins) before the extra columns are removed.
_is_repeat = missing_lab_master.columns.duplicated()
_merged = {}
for _name in missing_lab_master.columns[_is_repeat].unique():
    _same_name = missing_lab_master.loc[:, missing_lab_master.columns == _name]
    _values = _same_name.iloc[:, 0]
    for _pos in range(1, _same_name.shape[1]):
        _values = _values.combine_first(_same_name.iloc[:, _pos])
    _merged[_name] = _values

# .copy() yields an independent frame, so the column assignments below (and
# in later cells) do not operate on a slice of the previous frame.
missing_lab_master = missing_lab_master.loc[:, ~_is_repeat].copy()
for _name, _values in _merged.items():
    missing_lab_master[_name] = _values

# ensure no duplicate columns
assert missing_lab_master.columns.duplicated().sum() == 0, \
    "Duplicate columns detected after normalization"


In [11]:
# List the column names again to confirm the result of the column cleanup.
missing_lab_master.columns.tolist()

['country',
 'site_number',
 'subject',
 'visit',
 'form_name',
 'lab_category',
 'lab_date',
 'test_name',
 'test_description',
 'issue',
 'source_file',
 'comments',
 'study']

In [12]:
# --- create canonical identifiers ---

# Empty strings and the text "nan" are both treated as missing.
_BLANK_TO_NAN = {"": np.nan, "nan": np.nan}


def _as_identifier(values):
    """Return ``values`` as stripped strings, with blank/"nan" entries set to NaN."""
    return values.astype(str).str.strip().replace(_BLANK_TO_NAN)


# subject_id (primary)
missing_lab_master["subject_id"] = _as_identifier(missing_lab_master["subject"])

# site_id (secondary but required)
missing_lab_master["site_id"] = _as_identifier(missing_lab_master["site_number"])

# study_id (descriptive only) - created only when a study column is present
if "study" in missing_lab_master.columns:
    missing_lab_master["study_id"] = _as_identifier(missing_lab_master["study"])


In [13]:
# Columns that identify one output row in the subject/site aggregation.
# NOTE(review): study is not part of the key; if subject/site labels can
# repeat across studies, their rows would be pooled together - confirm that
# this is intended.
CANONICAL_KEYS = ["subject_id", "site_id"]

# Number of missing values in each key column.
missing_lab_master[CANONICAL_KEYS].isna().sum()


subject_id    0
site_id       0
dtype: int64

In [14]:
# Lower-cased, whitespace-trimmed text form of the issue label.
# astype(str) also converts a missing issue into text, so no entry is null
# after this step.
_issue_text = missing_lab_master["issue"].astype(str)
missing_lab_master["issue_norm"] = _issue_text.str.lower().str.strip()


In [15]:
# Boolean flag per issue type, set when the normalized issue text matches the
# pattern (patterns are regular expressions; a row may match more than one).
_flag_patterns = {
    "is_missing_lab_name": "lab name",
    "is_missing_ranges_units": "range|unit",
}
for _flag, _pattern in _flag_patterns.items():
    missing_lab_master[_flag] = missing_lab_master["issue_norm"].str.contains(
        _pattern, na=False
    )


In [16]:
# Column -> aggregation used when collapsing rows to one per key:
# a count of issue rows, and sums of the two boolean flags.
agg_dict = dict(
    issue_norm="count",  # total issues
    is_missing_lab_name="sum",
    is_missing_ranges_units="sum",
)


In [17]:
# Optional descriptive columns, kept only if they exist in the frame.
DESCRIPTIVE_COLS = [
    name
    for name in ("country", "lab_category", "study_id")
    if name in missing_lab_master.columns
]

# Each descriptive column is carried into the aggregate via "first".
agg_dict.update(dict.fromkeys(DESCRIPTIVE_COLS, "first"))


In [18]:
# Collapse to one row per key combination; dropna=False keeps groups whose
# key values are missing instead of silently discarding them.
_by_key = missing_lab_master.groupby(CANONICAL_KEYS, dropna=False)
missing_lab_agg = _by_key.agg(agg_dict).reset_index()


In [19]:
# Give the aggregated columns names that say what they count.
_count_names = {
    "issue_norm": "num_lab_issues",
    "is_missing_lab_name": "num_missing_lab_name",
    "is_missing_ranges_units": "num_missing_ranges_units",
}
missing_lab_agg = missing_lab_agg.rename(columns=_count_names)


In [20]:
# Sanity checks on the aggregate: it is non-empty and has exactly one row
# per key combination.
assert len(missing_lab_agg) > 0
assert not missing_lab_agg.duplicated(CANONICAL_KEYS).any()

# Every count column must be non-negative.
_count_columns = (
    "num_lab_issues",
    "num_missing_lab_name",
    "num_missing_ranges_units",
)
for col in _count_columns:
    assert missing_lab_agg[col].ge(0).all(), f"Negative values in {col}"


In [21]:
# Trim whitespace on the key and descriptive columns before export.
# A blanket astype(str) would turn missing values into the literal text
# "nan" in the output (the aggregation keeps null-key groups via
# dropna=False, and a descriptive column can have no value for a group).
# The stripped text is therefore kept only where the original value was
# present; missing entries stay missing.
for col in [*CANONICAL_KEYS, *DESCRIPTIVE_COLS]:
    _column = missing_lab_agg[col]
    missing_lab_agg[col] = _column.astype(str).str.strip().where(_column.notna())


In [22]:
# Destination files for the aggregate: one Parquet copy and one CSV copy.
out_parquet = INTERMEDIATE_DIR.joinpath("missing_lab_issues_agg.parquet")
out_csv = INTERMEDIATE_DIR.joinpath("missing_lab_issues_agg.csv")

# Write both formats without the index, Parquet first.
for _write, _target in (
    (missing_lab_agg.to_parquet, out_parquet),
    (missing_lab_agg.to_csv, out_csv),
):
    _write(_target, index=False)

(out_parquet, out_csv)


(PosixPath('../data/intermediate/missing_lab_issues_agg.parquet'),
 PosixPath('../data/intermediate/missing_lab_issues_agg.csv'))