# Coding Reports (WHO-DRA / WHODrug) — Aggregation & Qualification (Phase 1)

## Goal
Load the WHO-DRA / WHODrug coding reports listed in the file inventory for all studies,
normalize their column names, and aggregate medication coding completeness to one row
per subject and study.


In [3]:
# imports
import pandas as pd
import numpy as np
from pathlib import Path

# Paths: everything this notebook reads or writes lives under the
# intermediate data directory.
DATA_DIR = Path("../data")
INTERMEDIATE_DIR = DATA_DIR / "intermediate"

# Display settings so wide coding-report frames stay readable in cell output.
for option_name, option_value in (
    ("display.max_columns", 200),
    ("display.width", 140),
):
    pd.set_option(option_name, option_value)

# File inventory; the loading cell filters it by `file_type` and reads
# the paths in `file_path`.
inventory = pd.read_csv(INTERMEDIATE_DIR / "file_inventory.csv")


In [4]:
# Load WHO-DRA / WHODrug coding files
# Select the inventory rows tagged as WHODrug coding reports.
is_whodrug = inventory["file_type"] == "whodrug"
whodra_files = inventory.loc[is_whodrug, "file_path"]

assert len(whodra_files) > 0, "No WHO-DRA coding files found"

# Read every report and tag its rows with the name of the file they came from.
dfs = []
for raw_path in whodra_files:
    report_path = Path(raw_path)
    assert report_path.exists(), f"File not found: {report_path}"

    report = pd.read_excel(report_path)
    report["source_file"] = report_path.name
    dfs.append(report)

# Stack all reports into one frame with a fresh 0..n-1 index.
whodra_master = pd.concat(dfs, ignore_index=True)

assert whodra_master.shape[0] > 0, "WHO-DRA master dataframe is empty"

whodra_master.head()


  whodra_master = pd.concat(dfs, ignore_index=True)


Unnamed: 0,WHODrug Coding Report,Study,Dictionary,Dictionary Version number,Subject,Form OID,Logline,Field OID,Coding Status,Require Coding,source_file
0,WHODrug Coding Report,Study 5,WHODrug-Global-B3,202509.0,Subject 2406,CMO005_TR_1,1,CMTRT,Coded Term,No,Study 5_GlobalCodingReport_WHODD_updated.xlsx
1,WHODrug Coding Report,Study 5,WHODrug-Global-B3,202509.0,Subject 2406,CMO005_TR_1,2,CMTRT,Coded Term,No,Study 5_GlobalCodingReport_WHODD_updated.xlsx
2,WHODrug Coding Report,Study 5,WHODrug-Global-B3,202509.0,Subject 2690,CMO005_TR_1,1,CMTRT,Coded Term,No,Study 5_GlobalCodingReport_WHODD_updated.xlsx
3,WHODrug Coding Report,Study 5,WHODrug-Global-B3,202509.0,Subject 2690,CMO005_TR_1,2,CMTRT,Coded Term,No,Study 5_GlobalCodingReport_WHODD_updated.xlsx
4,WHODrug Coding Report,Study 5,WHODrug-Global-B3,202509.0,Subject 2690,CMO005_TR_1,3,CMTRT,Coded Term,No,Study 5_GlobalCodingReport_WHODD_updated.xlsx


In [5]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are snake_case identifiers.

    Each label is cast to ``str``, trimmed and lower-cased; ``%`` becomes
    ``pct`` and ``#`` becomes ``num_``; every run of non-word characters is
    replaced by a single underscore; repeated underscores are collapsed and
    leading/trailing underscores removed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column labels should be normalized. It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the normalized column labels. Labels that
        normalize to the same text are NOT de-duplicated here.
    """
    normalized = df.copy()

    names = normalized.columns.astype(str).str.strip().str.lower()

    # Spell out symbols first, otherwise the regex below would erase them.
    for symbol, word in (("%", "pct"), ("#", "num_")):
        names = names.str.replace(symbol, word, regex=False)

    # Non-word runs -> "_", then collapse any doubled underscores.
    for pattern in (r"[^\w]+", r"_+"):
        names = names.str.replace(pattern, "_", regex=True)

    normalized.columns = names.str.strip("_")
    return normalized

# Rebind the master frame to a copy with snake_case column labels; later
# cells address columns by these normalized names (e.g. "coding_status").
whodra_master = normalize_columns(whodra_master)


In [6]:
# Inspect the column labels produced by normalize_columns.
whodra_master.columns.tolist()

['whodrug_coding_report',
 'study',
 'dictionary',
 'dictionary_version_number',
 'subject',
 'form_oid',
 'logline',
 'field_oid',
 'coding_status',
 'require_coding',
 'source_file']

In [7]:
# (rows, columns) of the concatenated coding reports.
whodra_master.shape

(310188, 11)

In [8]:
# Remove columns whose normalized label starts with "unnamed", then any
# column that holds no data at all.
unnamed_mask = whodra_master.columns.str.startswith("unnamed")
whodra_master = (
    whodra_master
    .loc[:, ~unnamed_mask]
    .dropna(axis=1, how="all")
)

# Normalization can map two different raw labels to the same name; fail
# here rather than let a duplicate label reach the aggregation.
duplicate_label_count = whodra_master.columns.duplicated().sum()
assert duplicate_label_count == 0


In [9]:
# Column labels remaining after dropping "unnamed*" and all-null columns.
whodra_master.columns.tolist()


['whodrug_coding_report',
 'study',
 'dictionary',
 'dictionary_version_number',
 'subject',
 'form_oid',
 'logline',
 'field_oid',
 'coding_status',
 'require_coding',
 'source_file']

In [10]:
# Canonical key columns and the raw report column each one is derived from:
# subject_id (primary) <- subject, study_id (secondary) <- study.
KEY_SOURCES = {"subject_id": "subject", "study_id": "study"}

for key_col, source_col in KEY_SOURCES.items():
    stripped = whodra_master[source_col].astype(str).str.strip()
    # astype(str) renders missing values as the text "nan"; turn that and
    # empty strings back into real NaN so null checks can see them.
    whodra_master[key_col] = stripped.replace({"": np.nan, "nan": np.nan})


In [11]:
# Every row must carry both keys before it can be aggregated.
for key_col, failure_message in (
    ("subject_id", "Null subject_id remains"),
    ("study_id", "Null study_id remains"),
):
    assert whodra_master[key_col].notna().all(), failure_message


In [12]:
# Cast the two status columns to lower-case, trimmed text so they can be
# matched against fixed vocabularies.
STATUS_COLS = ["coding_status", "require_coding"]

for status_col in STATUS_COLS:
    as_text = whodra_master[status_col].astype(str)
    whodra_master[status_col] = as_text.str.lower().str.strip()


In [13]:
# Normalized `require_coding` values that mean the medication term still
# has to be coded.
REQUIRES_CODING_VALUES = ["yes", "y", "true", "required", "1"]

# Normalized `coding_status` values that mean the term has been coded.
# "coded term" is the status the WHODrug coding reports actually carry
# ("Coded Term" before lower-casing); without it no such row would ever be
# counted as coded and every required term would be reported as a gap.
CODED_STATUS_VALUES = ["coded", "coded term", "complete", "completed", "yes", "y"]

# medication requires coding
whodra_master["needs_drug_coding"] = whodra_master["require_coding"].isin(
    REQUIRES_CODING_VALUES
)

# medication coded
whodra_master["is_drug_coded"] = whodra_master["coding_status"].isin(
    CODED_STATUS_VALUES
)

# uncoded medication that requires coding → safety gap
whodra_master["is_uncoded_drug"] = (
    whodra_master["needs_drug_coding"] & ~whodra_master["is_drug_coded"]
)


In [14]:
# Grouping keys for the aggregation; the output holds one row per
# (subject_id, study_id) pair.
CANONICAL_KEYS = ["subject_id", "study_id"]


In [15]:
# Summing a boolean flag per group counts the rows where it is True.
FLAG_COLS = ["needs_drug_coding", "is_drug_coded", "is_uncoded_drug"]
agg_dict = dict.fromkeys(FLAG_COLS, "sum")


In [16]:
# Dictionary metadata carried through the aggregation with `first`.
# A column is only included when it is still present in the master frame.
DESCRIPTIVE_COLS = ["dictionary", "dictionary_version_number"]

agg_dict.update(
    {
        descriptive_col: "first"
        for descriptive_col in DESCRIPTIVE_COLS
        if descriptive_col in whodra_master.columns
    }
)


In [17]:
# Collapse the row-level report to one row per key combination.
# dropna=False keeps groups whose key is NaN instead of dropping them.
subject_groups = whodra_master.groupby(CANONICAL_KEYS, dropna=False)

# reset_index turns the group keys back into ordinary columns.
whodra_agg = subject_groups.agg(agg_dict).reset_index()


In [18]:
# After summation the flag columns hold counts; rename them accordingly.
COUNT_COLUMN_NAMES = {
    "needs_drug_coding": "num_drugs_requiring_coding",
    "is_drug_coded": "num_drugs_coded",
    "is_uncoded_drug": "num_drugs_uncoded",
}

whodra_agg = whodra_agg.rename(columns=COUNT_COLUMN_NAMES)


In [19]:
# The aggregate must be non-empty and unique on its keys.
assert len(whodra_agg) > 0
duplicate_key_rows = whodra_agg.duplicated(CANONICAL_KEYS).sum()
assert duplicate_key_rows == 0

# Uncoded terms are a subset of the terms that require coding, so the
# uncoded count can never exceed the required count.
uncoded_counts = whodra_agg["num_drugs_uncoded"]
required_counts = whodra_agg["num_drugs_requiring_coding"]
assert (uncoded_counts <= required_counts).all()


In [20]:
# Store the key columns as trimmed text. The keys were asserted non-null on
# the master frame, so a plain string cast does not hide missing values.
for col in CANONICAL_KEYS:
    whodra_agg[col] = whodra_agg[col].astype(str).str.strip()

# Descriptive columns are optional: they are only aggregated when present in
# the master frame, so skip any that never made it into the aggregate
# instead of raising KeyError.
for col in DESCRIPTIVE_COLS:
    if col not in whodra_agg.columns:
        continue

    raw_values = whodra_agg[col]
    # astype(str) would turn a missing value into the literal text "nan";
    # mask those positions back to null so the exported files keep them empty.
    whodra_agg[col] = raw_values.astype(str).str.strip().where(raw_values.notna())


In [21]:
# Persist the subject-level aggregate in both formats next to the other
# intermediate artifacts.
out_parquet, out_csv = (
    INTERMEDIATE_DIR / "coding_reports_whodra_agg.parquet",
    INTERMEDIATE_DIR / "coding_reports_whodra_agg.csv",
)

whodra_agg.to_parquet(out_parquet, index=False)
whodra_agg.to_csv(out_csv, index=False)

# Show where the files were written.
(out_parquet, out_csv)


(PosixPath('../data/intermediate/coding_reports_whodra_agg.parquet'),
 PosixPath('../data/intermediate/coding_reports_whodra_agg.csv'))