# Coding Reports (MedDRA / WHO) — Aggregation & Qualification (Phase 1)

## Goal
Load Coding Reports (MedDRA / WHO) across all studies, normalize schemas,
perform initial structural checks, and prepare for canonical aggregation
to subject/site level.


In [1]:
# imports
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

DATA_DIR = Path("../data")
INTERMEDIATE_DIR = DATA_DIR / "intermediate"

inventory = pd.read_csv(INTERMEDIATE_DIR / "file_inventory.csv")


In [3]:
# Load Coding Report files (MedDRA / WHO)
coding_files = inventory[
    inventory.file_type == "meddra"
]["file_path"]

assert len(coding_files) > 0, "No Coding Report files found"

dfs = []

for f in coding_files:
    f = Path(f)
    assert f.exists(), f"File not found: {f}"

    df = pd.read_excel(f)
    df["source_file"] = f.name
    dfs.append(df)

coding_master = pd.concat(dfs, ignore_index=True)

assert coding_master.shape[0] > 0, "Coding master dataframe is empty"

coding_master.head()


  coding_master = pd.concat(dfs, ignore_index=True)


Unnamed: 0,MedDRA Coding Report,Study,Dictionary,Dictionary Version number,Subject,Form OID,Logline,Field OID,Coding Status,Require Coding,source_file
0,MedDRA Coding Report,Study 5,MedDRA,28.1,Subject 2626,AEG002,1,AETERM,Coded Term,No,Study 5_GlobalCodingReport_MedDRA_updated.xlsx
1,MedDRA Coding Report,Study 5,MedDRA,28.1,Subject 2626,AEG002,2,AETERM,Coded Term,No,Study 5_GlobalCodingReport_MedDRA_updated.xlsx
2,MedDRA Coding Report,Study 5,MedDRA,28.1,Subject 2626,AEG002,3,AETERM,Coded Term,No,Study 5_GlobalCodingReport_MedDRA_updated.xlsx
3,MedDRA Coding Report,Study 5,MedDRA,28.1,Subject 2626,AEG002,4,AETERM,Coded Term,No,Study 5_GlobalCodingReport_MedDRA_updated.xlsx
4,MedDRA Coding Report,Study 5,MedDRA,28.1,Subject 2626,AEG002,5,AETERM,Coded Term,No,Study 5_GlobalCodingReport_MedDRA_updated.xlsx


In [4]:
coding_master.columns.tolist()

['MedDRA Coding Report',
 'Study',
 'Dictionary',
 'Dictionary Version number',
 'Subject',
 'Form OID',
 'Logline',
 'Field OID',
 'Coding Status',
 'Require Coding',
 'source_file']

In [5]:
coding_master.shape

(66858, 11)

In [6]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace("%", "pct", regex=False)
        .str.replace("#", "num_", regex=False)
        .str.replace(r"[^\w]+", "_", regex=True)
        .str.replace(r"_+", "_", regex=True)
        .str.strip("_")
    )
    return df

coding_master = normalize_columns(coding_master)


In [7]:
# drop unnamed columns
coding_master = coding_master.loc[
    :, ~coding_master.columns.str.startswith("unnamed")
]

# drop all-null columns
coding_master = coding_master.dropna(axis=1, how="all")

# ensure no duplicate columns
assert coding_master.columns.duplicated().sum() == 0, \
    "Duplicate columns detected after normalization"


In [8]:
coding_master.columns.tolist()


['meddra_coding_report',
 'study',
 'dictionary',
 'dictionary_version_number',
 'subject',
 'form_oid',
 'logline',
 'field_oid',
 'coding_status',
 'require_coding',
 'source_file']

In [9]:
# subject_id (primary)
coding_master["subject_id"] = (
    coding_master["subject"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)

# study_id (secondary)
coding_master["study_id"] = (
    coding_master["study"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)


In [10]:
assert coding_master["subject_id"].notna().all(), "Null subject_id remains"
assert coding_master["study_id"].notna().all(), "Null study_id remains"


In [11]:
for col in ["coding_status", "require_coding"]:
    coding_master[col] = (
        coding_master[col]
        .astype(str)
        .str.lower()
        .str.strip()
    )


In [12]:
# term requires coding
coding_master["needs_coding"] = coding_master["require_coding"].isin(
    ["yes", "y", "true", "required", "1"]
)

# term already coded
coding_master["is_coded"] = coding_master["coding_status"].isin(
    ["coded", "complete", "completed", "yes", "y"]
)

# uncoded but required → real problem
coding_master["is_uncoded_required"] = (
    coding_master["needs_coding"] & ~coding_master["is_coded"]
)


In [13]:
CANONICAL_KEYS = ["subject_id", "study_id"]


In [14]:
agg_dict = {
    "needs_coding": "sum",
    "is_coded": "sum",
    "is_uncoded_required": "sum",
}


In [15]:
DESCRIPTIVE_COLS = ["dictionary", "dictionary_version_number"]

for col in DESCRIPTIVE_COLS:
    if col in coding_master.columns:
        agg_dict[col] = "first"


In [16]:
coding_agg = (
    coding_master
    .groupby(CANONICAL_KEYS, dropna=False)
    .agg(agg_dict)
    .reset_index()
)


In [17]:
coding_agg = coding_agg.rename(columns={
    "needs_coding": "num_terms_requiring_coding",
    "is_coded": "num_terms_coded",
    "is_uncoded_required": "num_terms_uncoded",
})


In [18]:
assert coding_agg.shape[0] > 0
assert coding_agg.duplicated(CANONICAL_KEYS).sum() == 0

assert (coding_agg["num_terms_uncoded"] <=
        coding_agg["num_terms_requiring_coding"]).all()


In [19]:
coding_agg[
    ["num_terms_requiring_coding", "num_terms_coded", "num_terms_uncoded"]
].describe()


Unnamed: 0,num_terms_requiring_coding,num_terms_coded,num_terms_uncoded
count,6148.0,6148.0,6148.0
mean,0.067664,0.0,0.067664
std,0.488996,0.0,0.488996
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,20.0,0.0,20.0


In [20]:
for col in CANONICAL_KEYS:
    coding_agg[col] = coding_agg[col].astype(str).str.strip()

for col in DESCRIPTIVE_COLS:
    coding_agg[col] = coding_agg[col].astype(str).str.strip()


In [21]:
out_parquet = INTERMEDIATE_DIR / "coding_reports_agg.parquet"
out_csv = INTERMEDIATE_DIR / "coding_reports_agg.csv"

coding_agg.to_parquet(out_parquet, index=False)
coding_agg.to_csv(out_csv, index=False)

(out_parquet, out_csv)


(PosixPath('../data/intermediate/coding_reports_agg.parquet'),
 PosixPath('../data/intermediate/coding_reports_agg.csv'))