# Inactivated_Forms_and_Loglines — Aggregation & Qualification (Phase 1)

## Goal
Load Inactivated Forms and Loglines data across all studies, normalize schemas,
perform initial structural checks, and prepare for canonical aggregation
to subject/site level.


In [1]:
# imports
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

DATA_DIR = Path("../data")
INTERMEDIATE_DIR = DATA_DIR / "intermediate"

inventory = pd.read_csv(INTERMEDIATE_DIR / "file_inventory.csv")


In [2]:
# Load Inactivated Forms and Loglines files
inactivated_files = inventory[
    inventory.file_type == "other"
]["file_path"]

assert len(inactivated_files) > 0, "No Inactivated Forms and Loglines files found"

dfs = []

for f in inactivated_files:
    f = Path(f)
    assert f.exists(), f"File not found: {f}"

    df = pd.read_excel(f)
    df["source_file"] = f.name
    dfs.append(df)

inactivated_master = pd.concat(dfs, ignore_index=True)

assert inactivated_master.shape[0] > 0, \
    "Inactivated Forms and Loglines master dataframe is empty"

inactivated_master.head()


  inactivated_master = pd.concat(dfs, ignore_index=True)


Unnamed: 0,Study Name,SiteGroupName(CountryName),SiteNumber,SubjectName,Overall Subject Status,Form 1 Subject Status,FolderName,Visit date,Form Type (Summary or Visit),No. #Days Page Missing,source_file,Country,Study Site Number,Subject,Folder,Form,Data on Form/\nRecord,RecordPostion,Audit Action,Study,Total Open issue Count per subject,.,MedDRA Coding Report,Dictionary,Dictionary Version number,Form OID,Logline,Field OID,Coding Status,Require Coding,Site,Visit,Actual Date,# Days Outstanding,Site number,Form Name,Lab category,Lab Date,Test Name,Test description,Issue,Comments,Form.1,RecordPosition,Project Name,Region,Site ID,Subject ID,Latest Visit (SV) (Source: Rave EDC: BO4),Subject Status (Source: PRIMARY Form),Input files,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,CPMD,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,SSM,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Visit Level Subject Status,FormName,Projected Date
0,Study 5,ARG,Site 165,Subject 2590,On Trial,Subject continuing,Screening,15 SEP 2025,Form 1,60.0,Study 5_Global Missing Pages_updated.xlsx,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Study 5,ARG,Site 165,Subject 2590,On Trial,Subject continuing,Screening,15 SEP 2025,Form 1,60.0,Study 5_Global Missing Pages_updated.xlsx,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Study 5,ARG,Site 165,Subject 2590,On Trial,Subject continuing,Screening,15 SEP 2025,Form 1,60.0,Study 5_Global Missing Pages_updated.xlsx,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Study 5,ARG,Site 165,Subject 2590,On Trial,Subject continuing,Screening,15 SEP 2025,Form 1,60.0,Study 5_Global Missing Pages_updated.xlsx,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Study 5,ARG,Site 165,Subject 2590,On Trial,Subject continuing,Screening,15 SEP 2025,Form 1,60.0,Study 5_Global Missing Pages_updated.xlsx,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [3]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace("%", "pct", regex=False)
        .str.replace("#", "num_", regex=False)
        .str.replace(r"[^\w]+", "_", regex=True)
        .str.replace(r"_+", "_", regex=True)
        .str.strip("_")
    )
    return df

inactivated_master = normalize_columns(inactivated_master)


In [4]:
inactivated_master.shape

(555988, 90)

In [5]:
inactivated_master.columns.tolist()

['study_name',
 'sitegroupname_countryname',
 'sitenumber',
 'subjectname',
 'overall_subject_status',
 'form_1_subject_status',
 'foldername',
 'visit_date',
 'form_type_summary_or_visit',
 'no_num_days_page_missing',
 'source_file',
 'country',
 'study_site_number',
 'subject',
 'folder',
 'form',
 'data_on_form_record',
 'recordpostion',
 'audit_action',
 'study',
 'total_open_issue_count_per_subject',
 '',
 'meddra_coding_report',
 'dictionary',
 'dictionary_version_number',
 'form_oid',
 'logline',
 'field_oid',
 'coding_status',
 'require_coding',
 'site',
 'visit',
 'actual_date',
 'num_days_outstanding',
 'site_number',
 'form_name',
 'lab_category',
 'lab_date',
 'test_name',
 'test_description',
 'issue',
 'comments',
 'form',
 'recordposition',
 'project_name',
 'region',
 'site_id',
 'subject_id',
 'latest_visit_sv_source_rave_edc_bo4',
 'subject_status_source_primary_form',
 'input_files',
 'unnamed_8',
 'unnamed_9',
 'unnamed_10',
 'unnamed_11',
 'unnamed_12',
 'unnamed_1

In [7]:
# drop unnamed columns
inactivated_master = inactivated_master.loc[
    :, ~inactivated_master.columns.str.startswith("unnamed")
]

# drop all-null columns
inactivated_master = inactivated_master.dropna(axis=1, how="all")

# drop duplicate columns
inactivated_master = inactivated_master.loc[
    :, ~inactivated_master.columns.duplicated()
]

# ensure no duplicate columns
assert inactivated_master.columns.duplicated().sum() == 0, \
    "Duplicate columns detected after normalization"


In [8]:
inactivated_master.columns.tolist()

['study_name',
 'sitegroupname_countryname',
 'sitenumber',
 'subjectname',
 'overall_subject_status',
 'form_1_subject_status',
 'foldername',
 'visit_date',
 'form_type_summary_or_visit',
 'no_num_days_page_missing',
 'source_file',
 'country',
 'study_site_number',
 'subject',
 'folder',
 'form',
 'data_on_form_record',
 'recordpostion',
 'audit_action',
 'study',
 'total_open_issue_count_per_subject',
 'meddra_coding_report',
 'dictionary',
 'dictionary_version_number',
 'form_oid',
 'logline',
 'field_oid',
 'coding_status',
 'require_coding',
 'site',
 'visit',
 'actual_date',
 'num_days_outstanding',
 'site_number',
 'form_name',
 'lab_category',
 'lab_date',
 'test_name',
 'test_description',
 'issue',
 'comments',
 'recordposition',
 'project_name',
 'region',
 'site_id',
 'subject_id',
 'latest_visit_sv_source_rave_edc_bo4',
 'subject_status_source_primary_form',
 'input_files',
 'cpmd',
 'ssm',
 'visit_level_subject_status',
 'formname',
 'projected_date']

In [9]:
# Authoritative column list from document
DOC_COLS = [
    "country",
    "site",
    "study_site_number",
    "subject",
    "folder",
    "form",
    "data_on_form_record",
    "recordposition",
    "audit_action",
    "study",
    "source_file",
]

# keep only columns that actually exist
DOC_COLS_EXISTING = [c for c in DOC_COLS if c in inactivated_master.columns]

inactivated_master = inactivated_master[DOC_COLS_EXISTING]

DOC_COLS_EXISTING


['country',
 'site',
 'study_site_number',
 'subject',
 'folder',
 'form',
 'data_on_form_record',
 'recordposition',
 'audit_action',
 'study',
 'source_file']

In [10]:
inactivated_master.head()


Unnamed: 0,country,site,study_site_number,subject,folder,form,data_on_form_record,recordposition,audit_action,study,source_file
0,,,,,,,,,,,Study 5_Global Missing Pages_updated.xlsx
1,,,,,,,,,,,Study 5_Global Missing Pages_updated.xlsx
2,,,,,,,,,,,Study 5_Global Missing Pages_updated.xlsx
3,,,,,,,,,,,Study 5_Global Missing Pages_updated.xlsx
4,,,,,,,,,,,Study 5_Global Missing Pages_updated.xlsx


In [13]:
inactivated_master.shape

(555988, 11)

In [12]:
inactivated_master.isna().sum()

country                435351
site                   554902
study_site_number      489276
subject                 55975
folder                 489159
form                   533106
data_on_form_record    489159
recordposition         517652
audit_action           489159
study                  124143
source_file                 0
dtype: int64

In [14]:
# subject_id (primary)
inactivated_master["subject_id"] = (
    inactivated_master["subject"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)

# site_id (secondary)
inactivated_master["site_id"] = (
    inactivated_master["study_site_number"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)

# study_id (descriptive only)
inactivated_master["study_id"] = (
    inactivated_master["study"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)


In [None]:
# Drop Rows Without Subject (STRICT)
rows_before = len(inactivated_master)
null_subject_rows = inactivated_master["subject_id"].isna().sum()

print(f"Dropping {null_subject_rows} rows without subject_id")

inactivated_master = inactivated_master[
    inactivated_master["subject_id"].notna()
]

rows_before, len(inactivated_master)


Dropping 55975 rows without subject_id


(555988, 500013)

In [17]:
# subject is mandatory
assert inactivated_master["subject_id"].notna().all(), \
    "Null subject_id remains"

# site is optional — log only
missing_sites = inactivated_master["site_id"].isna().sum()
print(f"Rows without site_id: {missing_sites}")


Rows without site_id: 433301


In [19]:
CANONICAL_KEYS = ["subject_id"]

agg_dict = {
    "is_inactivated": "sum",      # total inactivated records
}

# descriptive context
DESCRIPTIVE_COLS = ["country", "study_id"]

for col in DESCRIPTIVE_COLS:
    agg_dict[col] = "first"



In [20]:
agg_dict = {
    "is_inactivated": "sum",      # total inactivated records
}

# descriptive context
DESCRIPTIVE_COLS = ["country", "study_id"]

for col in DESCRIPTIVE_COLS:
    agg_dict[col] = "first"


In [22]:
# normalize audit_action
inactivated_master["audit_action_norm"] = (
    inactivated_master["audit_action"]
    .astype(str)
    .str.lower()
    .str.strip()
)

# define inactivated flag
inactivated_master["is_inactivated"] = (
    inactivated_master["audit_action_norm"]
    .str.contains("inactivat", na=False)
)


In [23]:
inactivated_agg = (
    inactivated_master
    .groupby(CANONICAL_KEYS, dropna=False)
    .agg(agg_dict)
    .reset_index()
)


In [24]:
inactivated_agg = inactivated_agg.rename(columns={
    "is_inactivated": "num_inactivated_records"
})


In [25]:
assert inactivated_agg.shape[0] > 0
assert inactivated_agg.duplicated(CANONICAL_KEYS).sum() == 0
assert (inactivated_agg["num_inactivated_records"] >= 0).all()


In [26]:
inactivated_agg["num_inactivated_records"].describe()


count    24358.000000
mean         2.743616
std         15.835337
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max        688.000000
Name: num_inactivated_records, dtype: float64

In [27]:
for col in CANONICAL_KEYS:
    inactivated_agg[col] = (
        inactivated_agg[col]
        .astype(str)
        .str.strip()
    )

for col in DESCRIPTIVE_COLS:
    inactivated_agg[col] = inactivated_agg[col].astype(str).str.strip()


In [28]:
out_parquet = INTERMEDIATE_DIR / "inactivated_forms_loglines_agg.parquet"
out_csv = INTERMEDIATE_DIR / "inactivated_forms_loglines_agg.csv"

inactivated_agg.to_parquet(out_parquet, index=False)
inactivated_agg.to_csv(out_csv, index=False)

(out_parquet, out_csv)


(PosixPath('../data/intermediate/inactivated_forms_loglines_agg.parquet'),
 PosixPath('../data/intermediate/inactivated_forms_loglines_agg.csv'))