# Visit_Projection_Tracker â€” Aggregation & Qualification (Phase 1)

## Goal
Load Visit Projection Tracker data across all studies, normalize schemas,
perform initial structural checks, and prepare for canonical aggregation
to subject/site level.


In [1]:
# imports
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

DATA_DIR = Path("../data")
INTERMEDIATE_DIR = DATA_DIR / "intermediate"

inventory = pd.read_csv(INTERMEDIATE_DIR / "file_inventory.csv")


In [3]:
# Load Visit Projection Tracker files
visit_proj_files = inventory[
    inventory.file_type == "visit_projection"
]["file_path"]

assert len(visit_proj_files) > 0, "No Visit Projection Tracker files found"

dfs = []

for f in visit_proj_files:
    f = Path(f)
    assert f.exists(), f"File not found: {f}"

    df = pd.read_excel(f)
    df["source_file"] = f.name
    dfs.append(df)

visit_proj_master = pd.concat(dfs, ignore_index=True)

assert visit_proj_master.shape[0] > 0, "Visit Projection master dataframe is empty"

visit_proj_master.head()


  visit_proj_master = pd.concat(dfs, ignore_index=True)


Unnamed: 0,Country,Site,Subject,Visit,Projected Date,# Days Outstanding,source_file,# Days Outstanding (TODAY - PROJECTED\nDATE),# Days Outstanding.1,Study,Subject Name,Visit date,Novartis: Restricted,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,AUS,Site 2,Subject 2,EFFICACYFU2,30SEP2025,45.0,Study 5_Visit Projection Tracker_updated.xlsx,,,,,,,,,,,
1,AUS,Site 2,Subject 3,LONGTERMSAFETYFU1,30OCT2025,15.0,Study 5_Visit Projection Tracker_updated.xlsx,,,,,,,,,,,
2,AUS,Site 2,Subject 4,EFFICACYFU1,22OCT2025,23.0,Study 5_Visit Projection Tracker_updated.xlsx,,,,,,,,,,,
3,FRA,Site 277,Subject 2339,LONGTERMSAFETYFU1,11NOV2025,3.0,Study 5_Visit Projection Tracker_updated.xlsx,,,,,,,,,,,
4,FRA,Site 277,Subject 2345,CYCLE4WEEK3,05NOV2025,9.0,Study 5_Visit Projection Tracker_updated.xlsx,,,,,,,,,,,


In [4]:
visit_proj_master.shape

(2016, 18)

In [6]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace("%", "pct", regex=False)
        .str.replace("#", "num_", regex=False)
        .str.replace(r"[^\w]+", "_", regex=True)
        .str.replace(r"_+", "_", regex=True)
        .str.strip("_")
    )
    return df

visit_proj_master = normalize_columns(visit_proj_master)


In [7]:
visit_proj_master.columns.to_list()

['country',
 'site',
 'subject',
 'visit',
 'projected_date',
 'num_days_outstanding',
 'source_file',
 'num_days_outstanding_today_projected_date',
 'num_days_outstanding',
 'study',
 'subject_name',
 'visit_date',
 'novartis_restricted',
 'unnamed_1',
 'unnamed_2',
 'unnamed_3',
 'unnamed_4',
 'unnamed_5']

In [9]:
# drop unnamed columns
visit_proj_master = visit_proj_master.loc[
    :, ~visit_proj_master.columns.str.startswith("unnamed")
]

# drop all-null columns
visit_proj_master = visit_proj_master.dropna(axis=1, how="all")

# drop duplicate columns, keeping the first occurrence
visit_proj_master = visit_proj_master.loc[
    :, ~visit_proj_master.columns.duplicated()
]

# ensure no duplicate columns
assert visit_proj_master.columns.duplicated().sum() == 0, \
    "Duplicate columns detected after normalization"


In [10]:
visit_proj_master.shape

(2016, 9)

In [12]:
visit_proj_master.columns.to_list()

['country',
 'site',
 'subject',
 'visit',
 'projected_date',
 'num_days_outstanding',
 'source_file',
 'num_days_outstanding_today_projected_date',
 'novartis_restricted']

In [13]:
# --- create canonical identifiers ---

# study_id from filename
visit_proj_master["study_id"] = (
    visit_proj_master["source_file"]
    .astype(str)
    .str.extract(r"(Study[\s_]*\d+)", expand=False)
)

# site_id
visit_proj_master["site_id"] = (
    visit_proj_master["site"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)

# subject_id
visit_proj_master["subject_id"] = (
    visit_proj_master["subject"]
    .astype(str)
    .str.strip()
    .replace({"": np.nan, "nan": np.nan})
)


In [14]:
visit_proj_master.columns.to_list()

['country',
 'site',
 'subject',
 'visit',
 'projected_date',
 'num_days_outstanding',
 'source_file',
 'num_days_outstanding_today_projected_date',
 'novartis_restricted',
 'study_id',
 'site_id',
 'subject_id']

In [19]:
CANONICAL_KEYS = ["site_id", "subject_id"]

visit_proj_master[CANONICAL_KEYS].isna().sum()


site_id       0
subject_id    0
dtype: int64

In [20]:
rows_before = len(visit_proj_master)

null_subject_rows = visit_proj_master["subject_id"].isna().sum()
print(f"Dropping {null_subject_rows} rows without subject_id")

visit_proj_master = visit_proj_master[
    visit_proj_master["subject_id"].notna()
]

rows_after = len(visit_proj_master)
rows_before, rows_after


Dropping 0 rows without subject_id


(509, 509)

In [21]:
visit_proj_master[CANONICAL_KEYS].isna().sum()

site_id       0
subject_id    0
dtype: int64

In [22]:
assert visit_proj_master[CANONICAL_KEYS].isna().sum().sum() == 0, \
    "Null values found in canonical keys"

In [24]:
missing_study = visit_proj_master["study_id"].isna().sum()
print(f"Visit Projection Tracker: {missing_study} rows without study_id")

Visit Projection Tracker: 98 rows without study_id


In [25]:
DESCRIPTIVE_COLS = []

for col in ["country", "study_id"]:
    if col in visit_proj_master.columns:
        DESCRIPTIVE_COLS.append(col)

DESCRIPTIVE_COLS


['country', 'study_id']

In [26]:
METRIC_COL = "num_days_outstanding"

agg_dict = {
    METRIC_COL: "max"
}

for col in DESCRIPTIVE_COLS:
    agg_dict[col] = "first"

agg_dict


{'num_days_outstanding': 'max', 'country': 'first', 'study_id': 'first'}

In [27]:
visit_proj_agg = (
    visit_proj_master
    .groupby(CANONICAL_KEYS, dropna=False)
    .agg(agg_dict)
    .reset_index()
)


In [29]:
# Dropping negative days outstanding
visit_proj_agg = visit_proj_agg[
    visit_proj_agg[METRIC_COL] >= 0
]

In [30]:
assert visit_proj_agg.shape[0] > 0, "Aggregation produced empty dataframe"
assert visit_proj_agg.duplicated(CANONICAL_KEYS).sum() == 0, \
    "Duplicate rows after aggregation"
assert (visit_proj_agg[METRIC_COL] >= 0).all(), \
    "Negative days outstanding detected"


In [31]:
visit_proj_agg[METRIC_COL].describe(percentiles=[0.95, 0.99])


count    143.000000
mean      23.384615
std       50.867533
min        0.000000
50%       10.000000
95%       60.600000
99%      312.780000
max      373.000000
Name: num_days_outstanding, dtype: float64

In [32]:
for col in CANONICAL_KEYS:
    visit_proj_agg[col] = (
        visit_proj_agg[col]
        .astype(str)
        .str.strip()
    )

for col in DESCRIPTIVE_COLS:
    visit_proj_agg[col] = visit_proj_agg[col].astype(str).str.strip()


In [33]:
out_parquet = INTERMEDIATE_DIR / "visit_projection_tracker_agg.parquet"
out_csv = INTERMEDIATE_DIR / "visit_projection_tracker_agg.csv"

visit_proj_agg.to_parquet(out_parquet, index=False)
visit_proj_agg.to_csv(out_csv, index=False)

(out_parquet, out_csv)


(PosixPath('../data/intermediate/visit_projection_tracker_agg.parquet'),
 PosixPath('../data/intermediate/visit_projection_tracker_agg.csv'))