In [1]:
from periomod.data import ProcessedDataLoader
from periomod.resampling import Resampler
import pandas as pd
import numpy as np

dataloader = ProcessedDataLoader(
    task="pocketclosure",
    encoding="onehot",
    encode=False, 
    scale=False
)

data = dataloader.load_data(path="../data/processed/processed_data.csv")
resampler = Resampler(classification="binary", encoding="one_hot")
train_df, test_df = resampler.split_train_test_df(df=data)

val_df = dataloader.load_data(path="../data/processed/Leuven_Dataset_v2.csv")


In [2]:
data["gender"].value_counts()

gender
1    53028
0    46896
Name: count, dtype: int64

In [3]:
from periomod.data import ProcessedDataLoader, _helpers
from periomod.resampling import Resampler
import pandas as pd
import numpy as np
stageimputer=_helpers.PeriodontalStageGradeExtentCalculator()
data= stageimputer.assign_stage_grade_extent(data)



ValueError: setting an array element with a sequence.

In [12]:
data["extent"].value_counts()

extent
0    63168
1    36756
Name: count, dtype: int64

In [4]:


def _get_occluding_teeth():
    """
    Load the occluding teeth mapping from the specified CSV file.
    Returns:
        dict: A dictionary mapping tooth numbers to their occluding teeth.
    """
    occluding_pairs = [(17, 47), (16, 46), (15, 45), (14, 44), (13, 43),
            (12, 42), (11, 41), (21, 31), (22, 32), (23, 33), (24, 34), 
            (25, 35), (26, 36), (27, 37)]

    return occluding_pairs

def _calculate_occluding_pairs(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the number of occluding pairs of teeth per patient 
    and assign to a new column "occluding_pairs" per row.

    Args:
        df (pd.DataFrame): DataFrame containing patient data with 
            columns "id_patient" and "tooth".

    Returns:
        pd.DataFrame: DataFrame with an additional column "occluding_pairs" 
            indicating the number of occluding pairs per patient.
    """
    occluding_pairs = _get_occluding_teeth()
    df_teeth = df.groupby(["id_patient", "tooth"]).size().reset_index()[["id_patient", "tooth"]]
    
    # Build a mapping of patient_id -> set of teeth
    patient_teeth = df_teeth.groupby("id_patient")["tooth"].apply(set)

    # Count occluding pairs per patient
    occlusion_map = {
        patient_id: sum(1 for upper, lower in occluding_pairs if upper in teeth and lower in teeth)
        for patient_id, teeth in patient_teeth.items()
    }

    # Map occluding pair counts back to original DataFrame
    df["occluding_pairs"] = df["id_patient"].map(occlusion_map)
    return df

def _calculate_stage_for_row_occludingpairs(row: pd.Series) -> int:
    """Calculates the periodontal stage for a given row based 
    on CAL and missing teeth.

    Args:
        row (pd.Series): A row from the DataFrame containing 'CAL' and
            'missing_teeth' columns.

    Returns:
        int: The periodontal stage:
            - 4 if CAL >= 5 and missing teeth >= 5
            - 3 if CAL >= 5 and missing teeth < 5
            - 2 if CAL in [3, 4]
            - 1 if CAL in [1, 2]
            - 0 otherwise
    """
    cal = row['CAL']
    missing = row.get('missing_teeth', 0)
    occluding_pairs= row.get('occluding_pairs', 0)
    furcation_involvement = row.get("furcationbaseline", 0)

    if cal >= 5 and occluding_pairs < 10:
        return 4
    elif cal >= 5 and occluding_pairs > 9:
        return 3
    elif cal in [3, 4]:
        if furcation_involvement > 1:
            return 3
        else:
            return 2
    elif cal in [1, 2]:
        return 1
    else:
        return 0
    
def _calculate_cal(data: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the CAL (Clinical Attachment Level) for each row in the DataFrame.
    The formula used is: CAL = PDBaseline + RecBaseline - 3

    Args:
        data (pd.DataFrame): DataFrame containing patient data with columns 
        "pdbaseline" and "recbaseline".
    
    Returns:
        pd.DataFrame: DataFrame with an additional column "CAL" indicating 
        the Clinical Attachment Level.
    """
    data["CAL"] = np.where(
        data["recbaseline"] > 0,
        data["pdbaseline"] + data["recbaseline"],
        data["pdbaseline"] - 3
    )
    return data

def _calculate_missing_teeth_per_patient(data: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the number of missing teeth per patient (excluding wisdom teeth)
    and assign to a new column "missing_teeth" per row.

    Args:  
        data (pd.DataFrame): DataFrame containing patient data with columns "id_patient" and "tooth".
    
    Returns:
        pd.DataFrame: DataFrame with an additional column "missing_teeth" indicating the number of missing teeth per patient.
    """
    fdi_teeth = [t for t in range(11, 49) if t % 10 not in [0, 8, 9]]

    missing_teeth_dict = {}

    for id in data["id_patient"].unique():
        patient_teeth = data[(data["id_patient"] == id)]["tooth"]
        patient_teeth = patient_teeth[~patient_teeth.isin([18, 28, 38, 48])].unique()

        missing_teeth = len(set(fdi_teeth) - set(patient_teeth))
        missing_teeth_dict[id] = missing_teeth

    data["missing_teeth"] = data["id_patient"].map(missing_teeth_dict)

    return data

def _calculate_stage_by_patient(df: pd.DataFrame) -> pd.DataFrame:
    """
    Assigns:
    - 'tooth_stage': stage per row (site) based on CAL and occluding pairs
    - 'max_stage': maximum stage per patient, with upgrade if ≥2 non-adjacent teeth have PPD ≥ 6 mm

    Args:
        df (pd.DataFrame): DataFrame with "id_patient", "side", "CAL", "pdbaseline", "tooth", and "occluding_pairs"

    Returns:
        pd.DataFrame: Original DataFrame with added columns:
            - 'tooth_stage': row-specific stage
            - 'max_stage': highest stage found for the patient (after rule)
    """
    # Apply tooth-level staging
    stage_df = df[df["side"].isin([1, 3, 4, 6])].copy()
    stage_df['tooth_stage'] = stage_df.apply(_calculate_stage_for_row_occludingpairs, axis=1)

    # Initialize patient stage mapping
    stage_map = {}

    # Tooth adjacency map (FDI, excluding third molars)
    adjacent_pairs = {
        11: [12, 21], 12: [11, 13], 13: [12, 14], 14: [13, 15], 15: [14, 16], 16: [15, 17], 17: [16],
        21: [11, 22], 22: [21, 23], 23: [22, 24], 24: [23, 25], 25: [24, 26], 26: [25, 27], 27: [26],
        31: [32, 41], 32: [31, 33], 33: [32, 34], 34: [33, 35], 35: [34, 36], 36: [35, 37], 37: [36],
        41: [31, 42], 42: [41, 43], 43: [42, 44], 44: [43, 45], 45: [44, 46], 46: [45, 47], 47: [46]
    }

    for patient_id, group in stage_df.groupby("id_patient"):
        max_stage = group["tooth_stage"].max()

        # Check if upgrade condition applies (PPD ≥ 6 at ≥2 non-adjacent teeth)
        if max_stage == 2:
            ppd_teeth = set(group[group["pdbaseline"] >= 6]["tooth"].unique())
            non_adjacent_teeth = []

            for tooth in ppd_teeth:
                if all(tooth not in adjacent_pairs.get(other, []) for other in ppd_teeth if other != tooth):
                    non_adjacent_teeth.append(tooth)

            if len(non_adjacent_teeth) >= 2:
                max_stage = 3  # Upgrade due to non-adjacent deep pockets

        stage_map[patient_id] = max_stage

    # Map results back to full dataframe
    df['tooth_stage'] = df.apply(_calculate_stage_for_row_occludingpairs, axis=1)
    df['max_stage'] = df['id_patient'].map(stage_map)

    return df

def impute_stage(self, df):
    """
    Assign the highest periodontal stage per patient based on CAL and missing teeth.
    Returns a DataFrame with the 'stage' column added.
    """
    df = _calculate_occluding_pairs(df)
    df = _calculate_missing_teeth_per_patient(df)
    df = _calculate_cal(df)
    df = _calculate_stage_by_patient(df)
    return df

def _get_sideencoding():
    """
    Load the side encoding mapping for tooth surfaces.
    Returns:
        dict: A dictionary mapping surface labels to their corresponding site indices.
    """
    sideencoding = {"m": [3, 4], "d": [1, 6], "b": [2], "o": [5]}
    return sideencoding

def _get_gender_map():
    """
    Load the gender mapping for patient data.
    Returns:
        dict: A dictionary
    """
    gender_map = {0: "women", 1: "men"}
    return gender_map

def _get_surface_label(site_index: int):
    """
    Get the surface label based on the site index.
    Args:
        site_index (int): The index of the site (1-6).
    
    Returns:
        str: The surface label ("m", "d", "avg_md", or None
    """
    sideencoding = _get_sideencoding()
    if site_index in sideencoding["m"]:
        return "m"
    elif site_index in sideencoding["d"]:
        return "d"
    elif site_index in sideencoding["b"] or site_index in sideencoding["o"]:
        return "avg_md"
    return None

def _get_rootlength_data():
    """
    Load root length data from the specified CSV file.
    """
    rootlength = pd.read_csv("Revision/root_length_percentages.csv")
    return rootlength

def _calculate_bone_loss_percentage_row(row: pd.Series) -> float:
    """
    Calculate the bone loss percentage for a given row.

    Args:
        row (pd.Series): A row from the DataFrame containing 'pdbaseline', 'recbaseline', 'tooth', 'side', and

    Returns:
        float: The bone loss percentage calculated using the formula:
               Bone Loss Percentage = (PDBaseline + RecBaseline) / Root Length * 100
               Returns None if the root length is not found.
    """
    rootlengthmap = _get_rootlength_data()
    gender_map = _get_gender_map()
    tooth = row['tooth']
    surface = _get_surface_label(row['side'])
    gender = gender_map[row['gender']]

    if surface == "avg_md":
        m = rootlengthmap[
            (rootlengthmap['Tooth'] == tooth) &
            (rootlengthmap['Gender'] == gender) &
            (rootlengthmap['Surface'] == 'm')
        ]['R'].values
        d = rootlengthmap[
            (rootlengthmap['Tooth'] == tooth) &
            (rootlengthmap['Gender'] == gender) &
            (rootlengthmap['Surface'] == 'd')
        ]['R'].values
        if len(m) == 0 or len(d) == 0:
            return None
        root_length = (m[0] + d[0]) / 2
    else:
        r = rootlengthmap[
            (rootlengthmap['Tooth'] == tooth) &
            (rootlengthmap['Gender'] == gender) &
            (rootlengthmap['Surface'] == surface)
        ]['R'].values
        if len(r) == 0:
            return None
        root_length = r[0]
    return round((row['pdbaseline'] + row['recbaseline']-3) / root_length, 1)* 100

def _apply_bone_loss_percentage(df: pd.DataFrame
                                   ) -> pd.DataFrame:
    """
    Calculate the bone loss percentage for each row in the DataFrame.
    The formula used is: Bone Loss Percentage = (PDBaseline + RecBaseline) / Root Length * 100

    Args:
        df (pd.DataFrame): DataFrame containing patient data with columns "pdbaseline", "recbaseline", "tooth", "side", and
    
    returns:
        pd.DataFrame: DataFrame with an additional column "bone_loss_percentage" indicating the bone loss percentage.
    """
    df['bone_loss_percentage'] = df.apply(_calculate_bone_loss_percentage_row, axis=1)
    return df

def _calculate_boneloss_per_age(df: pd.DataFrame
                               )-> pd.DataFrame:
    """
    Calculate the bone loss per age for each row in the DataFrame.
    The formula used is: Bone Loss per Age = Bone Loss Percentage / Age

    Args:
        df (pd.DataFrame): DataFrame containing patient data with columns "bone_loss_percentage" and "age".
    
    Returns:
        pd.DataFrame: DataFrame with an additional column "bl/age" indicating the bone loss per age.
    """
    df["bl/age"] = df["bone_loss_percentage"] / df["age"]
    return df

def _grade_from_row(row: pd.Series) -> int:
    """
    Determine the periodontal grade based on bone loss percentage and age.
    
    Args:
        row (pd.Series): A row from the DataFrame containing 'bl/age'.
    
    Returns:
        int: The periodontal grade (0, 1, or 2).
    """
    bl_age = row['bl/age']
    cigarettenumber = row.get("cigarettenumber", 0)
    diabetes = row.get("diabetes", 0)

    if bl_age < 0.25:
        if cigarettenumber >= 10:
            return 2
        if cigarettenumber > 0 or diabetes > 1:
            return 1
        else:
            return 0
    elif bl_age < 1:
        if cigarettenumber >= 10:
            return 2
        else:
            return 1
    else:
        return 2
    
def _assign_grade(df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign the periodontal grade based on the bone loss percentage and age.
    Returns a DataFrame with the 'grade' column added.
    """
    df['grade'] = df.apply(_grade_from_row, axis=1)
    return df

def _calculate_grade_by_patient(df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign the highest periodontal grade per patient based on the number of missing teeth.
    Returns a DataFrame with the 'grade' column added.

    Args:
        df (pd.DataFrame): DataFrame containing patient data with columns "id_patient", "side", "bl/age".
    
    Returns:
        pd.DataFrame: DataFrame with an additional column "grade" indicating the highest periodontal grade per patient.
    """
    grade_df = df[df["side"].isin([1, 3, 4, 6])].copy()
    grade_df['grade_temp'] = grade_df.apply(_grade_from_row, axis=1)
    max_grade_per_patient = grade_df.groupby('id_patient')['grade_temp'].max()
    df['grade'] = df['id_patient'].map(max_grade_per_patient)

    return df

def assign_grade_by_patient(df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign the highest periodontal grade per patient based on the bone loss percentage and age.
    Returns a DataFrame with the 'grade' column added.
    """
    df = _apply_bone_loss_percentage(df)
    df = _calculate_boneloss_per_age(df)
    df = _assign_grade(df)
    df = _calculate_grade_by_patient(df)
    return df

def assign_extent(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the extent of periodontal disease for each patient based on the maximum stage
    and the percentage of teeth at that stage.

    Args:
        df (pd.DataFrame): DataFrame containing patient data with columns "id_patient", "side", "tooth_stage", "missing_teeth".
        
    Returns:
        pd.DataFrame: DataFrame with additional columns "extent" and "percent_max_stage".
    """
    extent_map = {}
    percent_map = {}
    df_side = df[df["side"].isin([1, 3, 4, 6])].copy()

    for patient_id in df_side["id_patient"].unique():
        patient_data = df_side[df_side["id_patient"] == patient_id]
        patient_tooth_data = patient_data.groupby("tooth").max(numeric_only=True)

        max_stage = patient_tooth_data["max_stage"].max()
        max_stage_count = (patient_tooth_data["tooth_stage"] == max_stage).sum()
        total_teeth = patient_tooth_data.shape[0]

        percent_max_stage = (max_stage_count / total_teeth * 100) if total_teeth > 0 else 0
        extent = int(percent_max_stage >= 30)

        extent_map[patient_id] = extent
        percent_map[patient_id] = percent_max_stage

    df["extent"] = df["id_patient"].map(extent_map)
    df["percent_max_stage"] = df["id_patient"].map(percent_map)

    return df

def _get_summary_data(
    train: pd.DataFrame, 
    test: pd.DataFrame, 
    val: pd.DataFrame, 
    predictor: list = ["max_stage", "grade", "extent"]
    ) -> pd.DataFrame:
    """
    Summarize predictor counts and percentages across train, test, and validation sets.

    Returns:
        pd.DataFrame with columns:
        ['predictor', 'category', 'train', 'test', 'val', 'train_percent', 'test_percent', 'val_percent']
    """
    records = []

    for pred in predictor:
        # Aggregate per patient
        train_counts = train.groupby("id_patient")[pred].first().value_counts().sort_index()
        test_counts  = test.groupby("id_patient")[pred].first().value_counts().sort_index()
        val_counts   = val.groupby("id_patient")[pred].first().value_counts().sort_index()

        all_categories = sorted(set(train_counts.index) | set(test_counts.index) | set(val_counts.index))

        for cat in all_categories:
            train_n = train_counts.get(cat, 0)
            test_n  = test_counts.get(cat, 0)
            val_n   = val_counts.get(cat, 0)

            train_pct = round(train_n / train_counts.sum() * 100, 1) if train_counts.sum() else 0
            test_pct  = round(test_n  / test_counts.sum() * 100, 1) if test_counts.sum() else 0
            val_pct   = round(val_n   / val_counts.sum() * 100, 1) if val_counts.sum() else 0

            records.append({
                "predictor": pred,
                "category": cat,
                "train": train_n,
                "test": test_n,
                "val": val_n,
                "train_percent": train_pct,
                "test_percent": test_pct,
                "val_percent": val_pct
            })

    return pd.DataFrame(records)


In [5]:
test_df = assign_stage(test_df)
train_df = assign_stage(train_df)
val_df= assign_stage(val_df)

test_df = assign_grade_by_patient(test_df)
train_df = assign_grade_by_patient(train_df)
val_df = assign_grade_by_patient(val_df)

train_df= assign_extent(train_df)
test_df= assign_extent(test_df)
val_df= assign_extent(val_df)

NameError: name 'assign_stage' is not defined

In [None]:
val_df.to_csv("val_df.csv", index=False)
train_df.to_csv("train_df.csv", index=False)
test_df.to_csv("test_df.csv", index=False)

In [None]:
summary=_get_summary_data(train_df, test_df, val_df)


In [None]:
summary.to_latex("summary.tex", index=False, float_format="%.1f", escape=False)

In [None]:
df = pd.read_excel("Revision/Dataset_Leuven.xlsx", header=0)

In [None]:
import pandas as pd

# Ensure both columns are datetime
df["ExaminationDate"] = pd.to_datetime(df["ExaminationDate"])
df["ExaminationDate_rev"] = pd.to_datetime(df["ExaminationDate_rev"])

# Calculate month difference (rounded down)
df["months_between"] = (
    (df["ExaminationDate_rev"].dt.year - df["ExaminationDate"].dt.year) * 12 +
    (df["ExaminationDate_rev"].dt.month - df["ExaminationDate"].dt.month)
)


In [None]:
df.describe()

Unnamed: 0,Tooth,PD,Mobility,BOP,RootNumber,Toothtype,Restoration,ID,Age,ExaminationDate,...,BOP_rev,ExaminationDate_rev,Furcation,Furcation_rev,BodyMassIndex,CigaretteNumber,Stresslvl,Sickdays_Year,Plaque (%),months_between
count,14464.0,14374.0,14380.0,14380.0,13726.0,13726.0,13459.0,14380.0,14380.0,14380,...,13993.0,13993,579.0,371.0,6888.0,2490.0,14050.0,0.0,0.0,13909.0
mean,29.169939,3.669125,1.695688,1.676843,1.350794,1.837899,0.719816,72553210.0,49.011822,2022-12-22 09:24:41.057023744,...,1.229758,2023-08-09 12:15:04.566569216,1.181347,1.272237,26.561463,10.86747,4.54427,,,7.645913
min,11.0,0.0,0.0,1.0,1.0,1.0,0.0,60205610.0,24.0,2020-12-16 00:00:00,...,1.0,2021-06-01 00:00:00,0.0,0.0,19.05,2.0,2.0,,,3.0
25%,21.0,2.0,1.0,1.0,1.0,1.0,0.0,63777480.0,38.0,2022-02-22 00:00:00,...,1.0,2022-10-26 00:00:00,1.0,1.0,22.77,6.0,4.0,,,5.0
50%,31.0,3.0,2.0,2.0,1.0,2.0,1.0,71606350.0,51.0,2023-01-18 00:00:00,...,1.0,2023-10-04 00:00:00,1.0,1.0,25.9,10.0,5.0,,,7.0
75%,41.0,5.0,2.0,2.0,2.0,3.0,1.0,80913200.0,60.0,2023-11-28 00:00:00,...,1.0,2024-06-18 00:00:00,2.0,2.0,28.37,15.0,5.0,,,9.0
max,48.0,15.0,2.0,2.0,2.0,3.0,2.0,89851970.0,76.0,2024-06-03 00:00:00,...,2.0,2024-10-14 00:00:00,3.0,3.0,40.0,20.0,8.0,,,32.0
std,11.43616,1.731837,0.562916,0.467698,0.477236,0.838349,0.515651,9608890.0,13.486381,,...,0.420692,,0.665458,0.513346,4.762956,4.981858,1.188209,,,4.295736
