In [1]:
from Training_Helper_Functions import *
from Preprocessing_Functions2 import * 
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import pandas as pd

from sklearn.model_selection import train_test_split

Using cuda


In [2]:
random_state = 42
import os
os.makedirs("./DATA/folds", exist_ok=True)
os.makedirs("./DATA/holdout_set", exist_ok=True)
os.makedirs("./DATA/synthetic_training_set", exist_ok=True)
os.makedirs("./DATA/training_set", exist_ok=True)
# Load and split dataset
data = pd.read_csv("./original_dataset/processed_data_encoded.csv")
X = data.drop(columns=["DR"])  # Keep BMI and TCTG if you're removing them later
Y = data["DR"]

X_folds, X_test, Y_folds, Y_test = train_test_split(X, Y, test_size=0.1, stratify=Y, random_state=random_state)

# Save training and holdout sets
pd.concat([X_folds, Y_folds], axis=1).reset_index(drop=True).to_csv("./DATA/training_set/training_data.csv", index=False)
pd.concat([X_test, Y_test], axis=1).reset_index(drop=True).to_csv("./DATA/holdout_set/holdout_data.csv", index=False)

# Apply encoding
df_train, df_test = apply_one_hot_encoding(
    pd.concat([X_folds, Y_folds], axis=1).reset_index(drop=True),
    pd.concat([X_test, Y_test], axis=1).reset_index(drop=True)
)

df_test.to_csv("./DATA/holdout_set/holdout_data_OHE.csv", index=False) #only do OHE for holdout set for now

In [3]:
def Outlier_Removal(df_train, OD_majority, OD_minority): 
    cont_cols = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG', 
                 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c', 'Height', 'Weight', 'BMI', 'Duration']
    # Use the original encoded single column name here
    cat_cols = ['Gender', 'Community'] 
    y_col = 'DR'

    print("Original class distribution:",df_train[y_col].value_counts())
    assert y_col in df_train.columns, f"'{y_col}' column is missing in the DataFrame."
    
    #* OUTLIER DETECTION START
    available_cont_cols = [col for col in cont_cols if col in df_train.columns]
    df_majority = df_train[df_train[y_col] == 0].copy()
    df_minority = df_train[df_train[y_col] == 1].copy()
    if OD_majority is not None:
        outliers_majority = OD_majority.fit_predict(df_majority[available_cont_cols])
        df_majority = df_majority[outliers_majority == 1]
        print(f"After OD, majority: {len(df_majority)}")
    if OD_minority is not None:
        outliers_minority = OD_minority.fit_predict(df_minority[available_cont_cols])
        df_minority = df_minority[outliers_minority == 1]
        print(f"After OD, minority: {len(df_minority)}")
    df_after_OD = pd.concat([df_majority, df_minority], ignore_index=True)
    #* OUTLIER DETECTION END - df_after_OD is the new df
    return df_after_OD

from imblearn.over_sampling import SMOTENC
def apply_smotenc_oversampling(df_train):
    
    cont_cols = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG', 
                 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c', 'Height', 'Weight', 'BMI', 'Duration']
    # Use the original encoded single column name here
    cat_cols = ['Gender', 'Community'] 
    y_col = 'DR'

    print("\nApplying SMOTENC oversampling...")

    # Split features and label
    X = df_train.drop(columns=[y_col])
    y = df_train[y_col]

    # Find indices of categorical features
    cat_indices = [X.columns.get_loc(col) for col in cat_cols if col in X.columns]

    # Ensure 'Community' is integer type if present
    if 'Community' in X.columns:
        X['Community'] = X['Community'].astype(int)

    oversampler = SMOTENC(categorical_features=cat_indices, random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    print(pd.DataFrame(X_resampled, columns=X.columns).describe())
    print("\nFinal class distribution:", pd.Series(y_resampled).value_counts())

    # Recombine into a single DataFrame
    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[y_col] = y_resampled

    return df_resampled

In [4]:
from sklearn.model_selection import StratifiedKFold
def FOLDS_GENERATOR1(df:pd.DataFrame, n_splits=5, random_state=42, OD_majority=None, OD_minority=None,
                    synthesizer = "TVAE", epochs = 100, batch_size=512, n_synthetic_data=0,
                    scaler=None):
    """
    Generates n_splits folds for cross-validation.
    """
    cont_cols = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG', 
                 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c', 'Height', 'Weight', 'BMI', 'Duration']
    cat_cols = ['Gender', 'Community'] 
    # Create a StratifiedKFold object
    kF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Initialize a list to hold the folds
    df_copy = df.copy()
    X = df_copy.drop(columns=["DR"])
    Y = df[["DR"]]
    X = X.drop(columns=["BMI", "TCTG"])

    kFolds_list = []
    for fold, (train_idx, test_idx) in enumerate(kF.split(X, Y)):
        # Split the data into training and testing sets for this fold
        train = pd.concat([X.iloc[train_idx], Y.iloc[train_idx]], axis=1)
        test = pd.concat([X.iloc[test_idx], Y.iloc[test_idx]], axis=1)
        
        # #* OUTLIER DETECTION
        X_train_processed = Outlier_Removal(train, 
                                            OD_majority=OD_majority,
                                            OD_minority=OD_minority,
                                            )
        X_train_processed = apply_smotenc_oversampling(X_train_processed)
        # #* OVERSAMPLING & SYNTHETIC DATA GENERATION
        print("Before oversampling & synthetic data:", X_train_processed[["DR"]].value_counts())
        X_train_processed = Synthetic_Data_Generator2(X_train_processed, fold, synthesizer=synthesizer, epochs=epochs, batch_size=512, n_synthetic_data=n_synthetic_data)
        
        
        print("After oversampling & synthetic data:", X_train_processed[["DR"]].value_counts())
        X_train_processed = train.copy()
        
        #* Calculate BMI, TCTG & ENCODING
        X_train_processed, test = get_bmi(X_train_processed, test)
        X_train_processed, test = get_TCTG(X_train_processed, test)
        X_train_processed, test = apply_one_hot_encoding(X_train_processed, test)
        #* Scaler
        X_train_processed[cont_cols] = scaler.fit_transform(X_train_processed[cont_cols])
        test[cont_cols] = scaler.transform(test[cont_cols])
        # Append processed data (excluding the target column 'DR')

        # Save to CSV with fold number
        X_train_processed.to_csv(f"./DATA/folds/train_fold_{fold}.csv", index=False)
        test.to_csv(f"./DATA/folds/test_fold_{fold}.csv", index=False)
        kFolds_list.append((
                            X_train_processed.drop(columns=['DR']),
                            test.drop(columns=['DR']),
                            X_train_processed['DR'].values.reshape(-1, 1),  # Ensures the target is 2D
                            test['DR'].values.reshape(-1, 1)  # Ensures the target is 2D
                        ))
        break
    print(f"Fold: {fold+1}, Train: {X_train_processed.drop(columns=['DR']).shape}, Test: {test.drop(columns=['DR']).shape}")
    
    return kFolds_list


In [6]:
# from sklearn import standardscaler
from sklearn.preprocessing import StandardScaler
# scaler = RobustScaler()
scaler = StandardScaler()
training_set = pd.read_csv("./DATA/training_set/training_data.csv")

kFolds = FOLDS_GENERATOR1(training_set, n_splits=5, random_state=42,             
                            OD_majority = IQRDetector(factor=1),
                            OD_minority = IQRDetector(factor=2.5),
                            synthesizer = "TVAE",
                            epochs = 1000,
                            n_synthetic_data = 10000,
                            scaler=scaler,      
                            ) 

Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 2136
After OD, minority: 377

Applying SMOTENC oversampling...
               Age       Gender    Community         UAlb           Ucr  \
count  4272.000000  4272.000000  4272.000000  4272.000000   4272.000000   
mean     63.407783     0.538858     4.034176    17.776940   3601.733876   
std       6.674629     0.498546     3.094886    24.725244   5182.172389   
min      36.000000     0.000000     0.000000     0.100000      1.000000   
25%      59.000000     0.000000     1.000000     4.600000      6.000000   
50%      63.752691     1.000000     4.000000     9.250930     11.000000   
75%      68.000000     1.000000     7.000000    19.664959   6899.750000   
max      87.000000     1.000000     9.000000   174.800000  19307.000000   

              UACR           TC           TG         LDLC         HDLC  \
count  4272.000000  4272.000000  4272.000000  4272.000000  4272.000000   
mean     19

Loss: 3.985: 100%|██████████| 1000/1000 [01:50<00:00,  9.01it/s]


Generating synthetic samples per class based on distribution...


Sampling conditions: 100%|██████████| 5000/5000 [00:00<00:00, 16825.93it/s]
Sampling conditions: 100%|██████████| 5000/5000 [00:00<00:00, 16922.87it/s]


Final synthetic class distribution:
DR
0.0    5000
1.0    5000
Name: count, dtype: int64
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 18/18 [00:00<00:00, 31.27it/s]|
Column Shapes Score: 91.57%

(2/2) Evaluating Column Pair Trends: |██████████| 153/153 [00:00<00:00, 429.19it/s]|
Column Pair Trends Score: 93.84%

Overall Score (Average): 92.7%

After oversampling & synthetic data: DR 
0.0    7136
1.0    7136
Name: count, dtype: int64
Fold: 1, Train: (4593, 28), Test: (1149, 28)
