In [1]:
from Training_Helper_Functions import *
from Preprocessing_Functions import * 
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
import numpy as np
import optuna
from torch import optim
import pandas as pd

from sklearn.model_selection import train_test_split
random_state = 42 
raw_dataset = pd.read_csv("./original_dataset/processed_data_encoded.csv") #data has X and Y, community 0-9
# X = raw_dataset.drop(columns=["BMI", "TCTG", "DR"])
X = raw_dataset.drop(columns=["DR"])
Y = pd.DataFrame(raw_dataset["DR"])
X_FOR_FOLDS, X_FINAL_TEST, Y_FOR_FOLDS, Y_FINAL_TEST = train_test_split(X, Y, test_size=0.1, random_state=random_state, stratify=Y)
df = pd.concat([X_FOR_FOLDS, Y_FOR_FOLDS], axis=1)
df.reset_index(drop=True, inplace=True)
df.to_csv("./DATA/training_set/training_data_for_folds.csv", index=False)
df_test = pd.concat([X_FINAL_TEST, Y_FINAL_TEST], axis=1)
df_test.reset_index(drop=True, inplace=True)
df_test.to_csv("./DATA/holdout_set/holdout_data.csv", index=False)


Using cuda


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.evaluation.single_table import get_column_plot
import sdv
from sdv.metadata import Metadata

def get_bmi_i(df):
    # Calculate BMI for both training and test sets
    df['BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2)

    return df

def get_TCTG_i(df):
    # Calculate TCTG for both training and test sets
    df['TCTG'] = df['TC'] / df['TG']
    return df
def Synthetic_Data_Generator(df_train, synthesizer = "TVAE", epochs = 200, batch_size = 128, n_synthetic_data = 1000): 
    """Conditions: "balanced" or None"""
    df_train = df_train.drop(columns=["BMI", "TCTG"])
    metadata = Metadata.detect_from_dataframe(data=df_train)
    metadata.validate()
    
    #* Synthetic Data generation conditions
    condition_list = []
    #* Synthesizer setup
    if synthesizer == "CTGAN":
        filepath = f"{synthesizer}_{epochs}.pkl"
        synthesizer = CTGANSynthesizer(
                                metadata=metadata, 
                                enforce_min_max_values=True, 
                                enforce_rounding=True, 
                                epochs = epochs,
                                verbose=True, 
                                cuda=True,
                                batch_size=300 # need to be divisible by 10 or pac size
                                )  
        # df_train = make_divisible(df_train, 10)
    elif synthesizer == "TVAE":
        filepath = f"{synthesizer}_{epochs}.pkl"
        synthesizer = TVAESynthesizer(
                                metadata=metadata, 
                                enforce_min_max_values=True, 
                                enforce_rounding=True, 
                                epochs = epochs,
                                verbose=True, 
                                cuda=True,
                                batch_size=batch_size,
                                )
    else:
        return df_train
    print("Balancing condition applied")
    
    # Step 1: Fit the synthesizer
    synthesizer.fit(df_train)
    
    synthesizer.save(filepath)
    # Step 2: Get class counts
    # Step 1: Get class counts
    count_0 = df_train[df_train['DR'] == 0].shape[0]
    count_1 = df_train[df_train['DR'] == 1].shape[0]

    # Step 2: Balance to the max count
    balanced_per_class = max(count_0, count_1)

    cond_0 = Condition(column_values={'DR': 0}, num_rows=balanced_per_class - count_0)
    cond_1 = Condition(column_values={'DR': 1}, num_rows=balanced_per_class - count_1)

    balanced_data = synthesizer.sample_from_conditions([cond_0, cond_1])

    # Step 3: Add more *evenly* on top to hit n_synthetic_data
    # Note: You already have (balanced_per_class * 2) at this point

    current_total = balanced_per_class * 2
    remaining = n_synthetic_data - current_total

    # Split remaining evenly across classes
    extra_per_class = remaining // 2

    # (optional: +1 to one class if remaining is odd)
    cond_extra_0 = Condition(column_values={'DR': 0}, num_rows=extra_per_class)
    cond_extra_1 = Condition(column_values={'DR': 1}, num_rows=remaining - extra_per_class)

    extra_data = synthesizer.sample_from_conditions([cond_extra_0, cond_extra_1])

    # Step 5: Combine all the synthetic garbage
    synthetic_data = pd.concat([balanced_data, extra_data], ignore_index=True)
    quality_report = evaluate_quality(df_train, synthetic_data, metadata)
    synthetic_data = get_bmi_i(synthetic_data)
    synthetic_data = get_TCTG_i(synthetic_data)
    
    # Ensure folder exists

    # Save to specific file
    
    synthetic_data.to_csv(f"./DATA/synthetic_training_set/synthetic_data_{epochs}_TVAE.csv", index=False)

    # synthetic_data.to_csv('./synthetic_dataset/synthetic_data2.csv', index=False)
    df_train = pd.concat([synthetic_data, df_train], ignore_index=True)
    return df_train

def FOLDS_GENERATOR_Synthetic(dataset, n_splits=5, random_state=42, 
                    OD_majority=None, OD_minority=None,
                    synthesizer = "TVAE", epochs = 200, n_synthetic_data=0, 
                    scaler=None):
    
    cont_cols = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG', 
                 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c', 'Height', 'Weight', 'BMI', 'Duration']
    # Use the original encoded single column name here
    cat_cols = ['Gender', 'Community'] 
    y_col = 'DR'
    
    kF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    kFolds_list = []

    # Convert column names to strings to ensure compatibility
    df = dataset.copy()
    X = df.drop(columns=["DR"])
    Y = pd.DataFrame(df["DR"])

    for fold, (train_idx, test_idx) in enumerate(kF.split(X, Y)):
        # Split the data into training and testing sets for this fold
        train = pd.concat([X.iloc[train_idx], Y.iloc[train_idx]], axis=1)
        test = pd.concat([X.iloc[test_idx], Y.iloc[test_idx]], axis=1)
        
        #* OUTLIER DETECTION
        X_train_processed = Outlier_Removal(train, 
                                            OD_majority=OD_majority,
                                            OD_minority=OD_minority,
                                            )
        
        #* OVERSAMPLING & SYNTHETIC DATA GENERATION
        print("Before oversampling & synthetic data:", X_train_processed[["DR"]].value_counts())
        X_train_processed = Synthetic_Data_Generator(X_train_processed, synthesizer=synthesizer, epochs=epochs, batch_size=512, n_synthetic_data=n_synthetic_data)
            
        print("After oversampling & synthetic data:", X_train_processed[["DR"]].value_counts())
        
        #* Calculate BMI & ENCODING
        # X_train_processed, test = get_bmi(X_train_processed, test)
        # X_train_processed, test = get_TCTG(X_train_processed, test)
        X_train_processed, test = apply_one_hot_encoding(X_train_processed, test)
        #* Scaler
        X_train_processed[cont_cols] = scaler.fit_transform(X_train_processed[cont_cols])
        test[cont_cols] = scaler.transform(test[cont_cols])
        # Append processed data (excluding the target column 'DR')
        
        
        kFolds_list.append((
                            X_train_processed.drop(columns=['DR']),
                            test.drop(columns=['DR']),
                            X_train_processed['DR'].values.reshape(-1, 1),  # Ensures the target is 2D
                            test['DR'].values.reshape(-1, 1)  # Ensures the target is 2D
                        ))
        break
    print(f"Fold: {fold+1}, Train: {X_train_processed.drop(columns=['DR']).shape}, Test: {test.drop(columns=['DR']).shape}")
    return kFolds_list



In [3]:
scaler = RobustScaler()
kFolds = FOLDS_GENERATOR_Synthetic(df, n_splits=5, random_state=42,             
                            # OD_majority = IsolationForest(contamination=trial.suggest_float("contamination_majority", 0.01, 0.4), random_state=random_state, 
                            OD_majority = IQRDetector(factor=1),
                            OD_minority = IQRDetector(factor=1),
         
                            synthesizer = "TVAE",
                            epochs = 1000,
                            n_synthetic_data = 10000,
                            scaler=scaler,      
                            ) 

Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 1911
After OD, minority: 210
Before oversampling & synthetic data: DR 
0.0    1911
1.0     210
Name: count, dtype: int64
Balancing condition applied


Loss: 2.715: 100%|██████████| 1000/1000 [01:49<00:00,  9.13it/s]
Sampling conditions: 100%|██████████| 1701/1701 [00:00<00:00, 3796.88it/s]
Sampling conditions: 100%|██████████| 6178/6178 [00:00<00:00, 8761.30it/s] 


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 18/18 [00:00<00:00, 59.56it/s]|
Column Shapes Score: 84.38%

(2/2) Evaluating Column Pair Trends: |██████████| 153/153 [00:00<00:00, 347.19it/s]|
Column Pair Trends Score: 86.75%

Overall Score (Average): 85.56%

After oversampling & synthetic data: DR 
0.0    5000
1.0    5000
Name: count, dtype: int64
Fold: 1, Train: (10000, 28), Test: (1149, 28)


In [4]:
df = pd.read_csv("./DATA/synthetic_training_set/synthetic_data_2000_TVAE.csv")
df.describe()

Unnamed: 0,Age,Gender,Community,UAlb,Ucr,UACR,TC,TG,LDLC,HDLC,Scr,BUN,FPG,HbA1c,Height,Weight,Duration,DR,BMI,TCTG
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,63.6144,0.5931,4.0921,13.71096,3713.2622,14.00539,5.159593,1.371252,3.222766,1.321891,58.4201,5.5647,8.76655,7.19594,161.0575,62.70339,7.51883,0.2295,24.124111,4.566567
std,6.88734,0.49128,2.94236,11.472183,5160.705713,12.222573,0.702019,0.653386,0.65713,0.249,13.262753,1.221509,2.23005,1.284794,6.806881,7.465654,5.718972,0.420532,1.956432,1.955758
min,46.0,0.0,0.0,0.1,1.0,0.1,3.26,0.4,1.33,0.73,30.0,3.0,4.4,4.4,144.0,40.0,0.1,0.0,16.992187,1.074194
25%,59.0,0.0,2.0,6.6,1.0,5.9,4.67,0.87,2.76,1.13,48.0,4.6,7.0,6.1,157.0,57.5,2.1,0.0,22.851562,2.919511
50%,64.0,1.0,4.0,10.4,13.0,9.8,5.14,1.15,3.22,1.29,56.0,5.5,8.5,7.0,160.0,61.8,6.5,0.0,24.074668,4.405405
75%,68.25,1.0,7.0,16.7,7674.0,18.0,5.63,1.78,3.68,1.51,68.0,6.4,10.2,8.2,167.0,67.7,12.3,0.0,25.39032,5.866837
max,79.0,1.0,9.0,102.7,19307.0,118.7,7.37,3.3,5.1,1.99,90.0,9.2,15.7,11.0,180.0,83.0,25.0,1.0,32.965661,13.925


In [5]:
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.evaluation.single_table import get_column_plot
import sdv
from sdv.metadata import Metadata

synthetic_data = read_csv("./synthetic_dataset/synthetic_data_only.csv")
metadata = Metadata.detect_from_dataframe(data=real_data)
metadata.validate()
metadata.visualize()
# 1. perform basic validity checks
diagnostic = run_diagnostic(df, synthetic_data, metadata)

# 2. measure the statistical similarity
quality_report = evaluate_quality(real_data, synthetic_data, metadata)

# # 3. plot the data
# fig = get_column_plot(
#     real_data=real_data,
#     synthetic_data=synthetic_data,
#     metadata=metadata,
#     column_name='Age' #change this u decide

NameError: name 'read_csv' is not defined