In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score


class Ivan_NN(nn.Module):
    """Deep fully connected network with three projected skip connections.

    The network maps ``input_dim`` features to a single raw logit per sample
    (no final sigmoid is applied). Layer widths are:
    input -> 2180 -> 888 -> 1122 -> 624 -> 1080 -> 512 -> 128 -> 36 -> 64 -> 1.
    Linear projections carry the 2180-, 1122- and 128-wide activations forward
    and are added to the outputs of block2, block6 and block8 respectively.
    """

    @staticmethod
    def _dense_block(n_in, n_out, activation, drop_p):
        """Build Linear -> BatchNorm1d -> activation -> Dropout as one Sequential."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            activation,
            nn.Dropout(drop_p),
        )

    def __init__(self, input_dim):
        super().__init__()
        # Input stem; its activation is applied functionally in forward().
        self.input_fc = nn.Linear(input_dim, 2180)
        self.input_bn = nn.BatchNorm1d(2180)
        self.drop = nn.Dropout(0.3)

        # Hidden stack. Creation order is kept so parameter names and
        # initialisation order stay the same.
        self.block1 = self._dense_block(2180, 888, nn.LeakyReLU(), 0.3)
        self.block2 = self._dense_block(888, 1122, nn.LeakyReLU(), 0.3)
        self.block3 = self._dense_block(1122, 624, nn.LeakyReLU(), 0.3)
        self.block4 = self._dense_block(624, 1080, nn.LeakyReLU(), 0.3)
        self.block5 = self._dense_block(1080, 512, nn.LeakyReLU(), 0.3)
        self.block6 = self._dense_block(512, 128, nn.LeakyReLU(), 0.3)
        self.block7 = self._dense_block(128, 36, nn.LeakyReLU(), 0.2)
        # The last hidden block uses a sigmoid activation and lighter dropout.
        self.block8 = self._dense_block(36, 64, nn.Sigmoid(), 0.1)

        self.output = nn.Linear(64, 1)

        # Skip connection projectors (resize the skipped activation to the
        # width of the block output it is added to).
        self.skip1_proj = nn.Sequential(nn.Linear(2180, 1122))
        self.skip2_proj = nn.Sequential(nn.Linear(1122, 128))
        self.skip3_proj = nn.Sequential(nn.Linear(128, 64))

    def forward(self, x):
        """Return one raw logit per row of ``x``."""
        stem = self.drop(
            torch.nn.functional.leaky_relu(self.input_bn(self.input_fc(x)))
        )

        # block1 -> block2, plus the projected stem activation.
        merged_a = self.block2(self.block1(stem)) + self.skip1_proj(stem)

        # block3 .. block6, plus the projected first merge.
        hidden = self.block3(merged_a)
        hidden = self.block4(hidden)
        hidden = self.block5(hidden)
        merged_b = self.block6(hidden) + self.skip2_proj(merged_a)

        # block7 -> block8, plus the projected second merge.
        merged_c = self.block8(self.block7(merged_b)) + self.skip3_proj(merged_b)

        return self.output(merged_c)


def cleanRawData(filename:str = "./data/raw_data.csv", colsToDrop:list[str] = ["name", "DR1", "DR2", "image1", "image2", "CSME1", "HBPRP1", "CSME2", "HBPRP2", "WorseDR", "ACR"]) -> pd.DataFrame:
    """Load the raw CSV and return an all-numeric frame.

    Steps, in order:
      1. read ``filename`` and drop every row containing a missing value
         (this happens before ``colsToDrop`` is removed, so gaps in those
         columns also discard the row);
      2. drop ``colsToDrop``;
      3. recode ``gender`` value 2 as 0 and cast the column to float;
      4. one-hot encode ``community`` into float columns appended on the right;
      5. strip thousands separators (",") from remaining text columns and cast
         them to float;
      6. discard rows that contain +/- infinity.

    Args:
        filename: Path of the CSV file to read.
        colsToDrop: Column labels removed after the missing-value filter. The
            default list is never mutated here.

    Returns:
        The cleaned DataFrame; the original row index is preserved.

    Raises:
        KeyError: if ``gender``, ``community`` or any label in ``colsToDrop``
            is missing from the file.
        ValueError: if a text column still holds non-numeric content after the
            commas are removed.
    """
    rawData = pd.read_csv(filename)
    completeRows = rawData.dropna()
    trimmed = completeRows.drop(columns=colsToDrop)
    #? Change gender to binary instead of 1s and 2s
    trimmed["gender"] = trimmed["gender"].replace(2, 0).astype(float)
    #? Hot-encode Categorical column
    community = pd.get_dummies(trimmed["community"]).astype(float)
    #? remove original categorical column and join Hotencoded columns
    notSoRawData = pd.concat([trimmed.drop(columns=["community"]), community], axis=1)
    for col in notSoRawData.columns.tolist():
        column = notSoRawData[col]
        # Text may be stored as ``object`` or as a dedicated pandas string
        # dtype depending on the pandas version/options; a bare
        # ``dtype == 'object'`` test misses the latter and would leave values
        # such as "1,234" unconverted.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            notSoRawData[col] = column.str.replace(',', '', regex=False).astype(float)
    finiteRows = ~notSoRawData.isin([np.inf, -np.inf]).any(axis=1)
    return notSoRawData[finiteRows]

class TabularDataset(Dataset):
    """Map-style dataset pairing float32 feature rows with float32 target rows.

    Both frames are converted once, up front, into tensors held in memory;
    indexing returns the matching ``(features, target)`` pair.
    """

    def __init__(self, X:pd.DataFrame, y:pd.DataFrame):
        features, labels = (
            torch.tensor(frame.values, dtype=torch.float32) for frame in (X, y)
        )
        self.X = features
        self.y = labels

    def __len__(self):
        # Number of samples == number of feature rows.
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample
    

def train_model(model:Ivan_NN, nFeatures:int, train_loader, val_loader, epochs=10, learningRate = 0.0001, treshold:float = 0.5, patience = 30, device=None):
    """Train a freshly built network with per-epoch validation and checkpointing.

    Args:
        model: Callable (the model class) invoked as ``model(nFeatures)`` to
            build the network.
        nFeatures: Number of input features handed to ``model``.
        train_loader: Iterable of ``(inputs, targets)`` batches used for optimisation.
        val_loader: Iterable of ``(inputs, targets)`` batches used for validation.
        epochs: Maximum number of epochs to run.
        learningRate: Learning rate for Adam (weight decay is fixed at 1e-4).
        treshold: Cut-off applied to ``sigmoid(logit)`` to obtain 0/1 predictions.
        patience: Slack, in epochs, allowed after the best validation loss
            before early stopping may trigger.
        device: Torch device; defaults to CUDA when available, else CPU.

    Returns:
        A dict with the hyper-parameters, the per-epoch histories and the two
        checkpoints ("best initial loss model" / "best initial f1 model", each
        ``{"epoch": 1-based epoch, "stateDict": copied state dict}``).
        ``None`` is returned when the validation loader is empty or when a
        validation loss evaluates to NaN (diagnostics are printed first).

    Raises:
        ValueError: if ``train_loader`` yields no samples.

    Notes:
        Losses are sample-weighted means (batch mean * batch size, divided by
        the number of samples), so they do not depend on the batch size.
        Validation metrics are computed once per epoch over all batches.
    """
    import copy  # local import: needed only to snapshot checkpoints

    # Use GPU if available
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    theMODEL = model(nFeatures)
    theMODEL.to(device)
    print(theMODEL)

    # pos_weight must live on the same device as the logits.
    posWeight = torch.tensor(10, dtype=torch.float32).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=posWeight)
    optimizer = optim.Adam(theMODEL.parameters() ,lr=learningRate, weight_decay=1e-4)

    def _snapshot(epoch_number):
        """Return a checkpoint holding an independent copy of the weights."""
        # state_dict() hands back references to the live tensors; without the
        # deep copy every "best" checkpoint would silently track later updates.
        return {
            "epoch": epoch_number,
            "stateDict": copy.deepcopy(theMODEL.state_dict()),
        }

    def _plot_history(title, ylabel, curves):
        """Draw one figure with a line per ``(values, label, marker)`` entry."""
        plt.figure(figsize=(8, 5))
        for values, label, marker in curves:
            plt.plot(values, label=label, marker=marker)
        plt.title(title)
        plt.xlabel("Epoch")
        plt.ylabel(ylabel)
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies= []
    val_precisions = []
    val_recalls = []
    val_f1s = []
    best_val_loss = float("inf")
    # -inf guarantees the first epoch always records an F1 checkpoint, so
    # "best initial f1 model" is defined even when F1 stays at 0.
    best_val_f1 = float("-inf")
    best_loss_model = None
    best_f1_model = None
    epoch = -1  # keeps the summary well-defined when epochs == 0

    for epoch in range(epochs):
        theMODEL.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, targets in train_loader:
            optimizer.zero_grad()
            inputs = inputs.to(device)
            # reshape(-1) flattens (B, 1) to (B,) and, unlike squeeze(),
            # keeps a 1-D tensor for a batch of one sample.
            targets = targets.to(device).reshape(-1)

            # Forward pass
            outputs = theMODEL(inputs).reshape(-1)
            loss = criterion(outputs, targets)
            # Backward and optimize
            loss.backward()
            optimizer.step()

            batch_n = targets.size(0)
            running_loss += loss.item() * batch_n

            preds = (torch.sigmoid(outputs) >= treshold).float()
            correct += (preds == targets).sum().item()
            total += batch_n

        if total == 0:
            raise ValueError("train_loader yielded no samples")

        train_loss = running_loss / total
        train_acc = correct / total

        # Validation
        theMODEL.eval()
        val_loss_sum = 0.0
        val_correct = 0
        val_total = 0
        seen_targets = []
        seen_preds = []

        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = inputs.to(device)
                targets = targets.to(device).reshape(-1)
                outputs = theMODEL(inputs).reshape(-1)
                loss = criterion(outputs, targets)

                if torch.isnan(loss).item():
                    # Dump whatever is NaN to help locate the source, then abort.
                    print( "Val Loss:" ,loss)
                    in_rows = torch.isnan(inputs).any(dim=1)
                    print(inputs[in_rows], outputs[torch.isnan(outputs)], targets[torch.isnan(targets)],sep="\n")
                    return

                batch_n = targets.size(0)
                val_loss_sum += loss.item() * batch_n

                preds = (torch.sigmoid(outputs) >= treshold).float()
                val_correct += (preds == targets).sum().item()
                val_total += batch_n

                # Keep CPU copies so metrics cover every batch and sklearn
                # never receives CUDA tensors.
                seen_targets.append(targets.cpu())
                seen_preds.append(preds.cpu())

        if val_total == 0:
            print("Empty validation set!")
            return

        val_loss = val_loss_sum / val_total
        val_acc = val_correct / val_total

        y_true = torch.cat(seen_targets).numpy()
        y_pred = torch.cat(seen_preds).numpy()
        f1Score = f1_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=.0)
        recall = recall_score(y_true, y_pred)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)
        val_precisions.append(precision)
        val_recalls.append(recall)
        val_f1s.append(f1Score)

        print(f"Epoch [{epoch+1}/{epochs}]".ljust(16),
              f"Train Loss: {train_loss:.6f} Acc: {train_acc:.6f} | "
              f"Val Loss: {val_loss:.6f} Acc: {val_acc:.2%} Precision: {precision:.2f} Recall: {recall:.2f} F1: {f1Score:.2f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_loss_model = _snapshot(epoch + 1)
            print("Best loss model saved!")
        # NOTE(review): early stopping is only considered on epochs where F1
        # did not improve (if/elif pairing kept as originally written).
        if f1Score > best_val_f1:
            best_val_f1 = f1Score
            best_f1_model = _snapshot(epoch + 1)
            print("Best f1 model saved!")
        elif val_loss > best_val_loss and epoch > best_loss_model["epoch"] + patience:
            print(f"Early stopping... Current Val Loss:{val_loss}  vs  Best Val Loss: {best_val_loss}")
            break

    _plot_history("Training vs. Val Loss", "Loss", [
        (train_losses, 'Train Loss', '^'),
        (val_losses, 'Val Loss', 'x'),
    ])
    _plot_history("Training vs. Val Acc", "Accuracy", [
        (train_accuracies, 'Train Acc', '^'),
        (val_accuracies, 'Val Acc', 'x'),
    ])
    _plot_history("Precision, Recall, F1", "Score", [
        (val_precisions, 'Val Precision', '^'),
        (val_recalls, 'Val Recall', 'x'),
        (val_f1s, 'Val F1', '|'),
    ])

    trainingData = {
        "learningRate": learningRate,
        "epochs": (epoch, epochs),
        "treshold": treshold,
        "initial training loss": train_losses,
        "initial training acc": train_accuracies,
        "initial validation loss": val_losses,
        "initial validation acc": val_accuracies,
        "initial validation precision": val_precisions,
        "initial validation recall": val_recalls,
        "initial validation F1":val_f1s, 
        "best initial loss model": best_loss_model,
        "best initial f1 model": best_f1_model,
    }
    return trainingData

    
def train_loss_model(model:Ivan_NN, nFeatures:int, bestModelData:dict, trainVal_loader, test_loader, epochs=20, learningRate = 0.00001, treshold:float = 0.5, device=None):
    """Continue training the best-validation-loss checkpoint, then score it on the test set.

    The network is rebuilt with ``model(nFeatures)``, initialised from
    ``bestModelData["stateDict"]`` and trained on ``trainVal_loader`` for
    ``bestModelData["epoch"]`` epochs before a single evaluation pass over
    ``test_loader``.

    Args:
        model: Callable (the model class) invoked as ``model(nFeatures)``.
        nFeatures: Number of input features handed to ``model``.
        bestModelData: Checkpoint dict with keys ``"epoch"`` and ``"stateDict"``.
        trainVal_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` test batches.
        epochs: Kept for interface compatibility; the number of epochs run is
            taken from ``bestModelData["epoch"]``.
        learningRate: Learning rate for Adam (weight decay is fixed at 1e-4).
        treshold: Cut-off applied to ``sigmoid(logit)`` to obtain 0/1 predictions.
        device: Torch device; defaults to CUDA when available, else CPU.

    Returns:
        dict with the training histories, the test loss/accuracy/precision/
        recall/F1 and the final ``state_dict`` under "final model".

    Raises:
        ValueError: if either loader yields no samples.

    Notes:
        Losses are sample-weighted means and the test metrics are computed
        once over every test batch.
    """
    # Use GPU if available
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Loading best loss model State from epoch {bestModelData['epoch']}")
    finalModel = model(nFeatures)
    finalModel.load_state_dict(bestModelData["stateDict"])
    # Model and pos_weight have to share the device the batches are moved to.
    finalModel.to(device)
    posWeight = torch.tensor(10, dtype=torch.float32).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=posWeight)
    optimizer = optim.Adam(finalModel.parameters() ,lr=learningRate, weight_decay=1e-4)

    train_losses = []
    train_accuracies = []
    planned_epochs = bestModelData["epoch"]

    for epoch in range(planned_epochs):
        finalModel.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, targets in trainVal_loader:
            optimizer.zero_grad()
            inputs = inputs.to(device)
            # reshape(-1) flattens (B, 1) to (B,) and stays 1-D for B == 1.
            targets = targets.to(device).reshape(-1)

            # Forward pass
            outputs = finalModel(inputs).reshape(-1)
            loss = criterion(outputs, targets)
            # Backward and optimize
            loss.backward()
            optimizer.step()

            batch_n = targets.size(0)
            running_loss += loss.item() * batch_n

            preds = (torch.sigmoid(outputs) >= treshold).float()
            correct += (preds == targets).sum().item()
            total += batch_n

        if total == 0:
            raise ValueError("trainVal_loader yielded no samples")

        train_loss = running_loss / total
        train_acc = correct / total
        # The denominator is the number of epochs that will actually run.
        print(f"Epoch [{epoch+1}/{planned_epochs}]".ljust(16),
              f"Train Loss: {train_loss:.6f} Acc: {train_acc:.6f} | ")

        train_losses.append(train_loss)
        train_accuracies.append(train_acc)

    finalModel.eval()
    test_loss_sum = 0.0
    test_total = 0
    seen_targets = []
    seen_preds = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device)
            targets = targets.to(device).reshape(-1)
            outputs = finalModel(inputs).reshape(-1)

            batch_n = targets.size(0)
            test_loss_sum += criterion(outputs, targets).item() * batch_n
            test_total += batch_n

            preds = (torch.sigmoid(outputs) >= treshold).float()
            # CPU copies: sklearn cannot consume CUDA tensors.
            seen_targets.append(targets.cpu())
            seen_preds.append(preds.cpu())

    if test_total == 0:
        raise ValueError("test_loader yielded no samples")

    test_loss = test_loss_sum / test_total
    y_true = torch.cat(seen_targets).numpy()
    y_pred = torch.cat(seen_preds).numpy()
    test_f1Score = f1_score(y_true, y_pred)
    test_precision = precision_score(y_true, y_pred)
    test_recall = recall_score(y_true, y_pred)
    test_accuracy = accuracy_score(y_true, y_pred)

    report_lines = [
        f"Loss: {test_loss:.6f}", 
        f"Accuracy: {test_accuracy:.2%}", 
        f"Precision: {test_precision:.2f}", 
        f"Recall: {test_recall:.2f}", 
        f"F1 Score: {test_f1Score:.2f}",
    ]
    wwidth = 30
    print("Test Results".center(16).center(wwidth,"="))
    for line in report_lines:
        print(line.ljust(16).center(wwidth-2).center(wwidth,"|"))

    plt.figure(figsize=(8, 5))
    plt.plot(train_losses, label='Train Loss', marker='^')
    plt.title("Training Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(8, 5))
    plt.plot(train_accuracies, label='Train Acc', marker='^')
    plt.title("Training Acc")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


    loss_Model = {
        "final training loss": train_losses,
        "final training acc": train_accuracies,
        "Test loss": test_loss,
        "Test acc": test_accuracy,
        "Test precision": test_precision,
        "Test recall": test_recall,
        "Test F1": test_f1Score,
        "final model": finalModel.state_dict(),
    }


    return loss_Model

def train_f1_model(model:Ivan_NN, nFeatures:int, bestModelData:dict, trainVal_loader, test_loader, epochs=20, learningRate = 0.00001, treshold:float = 0.5, device=None):
    """Continue training the best-validation-F1 checkpoint, then score it on the test set.

    The network is rebuilt with ``model(nFeatures)``, initialised from
    ``bestModelData["stateDict"]`` and trained on ``trainVal_loader`` for
    ``bestModelData["epoch"]`` epochs before a single evaluation pass over
    ``test_loader``.

    Args:
        model: Callable (the model class) invoked as ``model(nFeatures)``.
        nFeatures: Number of input features handed to ``model``.
        bestModelData: Checkpoint dict with keys ``"epoch"`` and ``"stateDict"``.
        trainVal_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` test batches.
        epochs: Kept for interface compatibility; the number of epochs run is
            taken from ``bestModelData["epoch"]``.
        learningRate: Learning rate for Adam (weight decay is fixed at 1e-4).
        treshold: Cut-off applied to ``sigmoid(logit)`` to obtain 0/1 predictions.
        device: Torch device; defaults to CUDA when available, else CPU.

    Returns:
        dict with the training histories, the test loss/accuracy/precision/
        recall/F1 and the final ``state_dict`` under "final model".

    Raises:
        ValueError: if either loader yields no samples.

    Notes:
        Losses are sample-weighted means and the test metrics are computed
        once over every test batch.
    """
    # Use GPU if available
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Loading best f1 model State from epoch {bestModelData['epoch']}")
    f1Model = model(nFeatures)
    f1Model.load_state_dict(bestModelData["stateDict"])
    # Model and pos_weight have to share the device the batches are moved to.
    f1Model.to(device)
    posWeight = torch.tensor(10, dtype=torch.float32).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=posWeight)
    optimizer = optim.Adam(f1Model.parameters() ,lr=learningRate, weight_decay=1e-4)

    train_losses = []
    train_accuracies = []
    planned_epochs = bestModelData['epoch']

    for epoch in range(planned_epochs):
        f1Model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, targets in trainVal_loader:
            optimizer.zero_grad()
            inputs = inputs.to(device)
            # reshape(-1) flattens (B, 1) to (B,) and stays 1-D for B == 1.
            targets = targets.to(device).reshape(-1)

            # Forward pass
            outputs = f1Model(inputs).reshape(-1)
            loss = criterion(outputs, targets)
            # Backward and optimize
            loss.backward()
            optimizer.step()

            batch_n = targets.size(0)
            running_loss += loss.item() * batch_n

            preds = (torch.sigmoid(outputs) >= treshold).float()
            correct += (preds == targets).sum().item()
            total += batch_n

        if total == 0:
            raise ValueError("trainVal_loader yielded no samples")

        train_loss = running_loss / total
        train_acc = correct / total
        # The denominator is the number of epochs that will actually run.
        print(f"Epoch [{epoch+1}/{planned_epochs}]".ljust(16),
              f"Train Loss: {train_loss:.6f} Acc: {train_acc:.6f} | ")

        train_losses.append(train_loss)
        train_accuracies.append(train_acc)

    f1Model.eval()
    test_loss_sum = 0.0
    test_total = 0
    seen_targets = []
    seen_preds = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device)
            targets = targets.to(device).reshape(-1)
            outputs = f1Model(inputs).reshape(-1)

            batch_n = targets.size(0)
            test_loss_sum += criterion(outputs, targets).item() * batch_n
            test_total += batch_n

            preds = (torch.sigmoid(outputs) >= treshold).float()
            # CPU copies: sklearn cannot consume CUDA tensors.
            seen_targets.append(targets.cpu())
            seen_preds.append(preds.cpu())

    if test_total == 0:
        raise ValueError("test_loader yielded no samples")

    test_loss = test_loss_sum / test_total
    y_true = torch.cat(seen_targets).numpy()
    y_pred = torch.cat(seen_preds).numpy()
    test_f1Score = f1_score(y_true, y_pred)
    test_precision = precision_score(y_true, y_pred)
    test_recall = recall_score(y_true, y_pred)
    test_accuracy = accuracy_score(y_true, y_pred)

    report_lines = [
        f"Loss: {test_loss:.6f}", 
        f"Accuracy: {test_accuracy:.2%}", 
        f"Precision: {test_precision:.2f}", 
        f"Recall: {test_recall:.2f}", 
        f"F1 Score: {test_f1Score:.2f}",
    ]
    wwidth = 30
    print("Test Results".center(16).center(wwidth,"="))
    for line in report_lines:
        print(line.ljust(16).center(wwidth-2).center(wwidth,"|"))

    plt.figure(figsize=(8, 5))
    plt.plot(train_losses, label='Train Loss', marker='^')
    plt.title("Training Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(8, 5))
    plt.plot(train_accuracies, label='Train Acc', marker='^')
    plt.title("Training Acc")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


    f1_Model = {
        "final training loss": train_losses,
        "final training acc": train_accuracies,
        "Test loss": test_loss,
        "Test acc": test_accuracy,
        "Test precision": test_precision,
        "Test recall": test_recall,
        "Test F1": test_f1Score,
        "final model": f1Model.state_dict(),
    }

    
    return f1_Model

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from Training_Helper_Functions import *
from Preprocessing_Functions2 import * 
# Data preparation: load the CSV, make stratified splits, then build two
# processed dataset pairs (training/validation and fullTrain/testing) and save
# each pair to disk.
# NOTE(review): Outlier_Removal, Synthetic_Data_Generator2, get_bmi, get_TCTG
# and apply_one_hot_encoding are not defined in this notebook; presumably they
# come from the star imports above. Their exact behaviour (including whether
# they modify their arguments in place) cannot be confirmed from this cell.
path = "../ADL2/DATA_21/training_set/training_data.csv"
data = pd.read_csv(path)

# data = cleanRawData()
# X / Y are the raw (unprocessed) features and target; they are not referenced
# again in this cell.
X , Y = data.drop(columns=["DR"]).astype(float), data[["DR"]].astype(float)

# Stratified on "DR": 10% of all rows go to testing, then 1/9 of the remaining
# 90% (i.e. another 10% of all rows) goes to validation.
trainVal, testing = train_test_split(data, test_size=0.1, random_state=42, stratify=data["DR"])
training, validation = train_test_split(trainVal, test_size=1/9, random_state=42, stratify=trainVal["DR"])

# trainVal_X, test_X, trainVal_Y, test_Y = train_test_split(X, Y, test_size=0.1, random_state=42, stratify=Y)
# train_X, val_X, train_Y, val_Y = train_test_split(trainVal_X, trainVal_Y, test_size=1/9, random_state=42, stratify=trainVal_Y)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# One scaler object is shared by both pipelines below: it is fitted on
# X_train_processed first and re-fitted on fullTrain_processed later, so after
# this cell it holds the statistics of the second fit only.
scaler = StandardScaler()
# Columns that get standardised.
cont_cols = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG', 'LDLC', 'HDLC', 
             'Scr', 'BUN', 'FPG', 'HbA1c', 'Height', 'Weight', 'BMI', 'Duration']
    # Use the original encoded single column name here
# cat_cols and y_col are defined here but not referenced again in this cell.
cat_cols = ['Gender', 'Community'] 
y_col = 'DR'

# Create the directory
# exist_ok=True: no error if it already exists (the message is printed either way).
directory = "vanData"
os.makedirs(directory, exist_ok=True)
print(f"directory {directory} created")
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# ---- Pipeline 1: training split, with validation as the held-out frame ----
#* OUTLIER DETECTION
# Both OD_* arguments are None; presumably that disables outlier removal —
# TODO confirm against the helper.
X_train_processed = Outlier_Removal(training, 
                                    OD_majority=None,
                                    OD_minority=None,
                                    )
# X_train_processed = apply_smotenc_oversampling(X_train_processed)
# #* OVERSAMPLING & SYNTHETIC DATA GENERATION
print("Before oversampling & synthetic data:", X_train_processed[["DR"]].value_counts())
# Long-running step: the recorded output below shows close to two hours for
# the 10000 synthesizer epochs.
X_train_processed = Synthetic_Data_Generator2(X_train_processed, "", synthesizer="TVAE", epochs=10000, batch_size=700, n_synthetic_data=10000)

print("After oversampling & synthetic data:", X_train_processed[["DR"]].value_counts())

#* Calculate BMI, TCTG & ENCODING
# Each helper takes and returns the (train, held-out) pair; both names are rebound.
X_train_processed, validation = get_bmi(X_train_processed, validation)
X_train_processed, validation = get_TCTG(X_train_processed, validation)
X_train_processed, validation = apply_one_hot_encoding(X_train_processed, validation)
#* Scaler
# Fit on the training frame only, then apply the same statistics to validation.
X_train_processed[cont_cols] = scaler.fit_transform(X_train_processed[cont_cols])
validation[cont_cols] = scaler.transform(validation[cont_cols])

print("Saving generated training and validation datasets")
# to_csv is called without index=False, so the row index is written as the first column.
X_train_processed.to_csv(f"{directory}/training.csv")
validation.to_csv(f"{directory}/validation.csv")
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# ---- Pipeline 2: train+validation split, with testing as the held-out frame ----
# #* OUTLIER DETECTION
fullTrain_processed = Outlier_Removal(trainVal, 
                                    OD_majority=None,
                                    OD_minority=None,
                                    )
# X_train_processed = apply_smotenc_oversampling(X_train_processed)
# #* OVERSAMPLING & SYNTHETIC DATA GENERATION
print("Before oversampling & synthetic data:", fullTrain_processed[["DR"]].value_counts())
# A second, independent synthesizer fit (same settings as above).
fullTrain_processed = Synthetic_Data_Generator2(fullTrain_processed, "", synthesizer="TVAE", epochs=10000, batch_size=700, n_synthetic_data=10000)

print("After oversampling & synthetic data:", fullTrain_processed[["DR"]].value_counts())

#* Calculate BMI, TCTG & ENCODING
fullTrain_processed, testing = get_bmi(fullTrain_processed, testing)
fullTrain_processed, testing = get_TCTG(fullTrain_processed, testing)
fullTrain_processed, testing = apply_one_hot_encoding(fullTrain_processed, testing)
#* Scaler
# Re-fits the shared scaler on the full training frame, then applies it to testing.
fullTrain_processed[cont_cols] = scaler.fit_transform(fullTrain_processed[cont_cols])
testing[cont_cols] = scaler.transform(testing[cont_cols])

print("Saving generated combinedTrain and test datasets")
fullTrain_processed.to_csv(f"{directory}/fullTrain.csv")
testing.to_csv(f"{directory}/testing.csv")
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


directory vanData created
Original class distribution: DR
0.0    4128
1.0     464
Name: count, dtype: int64
Before oversampling & synthetic data: DR 
0.0    4128
1.0     464
Name: count, dtype: int64
Fitting synthesizer...


Loss: -21.132: 100%|██████████| 10000/10000 [1:51:57<00:00,  1.49it/s] 


Generating synthetic samples per class based on distribution...


Sampling conditions: 100%|██████████| 8989/8989 [00:01<00:00, 6854.42it/s]
Sampling conditions: 100%|██████████| 1010/1010 [00:04<00:00, 248.72it/s]


Final synthetic class distribution:
DR
0.0    8989
1.0    1010
Name: count, dtype: int64
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:06<00:00,  3.25it/s]|
Column Shapes Score: 90.2%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:02<00:00, 84.53it/s]|
Column Pair Trends Score: 88.31%

Overall Score (Average): 89.26%

After oversampling & synthetic data: DR 
0.0    13117
1.0     1474
Name: count, dtype: int64
Saving generated training and validation datasets
Original class distribution: DR
0.0    4645
1.0     522
Name: count, dtype: int64
Before oversampling & synthetic data: DR 
0.0    4645
1.0     522
Name: count, dtype: int64
Fitting synthesizer...


Loss: -18.167:  69%|██████▉   | 6925/10000 [1:27:48<46:57,  1.09it/s]  

In [None]:

#? For Training and Cross Validation Loop
# Use the training-only frame here. fullTrain_processed is derived from
# trainVal, which still contains the validation split, so training on it would
# leak the validation rows into model selection. X_train_processed is also the
# frame whose scaler fit was applied to `validation`.
train_X_n, train_Y = X_train_processed.drop(columns=['DR']), X_train_processed[['DR']]
train_dataset = TabularDataset(train_X_n, train_Y)

val_X_n , val_Y = validation.drop(columns=['DR']), validation[['DR']]
val_dataset = TabularDataset(val_X_n, val_Y)

#? For Final Training and Testing Loop
trainVal_X_n , trainVal_Y = fullTrain_processed.drop(columns=['DR']), fullTrain_processed[['DR']]
trainVal_dataset = TabularDataset(trainVal_X_n, trainVal_Y)

test_X_n , test_Y = testing.drop(columns=['DR']), testing[['DR']]
test_dataset = TabularDataset(test_X_n, test_Y)

# Take the input width from the processed feature matrix that is actually fed
# to the network; the raw frame may have a different column count once the
# derived and one-hot encoded columns are in place.
nInputs = train_X_n.shape[1]
# Every split must expose the same number of features as the model input.
assert val_X_n.shape[1] == nInputs, "validation feature count differs from training"
assert trainVal_X_n.shape[1] == nInputs, "full-train feature count differs from training"
assert test_X_n.shape[1] == nInputs, "test feature count differs from training"

batchSize = 700
epoch = 1000
learningRate = 0.000001
treshold = 0.65

# Training loaders shuffle and drop the last partial batch; the evaluation
# loaders deliver the whole split as one batch.
train_loader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False, drop_last=True)
trainVal_loader = DataLoader(trainVal_dataset, batch_size=batchSize, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False, drop_last=True)


In [None]:
# Stage 1: train with per-epoch validation. The positional arguments after the
# loaders are, in order: epochs, learningRate, treshold.
# NOTE(review): assumes train_model / Ivan_NN are the definitions from the first
# cell and are not shadowed by the later star imports — TODO confirm.
trainingData = train_model(Ivan_NN, nInputs, train_loader, val_loader, epoch, learningRate, treshold)

In [None]:
# Stage 2a: start from the checkpoint stored under "best initial loss model",
# train on the train+validation loader and evaluate on the test loader.
lossModelData = train_loss_model(Ivan_NN, nInputs,trainingData["best initial loss model"], trainVal_loader, test_loader, epoch, learningRate=learningRate, treshold=treshold)

In [None]:
# Stage 2b: same as the previous stage, but starting from the checkpoint stored
# under "best initial f1 model".
f1ModelData = train_f1_model(Ivan_NN, nInputs,trainingData["best initial f1 model"], trainVal_loader, test_loader, epoch, learningRate=learningRate, treshold=treshold)