# MLP

## Load Libraries

In [None]:
# ---------------------------------------
# Load Required Libraries
# ---------------------------------------

# Notebook shell escape: installs the distribution expected to provide the
# `Bio.Align` module imported in the next cell.
# NOTE(review): Biopython is normally published on PyPI as `biopython` —
# confirm that `Bio` is the intended distribution name.

!pip install Bio

In [None]:
import ast
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import tensorflow as tf
from Bio.Align import substitution_matrices
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, classification_report,
    precision_recall_curve, auc,
)
from sklearn.svm import SVC

## BLOSUM

In [None]:
# Load the sequence table and the BLOSUM62 substitution matrix.
# NOTE(review): `seq_data` is not referenced again in the visible part of this
# file (the same CSV is re-read into `df` further down) — confirm it is needed.
seq_data = pd.read_csv('/data/seq_data.csv')
blosum62 = substitution_matrices.load("BLOSUM62")

In [None]:
# MLP for the 20-dimensional BLOSUM features: 20 -> 20 -> 16 -> 1,
# with dropout after each hidden layer.
input_dim = 20
drop_out = 0.2

NN = tf.keras.models.Sequential([
    # Only the first layer declares the input size; a Sequential model takes
    # the input size of every later layer from the previous layer's output.
    tf.keras.layers.Dense(20, input_dim=input_dim, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    # Single sigmoid unit: probability of the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
NN.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy", "mae", "mse"],
)

In [None]:
# === Load manual folds ===
def load_folds(folds_dir, n_folds=5):
    """Load the pre-computed train/test ID splits used for cross-validation.

    Reads ``fold_<k>_train_ids.csv`` and ``fold_<k>_test_ids.csv`` from
    ``folds_dir`` for ``k = 1 .. n_folds``.

    Args:
        folds_dir: Directory containing the per-fold CSV files.
        n_folds: Number of folds to load. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A list with one ``(train_ids, test_ids)`` tuple per fold; each element
        is the first column of the corresponding CSV as a pandas Series.
    """
    folds = []
    for fold in range(1, n_folds + 1):
        train_path = os.path.join(folds_dir, f'fold_{fold}_train_ids.csv')
        test_path = os.path.join(folds_dir, f'fold_{fold}_test_ids.csv')
        # Only the first column is kept as the list of IDs.
        train_ids = pd.read_csv(train_path).iloc[:, 0]
        test_ids = pd.read_csv(test_path).iloc[:, 0]
        folds.append((train_ids, test_ids))
    return folds

# === BLOSUM62 feature extraction (20D mean pooling, standard AA) ===
# Restrict BLOSUM62 to the 20 standard amino acids and z-score the result.
blosum62 = substitution_matrices.load("BLOSUM62")
aminos20 = list("ACDEFGHIKLMNPQRSTVWY")
aa_to_idx_20 = dict(zip(aminos20, range(len(aminos20))))
# Entry [i, j] is the BLOSUM62 score between residue i and residue j.
blosum_matrix_20 = np.array(
    [[blosum62[row_aa, col_aa] for col_aa in aminos20] for row_aa in aminos20],
    dtype=float,
)
# Normalize with the global mean and standard deviation of the 20x20 block
mean = blosum_matrix_20.mean()
std = blosum_matrix_20.std()
blosum_matrix_20 = (blosum_matrix_20 - mean) / std

def seq_to_blosum_vector(seq):
    """Encode a sequence as the mean of its per-residue BLOSUM rows.

    Each residue is upper-cased and mapped to its row of ``blosum_matrix_20``;
    residues not present in ``aa_to_idx_20`` contribute a zero vector. An
    empty sequence yields a zero vector of length 20.
    """
    unknown = np.zeros(20)
    rows = [
        blosum_matrix_20[aa_to_idx_20[aa]] if aa in aa_to_idx_20 else unknown
        for aa in seq.upper()
    ]
    if not rows:
        return np.zeros(20)
    return np.mean(rows, axis=0)

def store_initial_weights(model):
    """Return the model's current weights as given by ``model.get_weights()``."""
    return model.get_weights()

def reset_weights(model, initial_weights):
    """Overwrite the model's weights with ``initial_weights``.

    Args:
        model: Object exposing ``set_weights``.
        initial_weights: Weights to restore, e.g. the value returned by
            ``store_initial_weights``.
    """
    # Reset the weights to the stored initial weights
    model.set_weights(initial_weights)

# Snapshot the current weights of NN so each fold can be restarted from them.
initial_weights = store_initial_weights(NN)


data_path = "/data/seq_data.csv"
# NOTE(review): the other embedding sections in this file load their folds
# from "5_folds_IId" — confirm which split this section should use.
folds_dir = "/data/folds/5_folds"
#folds_dir = "/data/folds/5_folds_IId"
df = pd.read_csv(data_path)
folds = load_folds(folds_dir)

# One feature vector per sequence, computed from the 'seq' column.
df['blosum'] = df['seq'].apply(seq_to_blosum_vector)

In [None]:
# Vectors stored as strings are parsed back into Python objects; values that
# are already in memory (non-strings) pass through untouched.
df["blosum"] = df["blosum"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# Expand the vector column into 20 scalar columns. Reusing df's index keeps
# the rows aligned in the concat below even when df does not carry a default
# 0..n-1 index.
blosum_cols = pd.DataFrame(
    df["blosum"].tolist(),
    index=df.index,
    columns=[f"blosum_{i+1}" for i in range(20)],
)
new_df = pd.concat([df.drop(columns=["blosum"]), blosum_cols], axis=1)

In [None]:
# Per-fold metric accumulators
accuracy_list, precision_list, recall_list = [], [], []
f1score_list, auc_list, avg_precision_list, auprc_list = [], [], [], []

# Features: every column except the identifier, the target and the raw sequence
X = new_df.drop(columns=['id', 'label','seq'])
y = new_df['label']

# Balancing applied to the training split only:
#   'over'        -> RandomOverSampler
#   'undersample' -> RandomUnderSampler
#   anything else -> no resampling
balancing = 'over'
# balancing = 'undersample'

# Probability cut-off used for the hard-label metrics
threshold = 0.5

for fold_num, (train_ids, test_ids) in enumerate(folds, 1):

    # Restart from the stored initial weights so folds do not share learned weights.
    # NOTE(review): only the layer weights are restored; the compiled optimizer
    # object is reused across folds — confirm its state may carry over.
    reset_weights(NN, initial_weights)
    print(train_ids.shape)
    print(test_ids.shape)

    # Split: compute each membership mask once and reuse it for X and y
    train_mask = new_df['id'].isin(train_ids)
    test_mask = new_df['id'].isin(test_ids)
    x_train, y_train = X[train_mask], y[train_mask]
    x_test, y_test = X[test_mask], y[test_mask]

    # --- Data Balancing on Train only ---
    if balancing == 'over':
        oversampler = RandomOverSampler(random_state=42)
        x_train_bal, y_train_bal = oversampler.fit_resample(x_train, y_train)
    elif balancing == 'undersample':
        rus = RandomUnderSampler(random_state=42)
        x_train_bal, y_train_bal = rus.fit_resample(x_train, y_train)
    else:
        x_train_bal, y_train_bal = x_train, y_train

    # --- Train on balanced data ---
    NN.fit(x_train_bal, y_train_bal, epochs=50, batch_size=32)
    # Flatten the (n, 1) model output so every metric receives 1-D scores
    pred = NN.predict(x_test).ravel()

    # Evaluate
    binary_predictions = (pred >= threshold).astype(int)
    accuracy = accuracy_score(y_test, binary_predictions)
    precision = precision_score(y_test, binary_predictions)
    recall = recall_score(y_test, binary_predictions)
    f1 = f1_score(y_test, binary_predictions)
    auc_score = roc_auc_score(y_test, pred)
    # Area under the precision-recall curve (reported as AUPRC); average
    # precision is stored as well but not printed.
    precision_values, recall_values, _ = precision_recall_curve(y_test, pred)
    avg_precision = average_precision_score(y_test, pred)
    auprc = auc(recall_values, precision_values)

    # Store
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1score_list.append(f1)
    auc_list.append(auc_score)
    avg_precision_list.append(avg_precision)
    auprc_list.append(auprc)

    print(f"Fold {fold_num} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, "
          f"AUC: {auc_score:.4f}, AUPRC: {auprc:.4f}")

def print_mean_std(metric_list, metric_name):
    """Print ``metric_name`` followed by the mean ± std of ``metric_list``.

    The name is left-aligned in a 15-character field and both statistics are
    printed with four decimals.
    """
    avg = np.mean(metric_list)
    spread = np.std(metric_list)
    print(f"{metric_name:<15}: {avg:.4f} ± {spread:.4f}")

# Report mean ± std over the folds for every printed metric.
print("\nAverage metrics across all folds (mean ± std):")
for metric_name, metric_values in (
    ("Accuracy", accuracy_list),
    ("Precision", precision_list),
    ("Recall", recall_list),
    ("F1 Score", f1score_list),
    ("AUC", auc_list),
    ("AUPRC", auprc_list),
):
    print_mean_std(metric_values, metric_name)

## Dipeptide

In [None]:
# Read the dipeptide embeddings CSV into a DataFrame.
df_features = pd.read_csv('/data/embeddings/Dipeptide_embeddings.csv')

In [None]:
# Name the first column "id"; the fold-splitting code below selects rows by it.
df_features.rename(columns={df_features.columns[0]: "id"}, inplace=True)
# Replace the index with a fresh 0..n-1 range, discarding the old one.
new_df = df_features.reset_index(drop=True)

In [None]:
# MLP for the 400-dimensional dipeptide features: 400 -> 400 -> 128 -> 16 -> 1,
# with dropout after each hidden layer.
input_dim = 400
drop_out = 0.2

NN = tf.keras.models.Sequential([
    # Only the first layer declares the input size; a Sequential model takes
    # the input size of every later layer from the previous layer's output
    # (the 16-unit layer, for instance, is fed 128 values, not `input_dim`).
    tf.keras.layers.Dense(400, input_dim=input_dim, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    # Single sigmoid unit: probability of the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
NN.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy", "mae", "mse"],
)

In [None]:
def load_folds(folds_dir, n_folds=5):
    """Load the pre-computed train/test ID splits used for cross-validation.

    Reads ``fold_<k>_train_ids.csv`` and ``fold_<k>_test_ids.csv`` from
    ``folds_dir`` for ``k = 1 .. n_folds``.

    Args:
        folds_dir: Directory containing the per-fold CSV files.
        n_folds: Number of folds to load. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A list with one ``(train_ids, test_ids)`` tuple per fold; each element
        is the DataFrame returned by ``pd.read_csv`` for that file.
    """
    folds = []
    for fold in range(1, n_folds + 1):
        # CSV files holding the train / test IDs of this fold
        train_path = os.path.join(folds_dir, f'fold_{fold}_train_ids.csv')
        test_path = os.path.join(folds_dir, f'fold_{fold}_test_ids.csv')

        # Kept as DataFrames; callers pick the ID column themselves.
        train_ids = pd.read_csv(train_path)
        test_ids = pd.read_csv(test_path)
        folds.append((train_ids, test_ids))

    return folds

def store_initial_weights(model):
    """Return the model's current weights as given by ``model.get_weights()``."""
    return model.get_weights()

def reset_weights(model, initial_weights):
    """Overwrite the model's weights with ``initial_weights``.

    Args:
        model: Object exposing ``set_weights``.
        initial_weights: Weights to restore, e.g. the value returned by
            ``store_initial_weights``.
    """
    # Reset the weights to the stored initial weights
    model.set_weights(initial_weights)

# Snapshot the current weights of NN so each fold can be restarted from them.
initial_weights = store_initial_weights(NN)
# Active fold directory; the commented-out line is the alternative split.
folds_dir = "/data/folds/5_folds_IId"
#folds_dir = "/data/folds/5_folds"
folds = load_folds(folds_dir)

In [None]:
# Per-fold metric accumulators
accuracy_list, precision_list, recall_list = [], [], []
f1score_list, auc_list, avg_precision_list, auprc_list = [], [], [], []

# Features: every column except the identifier and the target
X = new_df.drop(columns=['id', 'label'])
y = new_df['label']

# Balancing applied to the training split only:
#   'over'        -> RandomOverSampler
#   'undersample' -> RandomUnderSampler
#   anything else -> no resampling
balancing = 'over'

# Probability cut-off used for the hard-label metrics
threshold = 0.5

for fold_num, (train_ids, test_ids) in enumerate(folds, 1):

    # Restart from the stored initial weights so folds do not share learned weights.
    # NOTE(review): only the layer weights are restored; the compiled optimizer
    # object is reused across folds — confirm its state may carry over.
    reset_weights(NN, initial_weights)
    print(train_ids.shape)
    print(test_ids.shape)

    # Split: the IDs live in the first column of each fold frame; compute each
    # membership mask once and reuse it for X and y
    train_mask = new_df['id'].isin(train_ids.iloc[:, 0])
    test_mask = new_df['id'].isin(test_ids.iloc[:, 0])
    x_train, y_train = X[train_mask], y[train_mask]
    x_test, y_test = X[test_mask], y[test_mask]

    # --- Data Balancing on Train only ---
    if balancing == 'over':
        oversampler = RandomOverSampler(random_state=42)
        x_train_bal, y_train_bal = oversampler.fit_resample(x_train, y_train)
    elif balancing == 'undersample':
        rus = RandomUnderSampler(random_state=42)
        x_train_bal, y_train_bal = rus.fit_resample(x_train, y_train)
    else:
        x_train_bal, y_train_bal = x_train, y_train

    # --- Train on balanced data ---
    NN.fit(x_train_bal, y_train_bal, epochs=50, batch_size=32)
    # Flatten the (n, 1) model output so every metric receives 1-D scores
    pred = NN.predict(x_test).ravel()

    # Evaluate
    binary_predictions = (pred >= threshold).astype(int)
    accuracy = accuracy_score(y_test, binary_predictions)
    precision = precision_score(y_test, binary_predictions)
    recall = recall_score(y_test, binary_predictions)
    f1 = f1_score(y_test, binary_predictions)
    auc_score = roc_auc_score(y_test, pred)
    # Area under the precision-recall curve (reported as AUPRC); average
    # precision is stored as well but not printed.
    precision_values, recall_values, _ = precision_recall_curve(y_test, pred)
    avg_precision = average_precision_score(y_test, pred)
    auprc = auc(recall_values, precision_values)

    # Store
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1score_list.append(f1)
    auc_list.append(auc_score)
    avg_precision_list.append(avg_precision)
    auprc_list.append(auprc)

    print(f"Fold {fold_num} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, "
          f"AUC: {auc_score:.4f}, AUPRC: {auprc:.4f}")

def print_mean_std(metric_list, metric_name):
    """Print ``metric_name`` followed by the mean ± std of ``metric_list``.

    The name is left-aligned in a 15-character field and both statistics are
    printed with four decimals.
    """
    avg = np.mean(metric_list)
    spread = np.std(metric_list)
    print(f"{metric_name:<15}: {avg:.4f} ± {spread:.4f}")

# Report mean ± std over the folds for every printed metric.
print("\nAverage metrics across all folds (mean ± std):")
for metric_name, metric_values in (
    ("Accuracy", accuracy_list),
    ("Precision", precision_list),
    ("Recall", recall_list),
    ("F1 Score", f1score_list),
    ("AUC", auc_list),
    ("AUPRC", auprc_list),
):
    print_mean_std(metric_values, metric_name)

## ProtBert

In [None]:
import pandas as pd
# NOTE(review): the file name reads "PotBert" — confirm it is not a typo for "ProtBert".
new_df = pd.read_csv('/data/embeddings/PotBert_embeddings.csv')
# Keep only the first 4819 rows.
# NOTE(review): 4819 is a hard-coded row count — presumably the number of
# labelled sequences; confirm against the data.
new_df = new_df.iloc[:4819,:]
# Name the first column "id"; the fold-splitting code below selects rows by it.
new_df.rename(columns={new_df.columns[0]: "id"}, inplace=True)

In [None]:
# MLP for the 1024-dimensional embeddings: 1024 -> 1024 -> 128 -> 16 -> 1,
# with dropout after each hidden layer.
input_dim = 1024
drop_out = 0.2

NN = tf.keras.models.Sequential([
    # Only the first layer declares the input size; a Sequential model takes
    # the input size of every later layer from the previous layer's output.
    tf.keras.layers.Dense(1024, input_dim=input_dim, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    # Single sigmoid unit: probability of the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
NN.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy", "mae", "mse"],
)

In [None]:
def load_folds(folds_dir, n_folds=5):
    """Load the pre-computed train/test ID splits used for cross-validation.

    Reads ``fold_<k>_train_ids.csv`` and ``fold_<k>_test_ids.csv`` from
    ``folds_dir`` for ``k = 1 .. n_folds``.

    Args:
        folds_dir: Directory containing the per-fold CSV files.
        n_folds: Number of folds to load. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A list with one ``(train_ids, test_ids)`` tuple per fold; each element
        is the DataFrame returned by ``pd.read_csv`` for that file.
    """
    folds = []
    for fold in range(1, n_folds + 1):
        # CSV files holding the train / test IDs of this fold
        train_path = os.path.join(folds_dir, f'fold_{fold}_train_ids.csv')
        test_path = os.path.join(folds_dir, f'fold_{fold}_test_ids.csv')

        # Kept as DataFrames; callers pick the ID column themselves.
        train_ids = pd.read_csv(train_path)
        test_ids = pd.read_csv(test_path)
        folds.append((train_ids, test_ids))

    return folds

def store_initial_weights(model):
    """Return the model's current weights as given by ``model.get_weights()``."""
    return model.get_weights()

def reset_weights(model, initial_weights):
    """Overwrite the model's weights with ``initial_weights``.

    Args:
        model: Object exposing ``set_weights``.
        initial_weights: Weights to restore, e.g. the value returned by
            ``store_initial_weights``.
    """
    # Reset the weights to the stored initial weights
    model.set_weights(initial_weights)

# Snapshot the current weights of NN so each fold can be restarted from them.
initial_weights = store_initial_weights(NN)
# Active fold directory; the commented-out line is the alternative split.
folds_dir = "/data/folds/5_folds_IId"
#folds_dir = "/data/folds/5_folds"
folds = load_folds(folds_dir)

In [None]:
# Per-fold metric accumulators
accuracy_list, precision_list, recall_list = [], [], []
f1score_list, auc_list, avg_precision_list, auprc_list = [], [], [], []

# Features: every column except the identifier and the target
X = new_df.drop(columns=['id', 'label'])
y = new_df['label']

# Balancing applied to the training split only:
#   'over'        -> RandomOverSampler
#   'undersample' -> RandomUnderSampler
#   anything else -> no resampling
balancing = 'over'
# balancing = 'undersample'

# Probability cut-off used for the hard-label metrics
threshold = 0.5

for fold_num, (train_ids, test_ids) in enumerate(folds, 1):

    # Restart from the stored initial weights so folds do not share learned weights.
    # NOTE(review): only the layer weights are restored; the compiled optimizer
    # object is reused across folds — confirm its state may carry over.
    reset_weights(NN, initial_weights)
    print(train_ids.shape)
    print(test_ids.shape)

    # Split: the IDs live in the first column of each fold frame; compute each
    # membership mask once and reuse it for X and y
    train_mask = new_df['id'].isin(train_ids.iloc[:, 0])
    test_mask = new_df['id'].isin(test_ids.iloc[:, 0])
    x_train, y_train = X[train_mask], y[train_mask]
    x_test, y_test = X[test_mask], y[test_mask]

    # --- Data Balancing on Train only ---
    if balancing == 'over':
        oversampler = RandomOverSampler(random_state=42)
        x_train_bal, y_train_bal = oversampler.fit_resample(x_train, y_train)
    elif balancing == 'undersample':
        rus = RandomUnderSampler(random_state=42)
        x_train_bal, y_train_bal = rus.fit_resample(x_train, y_train)
    else:
        x_train_bal, y_train_bal = x_train, y_train

    # --- Train on balanced data ---
    NN.fit(x_train_bal, y_train_bal, epochs=50, batch_size=32)
    # Flatten the (n, 1) model output so every metric receives 1-D scores
    pred = NN.predict(x_test).ravel()

    # Evaluate
    binary_predictions = (pred >= threshold).astype(int)
    accuracy = accuracy_score(y_test, binary_predictions)
    precision = precision_score(y_test, binary_predictions)
    recall = recall_score(y_test, binary_predictions)
    f1 = f1_score(y_test, binary_predictions)
    auc_score = roc_auc_score(y_test, pred)
    # Area under the precision-recall curve (reported as AUPRC); average
    # precision is stored as well but not printed.
    precision_values, recall_values, _ = precision_recall_curve(y_test, pred)
    avg_precision = average_precision_score(y_test, pred)
    auprc = auc(recall_values, precision_values)

    # Store
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1score_list.append(f1)
    auc_list.append(auc_score)
    avg_precision_list.append(avg_precision)
    auprc_list.append(auprc)

    print(f"Fold {fold_num} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, "
          f"AUC: {auc_score:.4f}, AUPRC: {auprc:.4f}")

def print_mean_std(metric_list, metric_name):
    """Print ``metric_name`` followed by the mean ± std of ``metric_list``.

    The name is left-aligned in a 15-character field and both statistics are
    printed with four decimals.
    """
    avg = np.mean(metric_list)
    spread = np.std(metric_list)
    print(f"{metric_name:<15}: {avg:.4f} ± {spread:.4f}")

# Report mean ± std over the folds for every printed metric.
print("\nAverage metrics across all folds (mean ± std):")
for metric_name, metric_values in (
    ("Accuracy", accuracy_list),
    ("Precision", precision_list),
    ("Recall", recall_list),
    ("F1 Score", f1score_list),
    ("AUC", auc_list),
    ("AUPRC", auprc_list),
):
    print_mean_std(metric_values, metric_name)

## ProtT5

In [None]:
import pandas as pd
# Read the ProtT5 embeddings CSV into a DataFrame.
new_df = pd.read_csv('/data/embeddings/ProtT5_embeddings.csv')
# Keep only the first 4819 rows.
# NOTE(review): 4819 is a hard-coded row count — presumably the number of
# labelled sequences; confirm against the data.
new_df = new_df.iloc[:4819,:]
# Name the first column "id"; the fold-splitting code below selects rows by it.
new_df.rename(columns={new_df.columns[0]: "id"}, inplace=True)

In [None]:
# MLP for the 1024-dimensional embeddings: 1024 -> 1024 -> 128 -> 16 -> 1,
# with dropout after each hidden layer.
input_dim = 1024
drop_out = 0.2

NN = tf.keras.models.Sequential([
    # Only the first layer declares the input size; a Sequential model takes
    # the input size of every later layer from the previous layer's output.
    tf.keras.layers.Dense(1024, input_dim=input_dim, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    # Single sigmoid unit: probability of the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
NN.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy", "mae", "mse"],
)

In [None]:
def load_folds(folds_dir, n_folds=5):
    """Load the pre-computed train/test ID splits used for cross-validation.

    Reads ``fold_<k>_train_ids.csv`` and ``fold_<k>_test_ids.csv`` from
    ``folds_dir`` for ``k = 1 .. n_folds``.

    Args:
        folds_dir: Directory containing the per-fold CSV files.
        n_folds: Number of folds to load. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A list with one ``(train_ids, test_ids)`` tuple per fold; each element
        is the DataFrame returned by ``pd.read_csv`` for that file.
    """
    folds = []
    for fold in range(1, n_folds + 1):
        # CSV files holding the train / test IDs of this fold
        train_path = os.path.join(folds_dir, f'fold_{fold}_train_ids.csv')
        test_path = os.path.join(folds_dir, f'fold_{fold}_test_ids.csv')

        # Kept as DataFrames; callers pick the ID column themselves.
        train_ids = pd.read_csv(train_path)
        test_ids = pd.read_csv(test_path)
        folds.append((train_ids, test_ids))

    return folds

def store_initial_weights(model):
    """Return the model's current weights as given by ``model.get_weights()``."""
    return model.get_weights()

def reset_weights(model, initial_weights):
    """Overwrite the model's weights with ``initial_weights``.

    Args:
        model: Object exposing ``set_weights``.
        initial_weights: Weights to restore, e.g. the value returned by
            ``store_initial_weights``.
    """
    # Reset the weights to the stored initial weights
    model.set_weights(initial_weights)

# Snapshot the current weights of NN so each fold can be restarted from them.
initial_weights = store_initial_weights(NN)
# Active fold directory; the commented-out line is the alternative split.
folds_dir = "/data/folds/5_folds_IId"
#folds_dir = "/data/folds/5_folds"
folds = load_folds(folds_dir)

In [None]:
# Per-fold metric accumulators
accuracy_list, precision_list, recall_list = [], [], []
f1score_list, auc_list, avg_precision_list, auprc_list = [], [], [], []

# Features: every column except the identifier and the target
X = new_df.drop(columns=['id', 'label'])
y = new_df['label']

# Balancing applied to the training split only:
#   'over'        -> RandomOverSampler
#   'undersample' -> RandomUnderSampler
#   anything else -> no resampling
balancing = 'over'
# balancing = 'undersample'

# Probability cut-off used for the hard-label metrics
threshold = 0.5

for fold_num, (train_ids, test_ids) in enumerate(folds, 1):

    # Restart from the stored initial weights so folds do not share learned weights.
    # NOTE(review): only the layer weights are restored; the compiled optimizer
    # object is reused across folds — confirm its state may carry over.
    reset_weights(NN, initial_weights)
    print(train_ids.shape)
    print(test_ids.shape)

    # Split: the IDs live in the first column of each fold frame; compute each
    # membership mask once and reuse it for X and y
    train_mask = new_df['id'].isin(train_ids.iloc[:, 0])
    test_mask = new_df['id'].isin(test_ids.iloc[:, 0])
    x_train, y_train = X[train_mask], y[train_mask]
    x_test, y_test = X[test_mask], y[test_mask]

    # --- Data Balancing on Train only ---
    if balancing == 'over':
        oversampler = RandomOverSampler(random_state=42)
        x_train_bal, y_train_bal = oversampler.fit_resample(x_train, y_train)
    elif balancing == 'undersample':
        rus = RandomUnderSampler(random_state=42)
        x_train_bal, y_train_bal = rus.fit_resample(x_train, y_train)
    else:
        x_train_bal, y_train_bal = x_train, y_train

    # --- Train on balanced data ---
    NN.fit(x_train_bal, y_train_bal, epochs=50, batch_size=32)
    # Flatten the (n, 1) model output so every metric receives 1-D scores
    pred = NN.predict(x_test).ravel()

    # Evaluate
    binary_predictions = (pred >= threshold).astype(int)
    accuracy = accuracy_score(y_test, binary_predictions)
    precision = precision_score(y_test, binary_predictions)
    recall = recall_score(y_test, binary_predictions)
    f1 = f1_score(y_test, binary_predictions)
    auc_score = roc_auc_score(y_test, pred)
    # Area under the precision-recall curve (reported as AUPRC); average
    # precision is stored as well but not printed.
    precision_values, recall_values, _ = precision_recall_curve(y_test, pred)
    avg_precision = average_precision_score(y_test, pred)
    auprc = auc(recall_values, precision_values)

    # Store
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1score_list.append(f1)
    auc_list.append(auc_score)
    avg_precision_list.append(avg_precision)
    auprc_list.append(auprc)

    print(f"Fold {fold_num} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, "
          f"AUC: {auc_score:.4f}, AUPRC: {auprc:.4f}")

def print_mean_std(metric_list, metric_name):
    """Print ``metric_name`` followed by the mean ± std of ``metric_list``.

    The name is left-aligned in a 15-character field and both statistics are
    printed with four decimals.
    """
    avg = np.mean(metric_list)
    spread = np.std(metric_list)
    print(f"{metric_name:<15}: {avg:.4f} ± {spread:.4f}")

# Report mean ± std over the folds for every printed metric.
print("\nAverage metrics across all folds (mean ± std):")
for metric_name, metric_values in (
    ("Accuracy", accuracy_list),
    ("Precision", precision_list),
    ("Recall", recall_list),
    ("F1 Score", f1score_list),
    ("AUC", auc_list),
    ("AUPRC", auprc_list),
):
    print_mean_std(metric_values, metric_name)

## ESM

In [None]:
import pandas as pd
# Read the ESM embeddings CSV into a DataFrame.
new_df = pd.read_csv('/data/embeddings/esm_embeddings.csv')
# Keep only the first 4819 rows.
# NOTE(review): 4819 is a hard-coded row count — presumably the number of
# labelled sequences; confirm against the data.
new_df = new_df.iloc[:4819,:]
# Name the first column "id"; the fold-splitting code below selects rows by it.
new_df.rename(columns={new_df.columns[0]: "id"}, inplace=True)

In [None]:
# MLP for the 320-dimensional embeddings: 320 -> 320 -> 128 -> 16 -> 1,
# with dropout after each hidden layer.
input_dim = 320
drop_out = 0.2

NN = tf.keras.models.Sequential([
    # Only the first layer declares the input size; a Sequential model takes
    # the input size of every later layer from the previous layer's output
    # (the later hidden layers are not fed `input_dim` values).
    tf.keras.layers.Dense(320, input_dim=input_dim, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    # Single sigmoid unit: probability of the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
NN.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy", "mae", "mse"],
)

In [None]:
import os
import numpy as np
import torch

def load_folds(folds_dir, n_folds=5):
    """Load the pre-computed train/test ID splits used for cross-validation.

    Reads ``fold_<k>_train_ids.csv`` and ``fold_<k>_test_ids.csv`` from
    ``folds_dir`` for ``k = 1 .. n_folds``.

    Args:
        folds_dir: Directory containing the per-fold CSV files.
        n_folds: Number of folds to load. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A list with one ``(train_ids, test_ids)`` tuple per fold; each element
        is the DataFrame returned by ``pd.read_csv`` for that file.
    """
    folds = []
    for fold in range(1, n_folds + 1):
        # CSV files holding the train / test IDs of this fold
        train_path = os.path.join(folds_dir, f'fold_{fold}_train_ids.csv')
        test_path = os.path.join(folds_dir, f'fold_{fold}_test_ids.csv')

        # Kept as DataFrames; callers pick the ID column themselves.
        train_ids = pd.read_csv(train_path)
        test_ids = pd.read_csv(test_path)
        folds.append((train_ids, test_ids))

    return folds

def store_initial_weights(model):
    """Return the model's current weights as given by ``model.get_weights()``."""
    return model.get_weights()

def reset_weights(model, initial_weights):
    """Overwrite the model's weights with ``initial_weights``.

    Args:
        model: Object exposing ``set_weights``.
        initial_weights: Weights to restore, e.g. the value returned by
            ``store_initial_weights``.
    """
    # Reset the weights to the stored initial weights
    model.set_weights(initial_weights)

# Snapshot the current weights of NN so each fold can be restarted from them.
initial_weights = store_initial_weights(NN)
# Active fold directory; the commented-out line is the alternative split.
folds_dir = "/data/folds/5_folds_IId"
#folds_dir = "/data/folds/5_folds"
folds = load_folds(folds_dir)

In [None]:
# Per-fold metric accumulators
accuracy_list, precision_list, recall_list = [], [], []
f1score_list, auc_list, avg_precision_list, auprc_list = [], [], [], []

# Features: every column except the identifier and the target
X = new_df.drop(columns=['id', 'label'])
y = new_df['label']

# Balancing applied to the training split only:
#   'over'        -> RandomOverSampler
#   'undersample' -> RandomUnderSampler
#   anything else -> no resampling
balancing = 'over'

# Probability cut-off used for the hard-label metrics
threshold = 0.5

for fold_num, (train_ids, test_ids) in enumerate(folds, 1):

    # Restart from the stored initial weights so folds do not share learned weights.
    # NOTE(review): only the layer weights are restored; the compiled optimizer
    # object is reused across folds — confirm its state may carry over.
    reset_weights(NN, initial_weights)
    print(train_ids.shape)
    print(test_ids.shape)

    # Split: the IDs live in the first column of each fold frame; compute each
    # membership mask once and reuse it for X and y
    train_mask = new_df['id'].isin(train_ids.iloc[:, 0])
    test_mask = new_df['id'].isin(test_ids.iloc[:, 0])
    x_train, y_train = X[train_mask], y[train_mask]
    x_test, y_test = X[test_mask], y[test_mask]

    # --- Data Balancing on Train only ---
    if balancing == 'over':
        oversampler = RandomOverSampler(random_state=42)
        x_train_bal, y_train_bal = oversampler.fit_resample(x_train, y_train)
    elif balancing == 'undersample':
        rus = RandomUnderSampler(random_state=42)
        x_train_bal, y_train_bal = rus.fit_resample(x_train, y_train)
    else:
        x_train_bal, y_train_bal = x_train, y_train

    # --- Train on balanced data ---
    NN.fit(x_train_bal, y_train_bal, epochs=50, batch_size=32)
    # Flatten the (n, 1) model output so every metric receives 1-D scores
    pred = NN.predict(x_test).ravel()

    # Evaluate
    binary_predictions = (pred >= threshold).astype(int)
    accuracy = accuracy_score(y_test, binary_predictions)
    precision = precision_score(y_test, binary_predictions)
    recall = recall_score(y_test, binary_predictions)
    f1 = f1_score(y_test, binary_predictions)
    auc_score = roc_auc_score(y_test, pred)
    # Area under the precision-recall curve (reported as AUPRC); average
    # precision is stored as well but not printed.
    precision_values, recall_values, _ = precision_recall_curve(y_test, pred)
    avg_precision = average_precision_score(y_test, pred)
    auprc = auc(recall_values, precision_values)

    # Store
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1score_list.append(f1)
    auc_list.append(auc_score)
    avg_precision_list.append(avg_precision)
    auprc_list.append(auprc)

    print(f"Fold {fold_num} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, "
          f"AUC: {auc_score:.4f}, AUPRC: {auprc:.4f}")

def print_mean_std(metric_list, metric_name):
    """Print ``metric_name`` followed by the mean ± std of ``metric_list``.

    The name is left-aligned in a 15-character field and both statistics are
    printed with four decimals.
    """
    avg = np.mean(metric_list)
    spread = np.std(metric_list)
    print(f"{metric_name:<15}: {avg:.4f} ± {spread:.4f}")

# Report mean ± std over the folds for every printed metric.
print("\nAverage metrics across all folds (mean ± std):")
for metric_name, metric_values in (
    ("Accuracy", accuracy_list),
    ("Precision", precision_list),
    ("Recall", recall_list),
    ("F1 Score", f1score_list),
    ("AUC", auc_list),
    ("AUPRC", auprc_list),
):
    print_mean_std(metric_values, metric_name)

#