# Word2vec_ProtT5

## Load Libraries

In [None]:
# ===============================
# Load Libraries
# ===============================

# Core
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch

# Embeddings
from gensim.models import Word2Vec
import networkx as nx
import gensim
import nltk
from nltk.tokenize import word_tokenize

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Class Imbalance
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, precision_recall_curve,
    average_precision_score, auc
)
from imblearn.over_sampling import RandomOverSampler

## Word2Vec

### Load Data

In [None]:
# Edge list: the first two columns are used as the two endpoint IDs of an edge.
graph_data = pd.read_csv('data/ppi_edges_iid.csv')
# Per-node feature table; an 'Id' column is required below.
df_features = pd.read_csv('data/protT5_embeddings.csv')

# Only the first N_GRAPH_PROTEINS rows of the feature table may become graph
# nodes (presumably the subset of interest — TODO confirm where 4819 comes from).
N_GRAPH_PROTEINS = 4819
df_feature = df_features.iloc[:N_GRAPH_PROTEINS, :]

# The lookup below compares str(id), so the reference set must hold strings as
# well; without the cast, non-string IDs would never match and G would stay empty.
id_list = set(df_feature['Id'].astype(str))

# Keep an edge only when both of its endpoints are in the allowed ID set.
G = nx.Graph()
G.add_edges_from(
    (id1, id2)
    for id1, id2 in zip(graph_data.iloc[:, 0], graph_data.iloc[:, 1])
    if str(id1) in id_list and str(id2) in id_list
)

### Generate Word2Vec Embeddings

In [None]:
# Turn every edge into two three-token "sentences" (one per direction) so that
# each endpoint shows up in the other's context during Word2Vec training.
# The two ID columns are read once instead of one scalar .iloc lookup per row.
source_ids = graph_data.iloc[:, 0].tolist()
target_ids = graph_data.iloc[:, 1].tolist()

# Same ordering as a forward pass over all edges followed by a reversed pass.
sentences = [f"{n1} is_connected {n2}" for n1, n2 in zip(source_ids, target_ids)]
sentences += [f"{n2} is_connected {n1}" for n1, n2 in zip(source_ids, target_ids)]


# Tokenization
# word_tokenize needs the Punkt data; newer NLTK releases look it up under
# 'punkt_tab' rather than 'punkt', so request both.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# NOTE(review): lower() also lower-cases the node IDs, so the 'Id' values
# exported below are lower-case. IDs containing upper-case letters would then
# fail to match df_features['Id'] in later isin()/merge steps — confirm the
# ID format.
tokenized_sentences = [word_tokenize(s.lower()) for s in sentences]


# Train Word2Vec Model
w2v_model = Word2Vec(
    min_count=1,       # keep every token, even IDs that occur only once
    alpha=0.001,       # initial learning rate
    vector_size=512,   # embedding dimensionality
    window=2,          # spans the whole three-token sentence
    epochs=1000,
    sg=1               # skip-gram
)

w2v_model.build_vocab(tokenized_sentences)
w2v_model.train(
    tokenized_sentences,
    total_examples=w2v_model.corpus_count,
    epochs=w2v_model.epochs
)


# Extract Node Embeddings
# NOTE: the vocabulary also contains the 'is_connected' relation token, so it
# is exported as a row alongside the node IDs.
vocab = w2v_model.wv.index_to_key

node_vectors = {
    str(node): w2v_model.wv[str(node)]
    for node in vocab
}

# One row per vocabulary entry: 'Id' followed by the embedding components.
node_vector = (
    pd.DataFrame.from_dict(node_vectors, orient='index')
    .reset_index()
    .rename(columns={'index': 'Id'})
)

node_vector.to_csv("word2vec_features.csv", index=False)


## Merge Word2Vec with Labels
# Make sure the first column of the feature table is called 'Id'.
first_column = df_features.columns[0]
df_features = df_features.rename(columns={first_column: 'Id'})

# Keep only the embedding rows whose Id also appears in the feature table.
has_features = node_vector['Id'].isin(df_features['Id'])
filtered_node_vector = node_vector.loc[has_features].copy()

# Attach the label of each remaining Id.
label_table = df_features[['Id', 'label']]
word2vec_vec = filtered_node_vector.merge(label_table, on='Id', how='left')

# Rows labelled -1 are dropped.
word2vec_vec = word2vec_vec.loc[word2vec_vec['label'].ne(-1)]

### Combine with Prot T5 Features

In [None]:
# Every column of the feature table except the label (already present in
# word2vec_vec) is appended to the Word2Vec rows, matched on 'Id'.
cols_to_add = [name for name in df_features.columns if name != 'label']

merged_df = word2vec_vec.merge(df_features[cols_to_add], on='Id', how='left')

# Move 'label' to the last position.
cols = [name for name in merged_df.columns if name != 'label']
cols.append('label')
merged_df = merged_df[cols]


## Rename Feature Columns
# Layout after renaming: 'Id', v1 … v1536, 'label'.
# The assignment raises if merged_df does not have exactly num_features + 2
# columns (presumably 512 Word2Vec + 1024 ProtT5 dimensions — TODO confirm).
num_features = 1536
feature_names = ['v' + str(i) for i in range(1, num_features + 1)]
merged_df.columns = ['Id', *feature_names, 'label']

## Neural Network Word2Vec




In [None]:
input_dim = 512
drop_out = 0.5

# Binary classifier: three ReLU hidden layers, each followed by dropout, and a
# single sigmoid output unit.
# Only the first layer declares the input width; later layers take their input
# size from the layer before them, so they carry no input_dim argument.
NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(512, input_dim=input_dim, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
NN.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy", "mae", "mse"],
)

In [None]:
def load_folds(folds_dir, n_folds=5):
    """Load the train/test ID tables of each cross-validation fold.

    For every fold number ``k`` from 1 to ``n_folds`` the files
    ``fold_<k>_train_ids.csv`` and ``fold_<k>_test_ids.csv`` are read from
    ``folds_dir`` with ``pd.read_csv``.

    Args:
        folds_dir: Directory containing the fold CSV files.
        n_folds: Number of folds to load (defaults to 5).

    Returns:
        A list of ``(train_ids, test_ids)`` DataFrame pairs, one per fold, in
        fold order.

    Raises:
        FileNotFoundError: If one of the expected CSV files is missing.
    """
    folds = []
    for fold in range(1, n_folds + 1):
        train_path = os.path.join(folds_dir, f'fold_{fold}_train_ids.csv')
        test_path = os.path.join(folds_dir, f'fold_{fold}_test_ids.csv')
        folds.append((pd.read_csv(train_path), pd.read_csv(test_path)))

    return folds

def store_initial_weights(model):
    """Return the model's current weights as given by ``model.get_weights()``.

    Meant to be called before training so the result can later be passed to
    ``model.set_weights`` to restore the starting point.
    """
    return model.get_weights()

def reset_weights(model, initial_weights):
    """Load ``initial_weights`` back into ``model`` through ``model.set_weights``."""
    model.set_weights(initial_weights)
    return None

# Snapshot of the freshly built model's weights, used to restart each fold.
initial_weights = NN.get_weights()

# Directory holding the fold_<k>_train_ids.csv / fold_<k>_test_ids.csv files;
# change it to wherever the fold files are stored.
folds_dir = "/data/5folds_IId"
folds = load_folds(folds_dir=folds_dir)

In [None]:
# Per-fold metric accumulators
accuracy_list, precision_list, recall_list = [], [], []
f1score_list, auc_list, avg_precision_list, auprc_list = [], [], [], []

# Features are every column except the identifier and the target.
X = word2vec_vec.drop(columns=['Id', 'label'])
y = word2vec_vec['label']
sample_ids = word2vec_vec['Id']

# Balancing applied to the training split only:
#   'over'        -> RandomOverSampler
#   'undersample' -> RandomUnderSampler
#   anything else -> no resampling
balancing = 'over'
# balancing = 'undersample'

# Decision threshold used to turn predicted probabilities into class labels.
threshold = 0.5

for fold_num, (train_ids, test_ids) in enumerate(folds, 1):

    # Restart every fold from the same stored weights.
    # NOTE(review): only the layer weights are restored; the optimizer state
    # presumably carries over between folds — confirm whether that is intended.
    reset_weights(NN, initial_weights)

    # The fold files list the sample IDs in their first column; build each row
    # mask once and reuse it for both features and labels.
    train_mask = sample_ids.isin(train_ids.iloc[:, 0])
    test_mask = sample_ids.isin(test_ids.iloc[:, 0])
    x_train, y_train = X[train_mask], y[train_mask]
    x_test, y_test = X[test_mask], y[test_mask]

    # --- Data Balancing on Train only ---
    if balancing == 'over':
        sampler = RandomOverSampler(random_state=42)
    elif balancing == 'undersample':
        sampler = RandomUnderSampler(random_state=42)
    else:
        sampler = None

    if sampler is None:
        x_train_bal, y_train_bal = x_train, y_train
    else:
        x_train_bal, y_train_bal = sampler.fit_resample(x_train, y_train)

    # --- Train on balanced data ---
    NN.fit(x_train_bal, y_train_bal, epochs=50, batch_size=32)
    pred = NN.predict(x_test)

    # Evaluate
    # The model has one output unit, so predict() yields a single column;
    # flatten it so every metric receives a 1-D score vector.
    y_score = pred.ravel()
    binary_predictions = (y_score >= threshold).astype(int)
    accuracy = accuracy_score(y_test, binary_predictions)
    precision = precision_score(y_test, binary_predictions)
    recall = recall_score(y_test, binary_predictions)
    f1 = f1_score(y_test, binary_predictions)
    auc_score = roc_auc_score(y_test, y_score)
    precision_values, recall_values, _ = precision_recall_curve(y_test, y_score)
    avg_precision = average_precision_score(y_test, y_score)
    # Area under the precision-recall curve by trapezoidal integration.
    auprc = auc(recall_values, precision_values)

    # Store
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1score_list.append(f1)
    auc_list.append(auc_score)
    avg_precision_list.append(avg_precision)
    auprc_list.append(auprc)

    print(f"Fold {fold_num} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, "
          f"AUC: {auc_score:.4f}, Avg Precision: {avg_precision:.4f}, AUPRC: {auprc:.4f}")

def print_mean_std(metric_list, metric_name):
    """Print ``metric_name`` (left-aligned, width 15) with the mean ± std of ``metric_list``.

    Both statistics come from NumPy with default arguments and are shown with
    four decimals.
    """
    mean_value = np.mean(metric_list)
    std_value = np.std(metric_list)
    print("{:<15}: {:.4f} ± {:.4f}".format(metric_name, mean_value, std_value))

# Summary over folds. Average precision is collected and printed for each
# fold, so it is reported here as well.
print("\nAverage metrics across all folds (mean ± std):")
for metric_name, metric_values in (
    ("Accuracy", accuracy_list),
    ("Precision", precision_list),
    ("Recall", recall_list),
    ("F1 Score", f1score_list),
    ("AUC", auc_list),
    ("Avg Precision", avg_precision_list),
    ("AUPRC", auprc_list),
):
    print_mean_std(metric_values, metric_name)

## Neural Network Concat (ProtT5, Word2Vec)

In [None]:
input_dim = 1536
drop_out = 0.5

# Binary classifier: three ReLU hidden layers, each followed by dropout, and a
# single sigmoid output unit.
# Only the first layer declares the input width; later layers take their input
# size from the layer before them, so they carry no input_dim argument.
NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(1536, input_dim=input_dim, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(drop_out),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
NN.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy", "mae", "mse"],
)

In [None]:
def load_folds(folds_dir, n_folds=5):
    """Load the train/test ID tables of each cross-validation fold.

    For every fold number ``k`` from 1 to ``n_folds`` the files
    ``fold_<k>_train_ids.csv`` and ``fold_<k>_test_ids.csv`` are read from
    ``folds_dir`` with ``pd.read_csv``.

    Args:
        folds_dir: Directory containing the fold CSV files.
        n_folds: Number of folds to load (defaults to 5).

    Returns:
        A list of ``(train_ids, test_ids)`` DataFrame pairs, one per fold, in
        fold order.

    Raises:
        FileNotFoundError: If one of the expected CSV files is missing.
    """
    folds = []
    for fold in range(1, n_folds + 1):
        train_path = os.path.join(folds_dir, f'fold_{fold}_train_ids.csv')
        test_path = os.path.join(folds_dir, f'fold_{fold}_test_ids.csv')
        folds.append((pd.read_csv(train_path), pd.read_csv(test_path)))

    return folds

def store_initial_weights(model):
    """Return the model's current weights as given by ``model.get_weights()``.

    Meant to be called before training so the result can later be passed to
    ``model.set_weights`` to restore the starting point.
    """
    return model.get_weights()

def reset_weights(model, initial_weights):
    """Load ``initial_weights`` back into ``model`` through ``model.set_weights``."""
    model.set_weights(initial_weights)
    return None

# Snapshot of the freshly built model's weights, used to restart each fold.
initial_weights = NN.get_weights()

# Directory holding the fold_<k>_train_ids.csv / fold_<k>_test_ids.csv files;
# change it to wherever the fold files are stored.
folds_dir = "/data/5folds_IId"
folds = load_folds(folds_dir=folds_dir)

In [None]:
# Per-fold metric accumulators
accuracy_list, precision_list, recall_list = [], [], []
f1score_list, auc_list, avg_precision_list, auprc_list = [], [], [], []

# Features are every column except the identifier and the target.
X = merged_df.drop(columns=['Id', 'label'])
y = merged_df['label']
sample_ids = merged_df['Id']

# Balancing applied to the training split only:
#   'over'        -> RandomOverSampler
#   'undersample' -> RandomUnderSampler
#   anything else -> no resampling
balancing = 'over'
# balancing = 'undersample'

# Decision threshold used to turn predicted probabilities into class labels.
threshold = 0.5

for fold_num, (train_ids, test_ids) in enumerate(folds, 1):

    # Restart every fold from the same stored weights.
    # NOTE(review): only the layer weights are restored; the optimizer state
    # presumably carries over between folds — confirm whether that is intended.
    reset_weights(NN, initial_weights)

    # Split
    # The fold files list the sample IDs in their first column; build each row
    # mask once and reuse it for both features and labels.
    train_mask = sample_ids.isin(train_ids.iloc[:, 0])
    test_mask = sample_ids.isin(test_ids.iloc[:, 0])
    x_train, y_train = X[train_mask], y[train_mask]
    x_test, y_test = X[test_mask], y[test_mask]

    # --- Data Balancing on Train only ---
    if balancing == 'over':
        sampler = RandomOverSampler(random_state=42)
    elif balancing == 'undersample':
        sampler = RandomUnderSampler(random_state=42)
    else:
        sampler = None

    if sampler is None:
        x_train_bal, y_train_bal = x_train, y_train
    else:
        x_train_bal, y_train_bal = sampler.fit_resample(x_train, y_train)

    # --- Train on balanced data ---
    NN.fit(x_train_bal, y_train_bal, epochs=50, batch_size=32)
    pred = NN.predict(x_test)

    # Evaluate
    # The model has one output unit, so predict() yields a single column;
    # flatten it so every metric receives a 1-D score vector.
    y_score = pred.ravel()
    binary_predictions = (y_score >= threshold).astype(int)
    accuracy = accuracy_score(y_test, binary_predictions)
    precision = precision_score(y_test, binary_predictions)
    recall = recall_score(y_test, binary_predictions)
    f1 = f1_score(y_test, binary_predictions)
    auc_score = roc_auc_score(y_test, y_score)
    precision_values, recall_values, _ = precision_recall_curve(y_test, y_score)
    avg_precision = average_precision_score(y_test, y_score)
    # Area under the precision-recall curve by trapezoidal integration.
    auprc = auc(recall_values, precision_values)

    # Store
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1score_list.append(f1)
    auc_list.append(auc_score)
    avg_precision_list.append(avg_precision)
    auprc_list.append(auprc)

    print(f"Fold {fold_num} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, "
          f"AUC: {auc_score:.4f}, Avg Precision: {avg_precision:.4f}, AUPRC: {auprc:.4f}")

def print_mean_std(metric_list, metric_name):
    """Print ``metric_name`` (left-aligned, width 15) with the mean ± std of ``metric_list``.

    Both statistics come from NumPy with default arguments and are shown with
    four decimals.
    """
    mean_value = np.mean(metric_list)
    std_value = np.std(metric_list)
    print("{:<15}: {:.4f} ± {:.4f}".format(metric_name, mean_value, std_value))

# Summary over folds. Average precision is collected and printed for each
# fold, so it is reported here as well.
print("\nAverage metrics across all folds (mean ± std):")
for metric_name, metric_values in (
    ("Accuracy", accuracy_list),
    ("Precision", precision_list),
    ("Recall", recall_list),
    ("F1 Score", f1score_list),
    ("AUC", auc_list),
    ("Avg Precision", avg_precision_list),
    ("AUPRC", auprc_list),
):
    print_mean_std(metric_values, metric_name)