#GAT_PB IID

##Load Libraries

In [None]:
!pip install ogb
!pip install torch_geometric

In [None]:
pip install torch

In [None]:
from torch._C import *
import torch
print(torch.__version__)
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, auc
from sklearn.preprocessing import label_binarize
import numpy as np
from scipy.sparse import coo_matrix


##GAT Model Layers

In [None]:
class GAT(torch.nn.Module):
    """Stack of ``GATConv`` layers followed by a linear classification head.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of every GAT layer except the last one.
        out_channels: Output size of the last GAT layer (input of the head).
        num_classes: Number of logits produced per node.
        num_layers: Requested number of GAT layers. An input layer and an
            output layer are always created, so values below 2 still give
            two layers; ``num_layers - 2`` hidden layers sit in between.
        dropout: Dropout probability applied after every GAT layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_classes, num_layers, dropout):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        self.fc = torch.nn.ModuleList()
        self.convs.append(GATConv(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(GATConv(hidden_channels, hidden_channels))
        self.convs.append(GATConv(hidden_channels, out_channels))
        self.fc.append(torch.nn.Linear(out_channels, num_classes))
        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every learnable layer: the GAT layers and the head."""
        for conv in self.convs:
            conv.reset_parameters()
        # The linear head has to be reset as well; otherwise its trained
        # weights survive a reset and leak into the next training run.
        for fc in self.fc:
            fc.reset_parameters()

    def forward(self, x, adj_t):
        """Return one row of raw class logits (no softmax) per node.

        Args:
            x: Node feature matrix.
            adj_t: Graph connectivity, passed unchanged to every ``GATConv``.
        """
        # ReLU and dropout follow every GAT layer, including the last one.
        for conv in self.convs:
            x = conv(x, adj_t)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        # All head layers but the last get dropout (no-op for a single layer).
        for fc in self.fc[:-1]:
            x = fc(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc[-1](x)
        return x

def train(model, data, train_idx, optimizer, class_weights=None):
    """Run one full-batch optimisation step and return the training loss.

    Args:
        model: Module called as ``model(data.x, data.adj_t)``; its output is
            indexed with ``train_idx`` and fed to the cross-entropy loss.
        data: Object exposing ``x``, ``adj_t`` and ``y``.
        train_idx: Index selecting the nodes that contribute to the loss.
        optimizer: Optimizer holding the parameters of ``model``.
        class_weights: Optional per-class weight tensor for the loss. When
            omitted, the module-level ``weights`` tensor is used if one is
            defined; if there is none, the loss is unweighted.

    Returns:
        The loss of this step as a Python float.
    """
    if class_weights is None:
        # Backward compatibility: earlier callers relied on a global `weights`.
        class_weights = globals().get('weights')

    model.train()
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
    optimizer.zero_grad()
    out = model(data.x, data.adj_t)[train_idx]
    loss = criterion(out, data.y[train_idx])
    loss.backward()
    optimizer.step()
    return loss.item()

def to_one_hot(y, num_classes):
    """Encode integer class labels as a float one-hot matrix.

    Args:
        y: Tensor of integer class indices; it is viewed as a single column.
        num_classes: Number of columns of the result.

    Returns:
        A ``(y.size(0), num_classes)`` tensor on the device of ``y`` holding
        1 at each row's class index and 0 elsewhere.
    """
    encoded = torch.zeros((y.size(0), num_classes), device=y.device)
    column_index = y.view(-1, 1)
    # scatter_ writes in place and returns the same tensor.
    return encoded.scatter_(1, column_index, 1)

def _compute_metrics(y_true, y_probs):
    """Compute the classification metrics of one split.

    Args:
        y_true: One-hot encoded ground-truth labels, one row per node.
        y_probs: Class probabilities, one row per node. Column 1 is used as
            the positive-class score for the precision-recall curve.

    Returns:
        ``[accuracy, precision, recall, f1, roc_auc, pr_auc]``.
    """
    y_true_cpu = y_true.cpu()
    y_probs_cpu = y_probs.cpu()
    y_true_labels = y_true_cpu.argmax(dim=1).numpy()
    y_pred_labels = y_probs_cpu.argmax(dim=1).numpy()

    acc = accuracy_score(y_true_labels, y_pred_labels)
    prec = precision_score(y_true_labels, y_pred_labels, zero_division=0)
    rec = recall_score(y_true_labels, y_pred_labels, zero_division=0)
    f1 = f1_score(y_true_labels, y_pred_labels, zero_division=0)
    rocauc = roc_auc_score(y_true_cpu, y_probs_cpu)

    # Column 1 of the one-hot matrix is the positive-class indicator, so it
    # can be paired directly with the column-1 probabilities.
    precision, recall, _ = precision_recall_curve(
        y_true_cpu[:, 1].numpy(), y_probs_cpu[:, 1].numpy())
    auprc = auc(recall, precision)

    return [acc, prec, rec, f1, rocauc, auprc]


@torch.no_grad()
def _evaluate_splits(model, data, split_idx, num_classes, split_names):
    """Run one forward pass in eval mode and score each requested split.

    Returns a tuple with one metrics list per entry of ``split_names``, in
    the same order.
    """
    model.eval()
    y_probs = torch.softmax(model(data.x, data.adj_t), dim=1)

    results = []
    for split_name in split_names:
        idx = split_idx[split_name]
        y_true = to_one_hot(data.y[idx], num_classes)
        results.append(_compute_metrics(y_true, y_probs[idx]))
    return tuple(results)


@torch.no_grad()
def test(model, data, split_idx, num_classes):
    """Evaluate ``model`` on the 'train', 'valid' and 'test' splits.

    Args:
        model: Module called as ``model(data.x, data.adj_t)``.
        data: Object exposing ``x``, ``adj_t`` and ``y``.
        split_idx: Mapping with the keys 'train', 'valid' and 'test', each
            holding the node indices of that split.
        num_classes: Number of classes used for the one-hot encoding.

    Returns:
        ``(train_metrics, valid_metrics, test_metrics)``; each is the list
        ``[accuracy, precision, recall, f1, roc_auc, pr_auc]``.
    """
    return _evaluate_splits(model, data, split_idx, num_classes,
                            ('train', 'valid', 'test'))


@torch.no_grad()
def test_kfold(model, data, split_idx, num_classes):
    """Evaluate ``model`` on the 'train' and 'test' splits of one fold.

    Same as ``test`` but without a validation split.

    Returns:
        ``(train_metrics, test_metrics)``; each is the list
        ``[accuracy, precision, recall, f1, roc_auc, pr_auc]``.
    """
    return _evaluate_splits(model, data, split_idx, num_classes,
                            ('train', 'test'))

##Load Data

In [None]:
# Mount Google Drive at /content/gdrive; the CSV paths used in the following
# cells live under that mount point (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import pandas as pd

# Table layout used by the rest of the notebook: first column is the 'Id',
# the last column is the label, everything in between is the feature vector.
df_features = pd.read_csv('/content/gdrive/MyDrive/New_Repo_Data/embeddings_mean_protbert_3k.csv')

# NOTE(review): this replaces every feature column with uniform random noise
# in [-3, 6), discarding the values read from the CSV. Presumably a
# random-feature baseline - confirm this is intended.
# The shape is taken from the frame itself so the cell does not break when the
# file does not have exactly 5609 rows and 1024 feature columns.
df_features.iloc[:, 1:-1] = np.random.uniform(
    low=-3, high=6, size=df_features.iloc[:, 1:-1].shape)

# Id/label pairs of the first 4819 rows (presumably the labelled subset -
# TODO confirm). Copy so later column writes do not target a slice.
df_labels = df_features.iloc[:4819, [0, -1]].copy()

In [None]:
# Notebook display: (rows, columns) of the Id/label table.
df_labels.shape

In [None]:
# Ids are handled as strings everywhere below (ordering, graph membership).
# `assign` builds a new frame instead of writing a column into `df_labels`,
# which was created as a slice of `df_features` and would otherwise trigger
# pandas' SettingWithCopyWarning.
df_labels = df_labels.assign(Id=df_labels['Id'].astype(str))
df_features['Id'] = df_features['Id'].astype(str)

In [None]:
# Position of every labelled id inside df_labels (later duplicates win).
order_dict = dict(zip(df_labels['Id'], range(len(df_labels))))


def sorting_key(value):
    """Return a sort key that puts ids known to ``order_dict`` first.

    Known ids are ranked by their position in ``df_labels``; unknown ids get
    an infinite rank and are therefore ordered after them, by id.
    """
    if value in order_dict:
        return (order_dict[value], value)
    return (float('inf'), value)


# Reorder the feature table so its rows follow the order of df_labels.
df_features = df_features.sort_values(by='Id', key=lambda id_column: id_column.map(sorting_key))

In [None]:
# Notebook display: (rows, columns) of the re-ordered feature table.
df_features.shape

In [None]:
import networkx as nx

graph_data = pd.read_csv('/content/gdrive/MyDrive/New_Repo_Data/new_ppi_edges_iid.csv')
id_list = list(df_features['Id'])
# Set lookup is O(1); testing membership in the list for both end points of
# every edge was O(len(id_list)) per test.
known_ids = set(id_list)

# End points are compared as strings, and stored as the same strings, so the
# graph's node names have the same type as df_features['Id'] when they are
# matched against it below.
sources = graph_data.iloc[:, 0].astype(str)
targets = graph_data.iloc[:, 1].astype(str)
both_known = sources.isin(known_ids) & targets.isin(known_ids)

# Undirected graph restricted to edges whose two end points have features.
G = nx.Graph()
G.add_edges_from(zip(sources[both_known], targets[both_known]))

# Keep only the feature rows of nodes that take part in at least one edge.
df_features = df_features[df_features['Id'].isin(list(G.nodes()))]

In [None]:
# Sparse adjacency matrix of G; rows and columns follow the row order of
# df_features (its first column holds the node ids).
adj_sparse = nx.adjacency_matrix(G, nodelist=list(df_features.iloc[:,0]))

In [None]:
# Label of every remaining node, taken from the last column of df_features
# (same row order as the adjacency matrix's nodelist).
node_labels = np.array(df_features.iloc[:,-1])

In [None]:
import os
import numpy as np
import torch

# NOTE(review): `adj_matrix` (dense, symmetrised, thresholded at 0.9) is not
# used by anything visible below - `adj_t` is built from `adj_sparse`. Kept
# in case other cells rely on it; consider deleting it to save memory.
adj_matrix = coo_matrix(adj_sparse).todense()
# Define the number of nodes and features
# NOTE(review): hard-coded sizes; they are not derived from the loaded data.
num_nodes = 4819
num_features = 1024
num_classes = 2

node_features = np.array(df_features.iloc[:, 1:-1])
node_names = np.array(df_features.iloc[:, 0])

adj_matrix = (adj_matrix + adj_matrix.T) / 2
adj_matrix[adj_matrix < 0.9] = 0  # Sparsify the adjacency matrix

node_features_tensor = torch.from_numpy(node_features).float()
node_labels_tensor = torch.from_numpy(node_labels).long()  # Convert labels to long type

adj_coo = adj_sparse.tocoo()
indices = np.vstack((adj_coo.row, adj_coo.col))
# Read the values from the same COO object as the indices so that both are
# guaranteed to be in the same order (previously taken from `adj_sparse`).
values = adj_coo.data

indices_tensor = torch.tensor(indices, dtype=torch.long)
values_tensor = torch.tensor(values, dtype=torch.float)

# Sparse CSR adjacency tensor handed to the model as `adj_t`.
adj_t = torch.sparse_coo_tensor(indices_tensor, values_tensor, adj_sparse.shape).to_sparse_csr()

data = Data(uni_id=node_names, x=node_features_tensor, adj_t=adj_t, y=node_labels_tensor)

# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Map unique IDs to indices (row position of each node in `data`)
id_to_idx_map = {id_: idx for idx, id_ in enumerate(data.uni_id)}

# Load the per-fold train/test id lists (CSV files) and map them to node indices
def load_folds(folds_dir, n_folds=5, id_map=None, target_device=None):
    """Load cross-validation folds as pairs of node-index tensors.

    For every fold ``k`` in ``1..n_folds`` the files
    ``fold_{k}_train_ids.csv`` and ``fold_{k}_test_ids.csv`` are read from
    ``folds_dir``; the first column of each file holds the node ids.

    Args:
        folds_dir: Directory containing the fold CSV files.
        n_folds: Number of folds to load (files are numbered from 1).
        id_map: Mapping from node id to node index. Defaults to the
            module-level ``id_to_idx_map``.
        target_device: Device the index tensors are moved to. Defaults to the
            module-level ``device``.

    Returns:
        A list of ``(train_idx, test_idx)`` tuples of ``torch.long`` tensors.

    Raises:
        KeyError: If a fold file contains ids that are not in ``id_map``.
    """
    if id_map is None:
        id_map = id_to_idx_map
    if target_device is None:
        target_device = device

    def ids_to_indices(csv_path):
        """Read the first CSV column and translate its ids to node indices."""
        indices = []
        missing = []
        for raw_id in pd.read_csv(csv_path).iloc[:, 0]:
            # pandas parses purely numeric ids as numbers; retry those as
            # strings so they still match a map keyed by string ids.
            key = raw_id if raw_id in id_map else str(raw_id)
            if key in id_map:
                indices.append(id_map[key])
            else:
                missing.append(raw_id)
        if missing:
            raise KeyError(
                f"{len(missing)} id(s) in {csv_path} are not in the node map, "
                f"e.g. {missing[:5]}")
        return torch.tensor(indices, dtype=torch.long).to(target_device)

    folds = []
    for fold in range(1, n_folds + 1):
        train_path = os.path.join(folds_dir, f'fold_{fold}_train_ids.csv')
        test_path = os.path.join(folds_dir, f'fold_{fold}_test_ids.csv')
        folds.append((ids_to_indices(train_path), ids_to_indices(test_path)))

    return folds

# `folds` becomes a list with one (train_idx, test_idx) pair of index tensors
# per fold, as returned by load_folds.
folds_dir = "/content/gdrive/MyDrive/New_Repo_Data/5folds"  # Update this path to your Google Drive directory
folds = load_folds(folds_dir)

In [None]:
import os
import numpy as np
import torch
from sklearn.model_selection import KFold
import torch.nn.functional as F

# Hyper-parameters and settings
hidden_channels = 128  # output size of the first GAT layer
out_channels = 64  # output size of the last GAT layer (input of the linear head)
num_layers = 2  # number of GAT layers
dropout = 0.5  # dropout probability inside the model
runs = 5  # NOTE(review): not referenced by the visible code
lr = 0.001  # Adam learning rate
epochs = 500  # maximum number of epochs per fold
eval_steps = 10  # evaluate (and possibly checkpoint) every `eval_steps` epochs
log_steps = 10  # NOTE(review): not referenced by the visible code
weights = torch.tensor([2.0,1.0])  # per-class loss weights, read as a global by train()
num_classes = 2
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Move the graph data and class weights to the device and build the model
data = data.to(device)
weights = weights.to(device)
model = GAT(in_channels=data.num_features,
            hidden_channels=hidden_channels,
            out_channels=out_channels,
            num_classes=num_classes,
            num_layers=num_layers,
            dropout=dropout).to(device)

# Metrics storage (one value is appended to each list per fold)
train_acc_list, train_prec_list, train_rec_list = [], [], []
train_f1_list, train_rocauc_list, train_aucprc_list = [], [], []
test_acc_list, test_prec_list, test_rec_list = [], [], []
test_f1_list, test_rocauc_list, test_aucprc_list = [], [], []

# Main training loop
for fold, (train_idx, test_idx) in enumerate(folds):
    print(f'\n=== Fold {fold + 1}/{len(folds)} ===')
    # as_tensor reuses an existing index tensor; torch.tensor(tensor) would
    # copy it and emit a copy-construct UserWarning.
    train_idx = torch.as_tensor(train_idx, dtype=torch.long).to(device)
    test_idx = torch.as_tensor(test_idx, dtype=torch.long).to(device)
    split_idx = {'train': train_idx, 'test': test_idx}

    # Every fold starts from freshly initialised weights and optimizer state.
    model.reset_parameters()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    checkpoint_path = f'best_model_fold_{fold}.pt'
    checkpoint_saved = False
    # -inf (not 0) so the first evaluation always produces a checkpoint.
    best_train_rocauc = float('-inf')
    loss_history = []

    for epoch in range(1, epochs + 1):
        loss = train(model, data, train_idx, optimizer)
        loss_history.append(loss)

        # Early stopping check: the loss has been flat over the last 10 epochs
        if len(loss_history) > 10 and np.std(loss_history[-10:]) < 1e-3:
            print(f"Early stopping at epoch {epoch}")
            break

        # Evaluation
        if epoch % eval_steps == 0 or epoch == epochs:
            train_metrics, test_metrics = test_kfold(model, data, split_idx, num_classes)
            train_rocauc = train_metrics[4]

            # Save best model
            # NOTE(review): the checkpoint is selected on *training* ROC AUC.
            if train_rocauc > best_train_rocauc:
                best_train_rocauc = train_rocauc
                torch.save(model.state_dict(), checkpoint_path)
                checkpoint_saved = True

    # If no evaluation ran in this fold, checkpoint the final weights so the
    # load below never picks up a missing or stale file from an earlier run.
    if not checkpoint_saved:
        torch.save(model.state_dict(), checkpoint_path)

    # Final evaluation with best model
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    train_metrics, test_metrics = test_kfold(model, data, split_idx, num_classes)

    # Store metrics (order matches the metric lists returned by test_kfold)
    for lst, values in zip([train_acc_list, train_prec_list, train_rec_list, train_f1_list, train_rocauc_list, train_aucprc_list],
                           train_metrics):
        lst.append(values)

    for lst, values in zip([test_acc_list, test_prec_list, test_rec_list, test_f1_list, test_rocauc_list, test_aucprc_list],
                           test_metrics):
        lst.append(values)

# Summary statistics of one metric across folds
def calculate_stats(metric_list, name):
    """Summarise a list of metric values.

    Args:
        metric_list: Numeric values, one per fold.
        name: Label of the series; accepted for the caller's convenience and
            not used in the computation.

    Returns:
        Dict with the keys 'mean', 'std', 'var', 'min' and 'max'.
    """
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('var', np.var),
        ('min', np.min),
        ('max', np.max),
    )
    return {label: reduce_fn(metric_list) for label, reduce_fn in reducers}

# Generate report: per metric, the across-fold statistics of train and test
print("\n=== Final Report ===")
header_cells = ('Metric', 'Mean', 'Std', 'Var', 'Min', 'Max')
print("{:<15} {:<8} {:<8} {:<8} {:<8} {:<8}".format(*header_cells))

metric_names = ('Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC', 'PR AUC')
train_series = (train_acc_list, train_prec_list, train_rec_list,
                train_f1_list, train_rocauc_list, train_aucprc_list)
test_series = (test_acc_list, test_prec_list, test_rec_list,
               test_f1_list, test_rocauc_list, test_aucprc_list)

for metric_name, train_list, test_list in zip(metric_names, train_series, test_series):
    train_stats = calculate_stats(train_list, 'Train')
    test_stats = calculate_stats(test_list, 'Test')

    print(f"\n**{metric_name}**")
    print("Train:\t{mean:.4f} ± {std:.4f}\t(var: {var:.4f})\t[{min:.4f}-{max:.4f}]".format_map(train_stats))
    print("Test:\t{mean:.4f} ± {std:.4f}\t(var: {var:.4f})\t[{min:.4f}-{max:.4f}]".format_map(test_stats))

##Run Model

In [None]:
import os
import numpy as np
import torch
from sklearn.model_selection import KFold
import torch.nn.functional as F

# Hyper-parameters and settings
# NOTE(review): this cell appears to repeat the preceding training cell verbatim.
hidden_channels = 128  # output size of the first GAT layer
out_channels = 64  # output size of the last GAT layer (input of the linear head)
num_layers = 2  # number of GAT layers
dropout = 0.5  # dropout probability inside the model
runs = 5  # NOTE(review): not referenced by the visible code
lr = 0.001  # Adam learning rate
epochs = 500  # maximum number of epochs per fold
eval_steps = 10  # evaluate (and possibly checkpoint) every `eval_steps` epochs
log_steps = 10  # NOTE(review): not referenced by the visible code
weights = torch.tensor([2.0,1.0])  # per-class loss weights, read as a global by train()
num_classes = 2
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Move the graph data and class weights to the device and build the model
data = data.to(device)
weights = weights.to(device)
model = GAT(in_channels=data.num_features,
            hidden_channels=hidden_channels,
            out_channels=out_channels,
            num_classes=num_classes,
            num_layers=num_layers,
            dropout=dropout).to(device)

# Metrics storage (one value is appended to each list per fold)
train_acc_list, train_prec_list, train_rec_list = [], [], []
train_f1_list, train_rocauc_list, train_aucprc_list = [], [], []
test_acc_list, test_prec_list, test_rec_list = [], [], []
test_f1_list, test_rocauc_list, test_aucprc_list = [], [], []

# Main training loop
for fold, (train_idx, test_idx) in enumerate(folds):
    print(f'\n=== Fold {fold + 1}/{len(folds)} ===')
    # as_tensor reuses an existing index tensor; torch.tensor(tensor) would
    # copy it and emit a copy-construct UserWarning.
    train_idx = torch.as_tensor(train_idx, dtype=torch.long).to(device)
    test_idx = torch.as_tensor(test_idx, dtype=torch.long).to(device)
    split_idx = {'train': train_idx, 'test': test_idx}

    # Every fold starts from freshly initialised weights and optimizer state.
    model.reset_parameters()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    checkpoint_path = f'best_model_fold_{fold}.pt'
    checkpoint_saved = False
    # -inf (not 0) so the first evaluation always produces a checkpoint.
    best_train_rocauc = float('-inf')
    loss_history = []

    for epoch in range(1, epochs + 1):
        loss = train(model, data, train_idx, optimizer)
        loss_history.append(loss)

        # Early stopping check: the loss has been flat over the last 10 epochs
        if len(loss_history) > 10 and np.std(loss_history[-10:]) < 1e-3:
            print(f"Early stopping at epoch {epoch}")
            break

        # Evaluation
        if epoch % eval_steps == 0 or epoch == epochs:
            train_metrics, test_metrics = test_kfold(model, data, split_idx, num_classes)
            train_rocauc = train_metrics[4]

            # Save best model
            # NOTE(review): the checkpoint is selected on *training* ROC AUC.
            if train_rocauc > best_train_rocauc:
                best_train_rocauc = train_rocauc
                torch.save(model.state_dict(), checkpoint_path)
                checkpoint_saved = True

    # If no evaluation ran in this fold, checkpoint the final weights so the
    # load below never picks up a missing or stale file from an earlier run.
    if not checkpoint_saved:
        torch.save(model.state_dict(), checkpoint_path)

    # Final evaluation with best model
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    train_metrics, test_metrics = test_kfold(model, data, split_idx, num_classes)

    # Store metrics (order matches the metric lists returned by test_kfold)
    for lst, values in zip([train_acc_list, train_prec_list, train_rec_list, train_f1_list, train_rocauc_list, train_aucprc_list],
                           train_metrics):
        lst.append(values)

    for lst, values in zip([test_acc_list, test_prec_list, test_rec_list, test_f1_list, test_rocauc_list, test_aucprc_list],
                           test_metrics):
        lst.append(values)

# Summary statistics of one metric across folds
def calculate_stats(metric_list, name):
    """Summarise a list of metric values.

    Args:
        metric_list: Numeric values, one per fold.
        name: Label of the series; accepted for the caller's convenience and
            not used in the computation.

    Returns:
        Dict with the keys 'mean', 'std', 'var', 'min' and 'max'.
    """
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('var', np.var),
        ('min', np.min),
        ('max', np.max),
    )
    return {label: reduce_fn(metric_list) for label, reduce_fn in reducers}

# Generate report: per metric, the across-fold statistics of train and test
print("\n=== Final Report ===")
header_cells = ('Metric', 'Mean', 'Std', 'Var', 'Min', 'Max')
print("{:<15} {:<8} {:<8} {:<8} {:<8} {:<8}".format(*header_cells))

metric_names = ('Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC', 'PR AUC')
train_series = (train_acc_list, train_prec_list, train_rec_list,
                train_f1_list, train_rocauc_list, train_aucprc_list)
test_series = (test_acc_list, test_prec_list, test_rec_list,
               test_f1_list, test_rocauc_list, test_aucprc_list)

for metric_name, train_list, test_list in zip(metric_names, train_series, test_series):
    train_stats = calculate_stats(train_list, 'Train')
    test_stats = calculate_stats(test_list, 'Test')

    print(f"\n**{metric_name}**")
    print("Train:\t{mean:.4f} ± {std:.4f}\t(var: {var:.4f})\t[{min:.4f}-{max:.4f}]".format_map(train_stats))
    print("Test:\t{mean:.4f} ± {std:.4f}\t(var: {var:.4f})\t[{min:.4f}-{max:.4f}]".format_map(test_stats))