In [1]:
import os
import wandb
import random
import pyreadstat
import torch
import numpy as np
import pandas as pd
import networkx as nx
import torch.nn as nn

import torch_geometric as tg
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.transforms import RandomNodeSplit
from torch.utils.data import DataLoader, TensorDataset

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from torch.utils.tensorboard import SummaryWriter

In [2]:
# wandb.login()

In [3]:
# Preparing the Graph Dataset
DATASET_DIR = '/home/shussein/NetCO/GNNs/data/COPD/SparsifiedNetworks'
dataset_name = 'trimmed_fev1_0.515_0.111_adj.csv'

graph_adj_file = os.path.join(DATASET_DIR, dataset_name)
graph_adj = pd.read_csv(graph_adj_file, index_col=0).to_numpy()
nodes_names = pd.read_csv(graph_adj_file, index_col=0).index.tolist()

original_dataset = pd.read_csv(os.path.join(DATASET_DIR, 'fev1_X.csv'), index_col=0).reset_index(drop='index')
dataset_associated_phenotype = pd.read_csv(os.path.join(DATASET_DIR, 'fev1_Y.csv'), index_col=0).reset_index(drop='index')

# TODO: Attempting to run the GCN without predicting the phenotype; if this works -> remove this code
nodes_correlation_with_phenotype = original_dataset.corrwith(dataset_associated_phenotype['FEV1pp_utah'])

original_dataset_sid = pd.read_csv('../GNNs/data/COPD/SparsifiedNetworks/fev1_X.csv', index_col=0)
original_dataset_sid.index.name = 'sid'

clinical_variables, meta = pyreadstat.read_sas7bdat("../Data/COPDGene_P1P2P3_SM_NS_Long_Oct22.sas7bdat")
clinical_variables = clinical_variables.set_index('sid')
# Filtering on Visit Number
clinical_variables = clinical_variables[clinical_variables['visitnum'] == 2.0]

clinical_variables_comorbidities = ['Angina', 'CongestHeartFail', 'CoronaryArtery', 'HeartAttack', 'PeriphVascular', 'Stroke', 'TIA', 'Diabetes',
                                    'Osteoporosis', 'HighBloodPres', 'HighCholest', 'CognitiveDisorder', 'MacularDegen', 'KidneyDisease', 'LiverDisease']
clinical_variables = clinical_variables.assign(comorbidities=clinical_variables[clinical_variables_comorbidities].sum(axis=1))

complete_original_dataset = pd.merge(clinical_variables, original_dataset_sid, left_index=True, right_index=True)
clinical_variables = complete_original_dataset[clinical_variables.columns]

complete_original_dataset['finalgold_visit'].fillna(0, inplace=True)

complete_original_dataset.drop(complete_original_dataset[complete_original_dataset['finalgold_visit'] == -1].index, inplace=True)
original_dataset_sid = original_dataset_sid[original_dataset_sid.index.isin(complete_original_dataset.index.tolist())]
clinical_variables_cols = ['gender', 'age_visit', 'Chronic_Bronchitis', 'PRM_pct_emphysema_Thirona', 'PRM_pct_normal_Thirona', 'Pi10_Thirona', 'comorbidities']

graph = nx.from_numpy_array(graph_adj)

nodes_features = []
for node_name in nodes_names:
    node_features = []
    for clinical_variable in clinical_variables_cols:
        node_features.append(
            abs(original_dataset_sid[node_name].corr(complete_original_dataset[clinical_variable].astype('float64'))))
    nodes_features.append(node_features)

features = np.array(nodes_features)
nodes_labels = [abs(x) for x in nodes_correlation_with_phenotype.values.tolist()]
graph.remove_edges_from(nx.selfloop_edges(graph))

x = np.zeros(features.shape)
graph_nodes = list(graph.nodes)
for m in range(features.shape[0]):
    x[graph_nodes[m]] = features[m]
x = torch.from_numpy(x).float()

# Edges Indexes
edge_index = np.array(list(graph.edges))
edge_index = np.concatenate((edge_index, edge_index[:, ::-1]), axis=0)
edge_index = torch.from_numpy(edge_index).long().permute(1, 0)

# Edges Weights
edge_weight = np.array(list(nx.get_edge_attributes(graph, 'weight').values()))
edge_weight = np.concatenate((edge_weight, edge_weight), axis=0)
edge_weight = torch.from_numpy(edge_weight).float()

nodes_labels = torch.from_numpy(np.array(nodes_labels)).float()
dataset = Data(x=x, edge_index=edge_index, edge_attr=edge_weight, y=nodes_labels) # torch.zeros(len(nodes_names)))
dataset

Data(x=[27, 7], edge_index=[2, 78], edge_attr=[78], y=[27])

In [4]:
transform = RandomNodeSplit(num_val=5, num_test=5)
data = transform(dataset)
data

Data(x=[27, 7], edge_index=[2, 78], edge_attr=[78], y=[27], train_mask=[27], val_mask=[27], test_mask=[27])

In [39]:
original_dataset_sid['finalgold_visit'] = complete_original_dataset['finalgold_visit']
original_dataset_sid.drop(['finalgold_visit'], axis=1, inplace=True)
original_dataset_sid

Unnamed: 0_level_0,(N(1) + N(8))-acetylspermidine,5-acetylamino-6-amino-3-methyluracil,5-hydroxyhexanoate,adrenate (22:4n6),C-glycosyltryptophan,phosphocholine,ergothioneine,myristoleoylcarnitine (C14:1)*,"N2,N2-dimethylguanosine",X - 12026,...,Complement component C9,Carbonic anhydrase 6,Kallistatin,Beta-2-microglobulin,C-reactive protein,Growth/differentiation factor 15,"Alpha-(1,3)-fucosyltransferase 5",Trefoil factor 3,Troponin T,N-terminal pro-BNP
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10010J,-0.190253,2.198677,0.951882,1.728713,1.696511,1.810291,-1.921257,0.875180,0.892684,0.124223,...,0.322382,-1.118447,-0.234189,0.926362,0.557443,0.017068,-0.382598,0.234377,0.593199,1.282038
10031R,1.137203,-0.556750,1.417606,0.412620,0.341081,0.396214,-0.667501,1.829685,-0.880821,-1.277666,...,0.226270,-0.377116,-1.673974,0.100996,0.220795,-0.145500,-0.386756,-0.132752,-0.795531,-0.020434
10032T,-0.852246,0.353310,0.415074,0.528622,0.050416,0.134069,0.375303,0.350066,-2.254200,-0.518312,...,-0.428367,0.574186,-0.634489,-0.364988,0.629139,0.187559,-1.008337,-0.288907,-0.917175,-0.896100
10052Z,0.247807,0.975486,-1.502064,0.469000,0.333686,-0.163554,-0.732381,0.195025,-0.199153,-0.187908,...,-0.409941,-0.775491,-0.197459,-0.731378,1.171704,-1.459689,2.014551,-0.646074,0.414110,-1.828103
10055F,-0.366317,-1.130497,0.858598,-0.866032,-0.241559,0.173388,-0.236713,1.537696,-1.105873,-1.534229,...,0.660275,0.204040,-0.372867,0.890467,-0.852658,1.371039,1.157176,0.630683,0.051998,2.123272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25551W,0.829446,0.050146,-2.103680,-0.102467,-0.408373,-0.135074,1.477414,-1.314134,0.006312,-0.615117,...,-1.925122,-0.239220,1.567190,0.312708,-3.290274,-0.639323,-1.808501,-0.282904,-0.152816,0.776388
25563D,-0.209134,-0.472576,0.019962,-1.653494,-0.981828,1.120985,0.874699,0.033871,-0.957397,-1.584809,...,0.294543,-0.309081,-0.945435,-0.337557,1.224334,-0.182733,0.960652,-0.789511,-0.684374,1.373968
25564F,-0.098917,-1.349235,1.152642,-0.118369,-0.103682,1.691015,-0.610257,0.828027,0.085634,-0.500804,...,-0.173596,0.309346,-0.362805,-1.158474,0.221569,-1.306005,0.326384,-0.922360,-0.366161,-1.541756
25571C,-2.186909,-0.205182,-0.552108,-0.625187,-0.446872,2.273861,0.261602,-0.991433,-0.777158,-0.851403,...,0.294511,-2.127945,0.022630,-0.590303,1.132313,-1.366281,-0.561698,-0.902983,-0.610586,-0.965605


In [5]:
# Unifying Classes 0, -1, {1, 2} -> 1, {3, 4} -> 2
complete_original_dataset['finalgold_visit'] = np.where(complete_original_dataset['finalgold_visit'] == 2, 1, complete_original_dataset['finalgold_visit'])
complete_original_dataset['finalgold_visit'] = np.where((complete_original_dataset['finalgold_visit'] == 3) |
                                                        (complete_original_dataset['finalgold_visit'] == 4), 2, complete_original_dataset['finalgold_visit'])

In [120]:
dataset_to_save = original_dataset_sid
dataset_to_save['finalgold_visit'] = complete_original_dataset['finalgold_visit']
additional_100_samples = dataset_to_save.sample(n=100)


Unnamed: 0_level_0,(N(1) + N(8))-acetylspermidine,5-acetylamino-6-amino-3-methyluracil,5-hydroxyhexanoate,adrenate (22:4n6),C-glycosyltryptophan,phosphocholine,ergothioneine,myristoleoylcarnitine (C14:1)*,"N2,N2-dimethylguanosine",X - 12026,...,Carbonic anhydrase 6,Kallistatin,Beta-2-microglobulin,C-reactive protein,Growth/differentiation factor 15,"Alpha-(1,3)-fucosyltransferase 5",Trefoil factor 3,Troponin T,N-terminal pro-BNP,finalgold_visit
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10010J,-0.190253,2.198677,0.951882,1.728713,1.696511,1.810291,-1.921257,0.875180,0.892684,0.124223,...,-1.118447,-0.234189,0.926362,0.557443,0.017068,-0.382598,0.234377,0.593199,1.282038,1.0
10031R,1.137203,-0.556750,1.417606,0.412620,0.341081,0.396214,-0.667501,1.829685,-0.880821,-1.277666,...,-0.377116,-1.673974,0.100996,0.220795,-0.145500,-0.386756,-0.132752,-0.795531,-0.020434,1.0
10032T,-0.852246,0.353310,0.415074,0.528622,0.050416,0.134069,0.375303,0.350066,-2.254200,-0.518312,...,0.574186,-0.634489,-0.364988,0.629139,0.187559,-1.008337,-0.288907,-0.917175,-0.896100,1.0
10052Z,0.247807,0.975486,-1.502064,0.469000,0.333686,-0.163554,-0.732381,0.195025,-0.199153,-0.187908,...,-0.775491,-0.197459,-0.731378,1.171704,-1.459689,2.014551,-0.646074,0.414110,-1.828103,2.0
10055F,-0.366317,-1.130497,0.858598,-0.866032,-0.241559,0.173388,-0.236713,1.537696,-1.105873,-1.534229,...,0.204040,-0.372867,0.890467,-0.852658,1.371039,1.157176,0.630683,0.051998,2.123272,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25551W,0.829446,0.050146,-2.103680,-0.102467,-0.408373,-0.135074,1.477414,-1.314134,0.006312,-0.615117,...,-0.239220,1.567190,0.312708,-3.290274,-0.639323,-1.808501,-0.282904,-0.152816,0.776388,0.0
25563D,-0.209134,-0.472576,0.019962,-1.653494,-0.981828,1.120985,0.874699,0.033871,-0.957397,-1.584809,...,-0.309081,-0.945435,-0.337557,1.224334,-0.182733,0.960652,-0.789511,-0.684374,1.373968,0.0
25564F,-0.098917,-1.349235,1.152642,-0.118369,-0.103682,1.691015,-0.610257,0.828027,0.085634,-0.500804,...,0.309346,-0.362805,-1.158474,0.221569,-1.306005,0.326384,-0.922360,-0.366161,-1.541756,0.0
25571C,-2.186909,-0.205182,-0.552108,-0.625187,-0.446872,2.273861,0.261602,-0.991433,-0.777158,-0.851403,...,-2.127945,0.022630,-0.590303,1.132313,-1.366281,-0.561698,-0.902983,-0.610586,-0.965605,0.0


In [125]:
additional_100_samples = dataset_to_save.sample(n=100)
dataset_to_save = pd.concat([dataset_to_save, additional_100_samples], ignore_index=True)
dataset_to_save

Unnamed: 0,(N(1) + N(8))-acetylspermidine,5-acetylamino-6-amino-3-methyluracil,5-hydroxyhexanoate,adrenate (22:4n6),C-glycosyltryptophan,phosphocholine,ergothioneine,myristoleoylcarnitine (C14:1)*,"N2,N2-dimethylguanosine",X - 12026,...,Carbonic anhydrase 6,Kallistatin,Beta-2-microglobulin,C-reactive protein,Growth/differentiation factor 15,"Alpha-(1,3)-fucosyltransferase 5",Trefoil factor 3,Troponin T,N-terminal pro-BNP,finalgold_visit
0,-0.190253,2.198677,0.951882,1.728713,1.696511,1.810291,-1.921257,0.875180,0.892684,0.124223,...,-1.118447,-0.234189,0.926362,0.557443,0.017068,-0.382598,0.234377,0.593199,1.282038,1.0
1,1.137203,-0.556750,1.417606,0.412620,0.341081,0.396214,-0.667501,1.829685,-0.880821,-1.277666,...,-0.377116,-1.673974,0.100996,0.220795,-0.145500,-0.386756,-0.132752,-0.795531,-0.020434,1.0
2,-0.852246,0.353310,0.415074,0.528622,0.050416,0.134069,0.375303,0.350066,-2.254200,-0.518312,...,0.574186,-0.634489,-0.364988,0.629139,0.187559,-1.008337,-0.288907,-0.917175,-0.896100,1.0
3,0.247807,0.975486,-1.502064,0.469000,0.333686,-0.163554,-0.732381,0.195025,-0.199153,-0.187908,...,-0.775491,-0.197459,-0.731378,1.171704,-1.459689,2.014551,-0.646074,0.414110,-1.828103,2.0
4,-0.366317,-1.130497,0.858598,-0.866032,-0.241559,0.173388,-0.236713,1.537696,-1.105873,-1.534229,...,0.204040,-0.372867,0.890467,-0.852658,1.371039,1.157176,0.630683,0.051998,2.123272,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999,0.659278,-1.946808,-0.669276,-0.096424,-0.720189,-0.871795,-0.931859,-0.699814,-0.660706,-1.673829,...,-0.547514,-0.272055,-0.321034,-0.286900,-0.391613,-1.214222,-1.845630,-0.256115,-1.807875,0.0
1000,-0.642646,-0.705518,-0.616199,-0.780682,0.009038,0.139536,0.690948,-0.929369,-0.061987,0.062773,...,1.089632,-0.693704,-0.392294,-0.084729,-0.509537,0.512960,-0.751631,-0.731859,-0.257789,2.0
1001,1.855791,0.055643,-0.198139,-1.327981,-0.685873,0.710818,2.217788,0.675797,0.321138,0.948820,...,2.216299,0.737917,-1.432378,0.033276,-0.812547,0.827589,-1.083431,-0.406224,0.667079,0.0
1002,-1.707297,-0.743802,-0.170627,0.745788,-0.739919,-0.352481,-2.994006,-0.717883,-0.347868,0.572050,...,1.357573,0.909138,0.075615,0.007780,1.562260,-0.538427,-0.361136,0.534807,-0.756106,0.0


In [127]:
import re
dataset_to_save = dataset_to_save.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
dataset_to_save.to_csv("OriginalDatasetwithGOLD.csv")

In [6]:
import torch.optim as optim
import numpy as np
def embedding_autoencoder(nodes_embeddings):
    # Define the autoencoder model
    class Autoencoder(nn.Module):
        def __init__(self, input_dim, encoding_dim):
            super(Autoencoder, self).__init__()
            self.encoder = nn.Sequential(
                nn.Linear(input_dim, 8),   # Input layer -> Hidden layer 1
                nn.ReLU(),
                nn.Linear(8, 4),           # Hidden layer 1 -> Hidden layer 2
                nn.ReLU(),
                nn.Linear(4, encoding_dim) # Hidden layer 2 -> Encoding layer
            )
            self.decoder = nn.Sequential(
                nn.Linear(encoding_dim, 4), # Encoding layer -> Hidden layer 2
                nn.ReLU(),
                nn.Linear(4, 8),            # Hidden layer 2 -> Hidden layer 1
                nn.ReLU(),
                nn.Linear(8, input_dim)     # Hidden layer 1 -> Output layer
            )
    
        def forward(self, x):
            x = self.encoder(x)
            x = self.decoder(x)
            return x

    # Set random seed for reproducibility
    torch.manual_seed(42)

    # Define hyperparameters
    input_dim = nodes_embeddings.shape[1]
    encoding_dim = 1
    learning_rate = 0.001
    num_epochs = 300
    
    # Create the autoencoder model
    model = Autoencoder(input_dim, encoding_dim)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(num_epochs):
        # Forward pass
        output = model(nodes_embeddings)
        loss = criterion(output, nodes_embeddings)
    
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        # # Print progress
        # if (epoch+1) % 10 == 0:
        #     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # # Test the autoencoder
    # test_input = torch.randn(1, input_dim)
    # encoded_output = model.encoder(test_input)
    # decoded_output = model.decoder(encoded_output)
    # 
    # print(f'Original input: {test_input}')
    # print(f'Encoded output: {encoded_output}')
    # print(f'Decoded output: {decoded_output}')
    
    encoded_output = model.encoder(nodes_embeddings)
    # print("Encoded Output %s" % encoded_output)
    return encoded_output

In [229]:
subgraph_nodes_embeddings = pd.read_csv('HiddenLayerOutput_GCN_trimmed_fev1_0.515_0.111_adj.csv')
subgraph_nodes_embeddings = torch.tensor(subgraph_nodes_embeddings.values, dtype=torch.float32)
t = embedding_autoencoder(subgraph_nodes_embeddings)
t.detach().numpy().tolist()[0][0]

Epoch [10/200], Loss: 0.0736
Epoch [20/200], Loss: 0.0651
Epoch [30/200], Loss: 0.0577
Epoch [40/200], Loss: 0.0513
Epoch [50/200], Loss: 0.0458
Epoch [60/200], Loss: 0.0410
Epoch [70/200], Loss: 0.0369
Epoch [80/200], Loss: 0.0333
Epoch [90/200], Loss: 0.0301
Epoch [100/200], Loss: 0.0274
Epoch [110/200], Loss: 0.0250
Epoch [120/200], Loss: 0.0229
Epoch [130/200], Loss: 0.0210
Epoch [140/200], Loss: 0.0193
Epoch [150/200], Loss: 0.0175
Epoch [160/200], Loss: 0.0156
Epoch [170/200], Loss: 0.0137
Epoch [180/200], Loss: 0.0119
Epoch [190/200], Loss: 0.0103
Epoch [200/200], Loss: 0.0089


-0.6838477849960327

In [7]:
def integrate_final_embeddings():
    # Integrating Network Information (from Embeddings) and Attempting to Predict the Phenotype
    subgraph_adj = pd.read_csv('../GNNs/data/COPD/SparsifiedNetworks/trimmed_fev1_0.515_0.111_adj.csv', index_col=0).to_numpy()
    subgraph = nx.from_numpy_array(subgraph_adj)

    subgraph_nodes_dict = {}
    for subgraph_node in subgraph.nodes():
        subgraph_node_name = original_dataset.iloc[:0, subgraph_node].name
        subgraph_nodes_dict[subgraph_node] = subgraph_node_name

    subgraph_nodes_embeddings = pd.read_csv('HiddenLayerOutput_GCN_trimmed_fev1_0.515_0.111_adj.csv')
    embeddings_dim = subgraph_nodes_embeddings.shape[1]
    # subgraph_nodes_embeddings = subgraph_nodes_embeddings.T
    
    # Aggregating dimensions of the embedding space
    # subgraph_nodes_embeddings = subgraph_nodes_embeddings.mean(axis=0)
    # subgraph_nodes_embeddings = subgraph_nodes_embeddings.max(axis=0)
    # pca = PCA()
    # pipe = Pipeline([('scaler', StandardScaler()), ('pca', pca)])
    # Xt = pca.fit_transform(subgraph_nodes_embeddings)
    # explained_variance_ratio = pca.explained_variance_ratio_
    # subgraph_nodes_embeddings = Xt[:,0]
    # 
    # print("explained variance ratio %s" % explained_variance_ratio)
    # print(subgraph_nodes_embeddings[0])
    
    subgraph_nodes_embeddings = embedding_autoencoder(torch.tensor(subgraph_nodes_embeddings.values, dtype=torch.float32))
    subgraph_nodes_embeddings = subgraph_nodes_embeddings.detach().numpy().tolist()
    
    subgraph_nodes_dict_inv = dict((v, k) for k, v in subgraph_nodes_dict.items())
    original_dataset_with_embeddings_df = pd.DataFrame(index=range(0, len(original_dataset_sid)))
    columns_names = original_dataset_sid.columns

    for column_name in columns_names:
        # Get the Metabolite or Protein Embedding Created using Node2Vec
        subgraph_node_embedding = subgraph_nodes_embeddings[subgraph_nodes_dict_inv[column_name]][0]
        # Attempting to Exaggerate Differences in Nodes 14 and 25 (Hub Nodes)
        # if subgraph_nodes_dict_inv[column_name] in [14, 25]:
        #     subgraph_node_embedding = subgraph_node_embedding * 100
        # print(len(subgraph_node_embedding))
        column_value = original_dataset_sid[column_name]
        z = [[vy * subgraph_node_embedding] for _, vy in enumerate(column_value)]
        original_dataset_with_embeddings_df = pd.concat([original_dataset_with_embeddings_df, pd.DataFrame(z)], axis=1)

    # original_dataset_with_embeddings_df.columns = range(0, len(original_dataset_sid.columns)*embeddings_dim)
    original_dataset_with_embeddings_df.columns = original_dataset_sid.columns

    # i = 0
    # original_dataset_with_embeddings_df_sliced = []
    # while i < len(subgraph_node_embedding):
    #     tmp = original_dataset_with_embeddings_df[[x for x in range(i, len(original_dataset_with_embeddings_df.columns), len(subgraph_node_embedding))]]
    #     tmp.index = original_dataset_sid.index
    #     original_dataset_with_embeddings_df_sliced.append(tmp)
    #     i += 1

    # for dim_idx, original_dataset_with_embeddings in enumerate(original_dataset_with_embeddings_df_sliced):
    #     # print("****** Dim %d" % dim_idx)
    #     original_dataset_with_embeddings_new = pd.DataFrame(original_dataset_with_embeddings)
    #     if dim_idx == 10: break
    # return original_dataset_with_embeddings_df_sliced
    return original_dataset_with_embeddings_df

In [43]:
integrate_final_embeddings()

Unnamed: 0,(N(1) + N(8))-acetylspermidine,5-acetylamino-6-amino-3-methyluracil,5-hydroxyhexanoate,adrenate (22:4n6),C-glycosyltryptophan,phosphocholine,ergothioneine,myristoleoylcarnitine (C14:1)*,"N2,N2-dimethylguanosine",X - 12026,...,Complement component C9,Carbonic anhydrase 6,Kallistatin,Beta-2-microglobulin,C-reactive protein,Growth/differentiation factor 15,"Alpha-(1,3)-fucosyltransferase 5",Trefoil factor 3,Troponin T,N-terminal pro-BNP
0,0.020155,0.085807,-0.137053,0.060389,0.127048,-0.241752,0.358008,-0.128705,0.076749,0.010245,...,-0.038307,0.189493,0.034384,0.110211,-0.099851,0.001265,0.036117,0.006813,0.251452,0.160592
1,-0.120475,-0.021728,-0.204108,0.014414,0.025543,-0.052912,0.124383,-0.269076,-0.075729,-0.105374,...,-0.026887,0.063893,0.245779,0.012016,-0.039549,-0.010779,0.036510,-0.003859,-0.337219,-0.002560
2,0.090287,0.013788,-0.059763,0.018466,0.003776,-0.017904,-0.069934,-0.051481,-0.193806,-0.042747,...,0.050901,-0.097281,0.093158,-0.043423,-0.112693,0.013895,0.095186,-0.008398,-0.388783,-0.112248
3,-0.026253,0.038070,0.216268,0.016384,0.024989,0.021841,0.136472,-0.028681,-0.017122,-0.015498,...,0.048712,0.131387,0.028992,-0.087013,-0.209878,-0.108141,-0.190172,-0.018780,0.175538,-0.228993
4,0.038808,-0.044119,-0.123622,-0.030253,-0.018090,-0.023155,0.044109,-0.226136,-0.095078,-0.126534,...,-0.078458,-0.034569,0.054746,0.105940,0.152730,0.101574,-0.109237,0.018332,0.022042,0.265967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,-0.087872,0.001957,0.302890,-0.003579,-0.030582,0.018038,-0.275302,0.193258,0.000543,-0.050731,...,0.228754,0.040530,-0.230100,0.037203,0.589362,-0.047364,0.170722,-0.008223,-0.064778,0.097253
900,0.022156,-0.018443,-0.002874,-0.057762,-0.073527,-0.149700,-0.162992,-0.004981,-0.082313,-0.130706,...,-0.034999,0.052366,0.138812,-0.040160,-0.219306,-0.013538,-0.090685,-0.022949,-0.290100,0.172107
901,0.010479,-0.052656,-0.165958,-0.004135,-0.007765,-0.225824,0.113716,-0.121771,0.007362,-0.041303,...,0.020628,-0.052411,0.053268,-0.137826,-0.039688,-0.096756,-0.030811,-0.026811,-0.155213,-0.193125
902,0.231681,-0.008008,0.079493,-0.021840,-0.033465,-0.303659,-0.048747,0.145802,-0.066817,-0.070219,...,-0.034996,0.360527,-0.003323,-0.070229,-0.202823,-0.101221,0.053024,-0.026247,-0.258822,-0.120955


In [235]:
original_dataset_sid

Unnamed: 0_level_0,(N(1) + N(8))-acetylspermidine,5-acetylamino-6-amino-3-methyluracil,5-hydroxyhexanoate,adrenate (22:4n6),C-glycosyltryptophan,phosphocholine,ergothioneine,myristoleoylcarnitine (C14:1)*,"N2,N2-dimethylguanosine",X - 12026,...,Complement component C9,Carbonic anhydrase 6,Kallistatin,Beta-2-microglobulin,C-reactive protein,Growth/differentiation factor 15,"Alpha-(1,3)-fucosyltransferase 5",Trefoil factor 3,Troponin T,N-terminal pro-BNP
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10010J,-0.190253,2.198677,0.951882,1.728713,1.696511,1.810291,-1.921257,0.875180,0.892684,0.124223,...,0.322382,-1.118447,-0.234189,0.926362,0.557443,0.017068,-0.382598,0.234377,0.593199,1.282038
10031R,1.137203,-0.556750,1.417606,0.412620,0.341081,0.396214,-0.667501,1.829685,-0.880821,-1.277666,...,0.226270,-0.377116,-1.673974,0.100996,0.220795,-0.145500,-0.386756,-0.132752,-0.795531,-0.020434
10032T,-0.852246,0.353310,0.415074,0.528622,0.050416,0.134069,0.375303,0.350066,-2.254200,-0.518312,...,-0.428367,0.574186,-0.634489,-0.364988,0.629139,0.187559,-1.008337,-0.288907,-0.917175,-0.896100
10052Z,0.247807,0.975486,-1.502064,0.469000,0.333686,-0.163554,-0.732381,0.195025,-0.199153,-0.187908,...,-0.409941,-0.775491,-0.197459,-0.731378,1.171704,-1.459689,2.014551,-0.646074,0.414110,-1.828103
10055F,-0.366317,-1.130497,0.858598,-0.866032,-0.241559,0.173388,-0.236713,1.537696,-1.105873,-1.534229,...,0.660275,0.204040,-0.372867,0.890467,-0.852658,1.371039,1.157176,0.630683,0.051998,2.123272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25551W,0.829446,0.050146,-2.103680,-0.102467,-0.408373,-0.135074,1.477414,-1.314134,0.006312,-0.615117,...,-1.925122,-0.239220,1.567190,0.312708,-3.290274,-0.639323,-1.808501,-0.282904,-0.152816,0.776388
25563D,-0.209134,-0.472576,0.019962,-1.653494,-0.981828,1.120985,0.874699,0.033871,-0.957397,-1.584809,...,0.294543,-0.309081,-0.945435,-0.337557,1.224334,-0.182733,0.960652,-0.789511,-0.684374,1.373968
25564F,-0.098917,-1.349235,1.152642,-0.118369,-0.103682,1.691015,-0.610257,0.828027,0.085634,-0.500804,...,-0.173596,0.309346,-0.362805,-1.158474,0.221569,-1.306005,0.326384,-0.922360,-0.366161,-1.541756
25571C,-2.186909,-0.205182,-0.552108,-0.625187,-0.446872,2.273861,0.261602,-0.991433,-0.777158,-0.851403,...,0.294511,-2.127945,0.022630,-0.590303,1.132313,-1.366281,-0.561698,-0.902983,-0.610586,-0.965605


In [19]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, hidden_dim_name, layer_num=2, dropout=True, **kwargs):
        super(GCN, self).__init__()
        self.hidden_dim_name = hidden_dim_name
        self.layer_num = layer_num
        self.dropout = dropout

        self.conv_first = tg.nn.GCNConv(input_dim, hidden_dim)
        self.conv_hidden = nn.ModuleList([tg.nn.GCNConv(hidden_dim, hidden_dim) for i in range(layer_num - 2)])
        # self.conv_out = tg.nn.GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        import pandas as pd
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr

        # print("X Before the 1st Conv %s" % x)

        x = self.conv_first(x, edge_index, edge_weight)

        # print("X After the 1st Conv %s" % x)

        t_np = x.cpu().data.numpy()  # convert to Numpy array
        df = pd.DataFrame(t_np)  # convert to a dataframe
        df.to_csv("HiddenLayerOutput_GCN_%s" % self.hidden_dim_name, index=False)  # save to file

        # print("Data after the first convolution %s" % t_np)

        x = F.relu(x)
        # x = F.tanh(x)

        # print("X After RELU %s" % x)

        if self.dropout:
            x = F.dropout(x, training=self.training)

        # print("X After Dropout %s" % x)

        for i in range(self.layer_num-2):
            x = self.conv_hidden[i](x.clone(), edge_index, edge_weight)

            # print("X After %d Conv %s" % (i, x))

            t_np = x.cpu().data.numpy()  # convert to Numpy array
            df = pd.DataFrame(t_np)  # convert to a dataframe
            df.to_csv("HiddenLayerOutput_GCN_%s" % self.hidden_dim_name, index=False)  # save to file

            # print("Data after inner convolution %s" % t_np)
            x = F.relu(x)
            # x = F.tanh(x)
            if self.dropout:
                x = F.dropout(x, training=self.training)

        # x = self.conv_out(x, edge_index, edge_weight)
        # print("X After Conv Out %s" % x)

        # print("Data after the last convolution %s" % x.cpu().data.numpy())
        return x

In [9]:
# Define the Neural Network Model
class DownstreamTaskModelNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(DownstreamTaskModelNN, self).__init__()
        # Learnable Scaling Factors
        # self.scalars = nn.Parameter(torch.ones(input_size))
        self.fc1 = nn.Linear(input_size, hidden_size1) # TODO: What other Layer are there!
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.relu1 = nn.ReLU()
        # self.tanh = nn.Tanh()
        # self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        # self.bn2 = nn.BatchNorm1d(hidden_size2)
        # self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size1, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # x = x * self.scalars  # Apply the Learned Scalars to the Input 
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        # x = self.tanh(x)
        # x = self.fc2(x)
        # x = self.bn2(x)
        # x = self.relu2(x)
        x = self.fc3(x)
        x = self.softmax(x)
        # print("Scalars Values %s" % self.scalars.data.numpy())
        return x

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
torch.autograd.set_detect_anomaly(True)

writer = SummaryWriter()

# Initialize GNNs
input_dim = dataset.x.shape[1]
output_dim = 1
hidden_dim = 16
layer_num = 3
dropout = 0.5
lr = 0.002244
weight_decay = 0.034040

gnn_model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, layer_num=layer_num, dropout=dropout, hidden_dim_name=dataset_name)
gnn_optimizer = torch.optim.Adam(gnn_model.parameters(), lr=lr, weight_decay=weight_decay)
gnn_criterion = torch.nn.CrossEntropyLoss() # torch.nn.MSELoss()


X = original_dataset_sid
y = complete_original_dataset['finalgold_visit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # TODO: Maybe better if you update the integration function to take these as parameters and update them 

input_size = X_train.shape[1]
hidden_size1 = 64
hidden_size2 = 128
output_size = 3  # 3 Classes
# Initialize the Downstream Task Model
downstream_model = DownstreamTaskModelNN(input_size, hidden_size1, hidden_size2, output_size)
prediction_optimizer = optim.Adam(downstream_model.parameters(), lr=0.001631, weight_decay=0.030408)
prediction_criterion = nn.CrossEntropyLoss()

# wandb.init(project='debuggingnn')
# wandb.watch(gnn_model, gnn_criterion, log='all')
# wandb.watch(downstream_model, prediction_criterion, log='all')

num_epochs = 16
for epoch in range(num_epochs):
    gnn_optimizer.zero_grad()
    # Forward Pass through GNN Layers
    gnn_model.train()  # TODO: Do we Need this Training?
    out = gnn_model(data)# .flatten()
    # print("Shape of hidden space")
    # print(out.shape)

    # Integrate embeddings in the original dataset
    # TODO: Refactor this Code and Consider a Better Way to Pick a Dimension, should we do backpropagate here to get better prediction loss
    dim_losses = []
    X_sliced = integrate_final_embeddings()

    # for i in range(hidden_dim): # TODO: Consider processing dimensions and batches together; also run multiple threads for each dimension and do the optimization for
    #     print("***** Processing Dimension %d *****" % i)
    # all dimensions
    # original_dataset_tmp = pd.DataFrame(X_sliced[8])
    # Incorporate Embeddings into the Original Dataset
    original_dataset_tmp = X_sliced
    X_train, X_test, y_train, y_test = train_test_split(original_dataset_tmp, y, test_size=0.3, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch Tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled)
    y_train_tensor = torch.LongTensor(y_train.to_numpy())
    X_test_tensor = torch.FloatTensor(X_test_scaled)
    y_test_tensor = torch.LongTensor(y_test.to_numpy())

    # Create DataLoader for Training
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        prediction_optimizer.zero_grad()
        outputs = downstream_model(inputs)
        prediction_loss = prediction_criterion(outputs, labels)
        prediction_loss.backward(retain_graph=True)

        

        gnn_optimizer.zero_grad()
        out.grad = None # Reset gradients
        out.backward(torch.ones_like(out), retain_graph=True)
    # gnn_loss = gnn_criterion(outputs, labels)
    # print(
    #     f"Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx + 1}, GNN Loss: {gnn_loss.item():.4f}, Phenotype Loss: {prediction_loss.item():.4f}")

    # prediction_loss.backward(retain_graph=True)
    # gnn_loss.backward()

    # Gradient clipping
    # torch.nn.utils.clip_grad_norm_(gnn_model.parameters(), 0.01)
    # torch.nn.utils.clip_grad_norm_(downstream_model.parameters(), 0.01)


        prediction_optimizer.step()
        gnn_optimizer.step()
    
        print(f"Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx + 1}, Phenotype Loss: {prediction_loss.item():.4f}")
        # prediction_optimizer.step()
    
        print("Gradients for the GNN Model!!!! ")
        # Track gradients (gnn_model) and model parameters in TensorBoard
        for name, param in gnn_model.named_parameters():
            print(f'{name}.grad', param.grad, epoch)
            writer.add_histogram(name, param, epoch)
            if param.grad is not None:
                writer.add_histogram(f'{name}.grad', param.grad, epoch)
    
print("Training finished.")

RuntimeError: Mismatch in shape: grad_output[0] has a shape of torch.Size([27, 16]) and output[0] has a shape of torch.Size([]).

In [42]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from torchmetrics.classification import MulticlassAUROC


# Evaluate the Model on the Test Dataset
with torch.no_grad():
    gnn_model.eval()
    out = gnn_model(data)# .flatten()
    downstream_model.eval()
    
X_sliced = integrate_final_embeddings()

# for i in range(hidden_dim):
original_dataset_tmp = X_sliced
# Incorporate Embeddings into the Original Dataset
X_train, X_test, y_train, y_test = train_test_split(original_dataset_tmp, y, test_size=0.30, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch Tensors
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.LongTensor(y_train.to_numpy())
X_test_tensor = torch.FloatTensor(X_test_scaled)
y_test_tensor = torch.LongTensor(y_test.to_numpy())
    
with torch.no_grad():
    outputs = downstream_model(X_test_tensor)
    l, predicted = torch.max(outputs, 1)
    accuracy = torch.sum(predicted == y_test_tensor).item() / len(y_test)
    # auc_score = roc_auc_score(l, predicted, multi_class='ovr')
    confusion_mtrx = confusion_matrix(y_test_tensor, predicted)
    mc_auroc = MulticlassAUROC(num_classes=3, average='macro', thresholds=None)
    print(f'Test Accuracy: {accuracy}')
    print(f'Classification Report: {classification_report(y_test_tensor, predicted)}')
    print(f'ROC AUC Score: {mc_auroc(outputs, y_test_tensor)}')
    print(f'Confusion Matrix: {confusion_mtrx}')

Test Accuracy: 0.5367647058823529
Classification Report:               precision    recall  f1-score   support

           0       0.59      0.91      0.72       132
           1       0.37      0.29      0.33        90
           2       0.00      0.00      0.00        50

    accuracy                           0.54       272
   macro avg       0.32      0.40      0.35       272
weighted avg       0.41      0.54      0.46       272

ROC AUC Score: 0.7003393769264221
Confusion Matrix: [[120  12   0]
 [ 64  26   0]
 [ 18  32   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [297]:
out = gnn_model(data)
out

tensor([[ 2.5753e-01,  0.0000e+00,  0.0000e+00, -0.0000e+00, -0.0000e+00,
          1.5555e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -5.2061e-01, -2.4834e-01, -8.1043e-02,  0.0000e+00,
         -0.0000e+00],
        [ 3.2311e-01,  0.0000e+00,  1.7915e-01, -2.6798e-01, -0.0000e+00,
          1.9596e-01,  0.0000e+00, -0.0000e+00,  2.3600e-01,  0.0000e+00,
          0.0000e+00, -5.5491e-01, -0.0000e+00,  1.6399e-02,  0.0000e+00,
          0.0000e+00],
        [ 3.3281e-01,  0.0000e+00,  1.8026e-01, -0.0000e+00, -0.0000e+00,
          0.0000e+00,  5.2842e-02, -1.6581e-02,  2.4586e-01,  0.0000e+00,
          0.0000e+00, -4.6927e-01, -1.3565e-01, -6.8181e-02,  4.9792e-02,
         -0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  1.9285e-01, -0.0000e+00, -1.9400e-01,
          0.0000e+00,  0.0000e+00, -0.0000e+00,  2.6600e-01,  0.0000e+00,
          0.0000e+00, -6.0589e-01, -0.0000e+00,  0.0000e+00,  1.2007e-01,
          0.0000e+00],
        [ 3.7736e-01

In [10]:
# Testing GNNs Gradients Update
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
torch.autograd.set_detect_anomaly(True)

writer = SummaryWriter()

# Initialize GNNs
input_dim = dataset.x.shape[1]
output_dim = 1
hidden_dim = 16
layer_num = 2
dropout = 0.5
lr = 0.08658681001095011
weight_decay = 0.001358903786972319

gnn_model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, layer_num=layer_num, dropout=dropout, hidden_dim_name=dataset_name)
gnn_optimizer = torch.optim.Adam(gnn_model.parameters(), lr=lr, weight_decay=weight_decay)
gnn_criterion = torch.nn.MSELoss() # torch.nn.CrossEntropyLoss() # 



num_epochs = 4

for epoch in range(num_epochs):
    gnn_optimizer.zero_grad()
    print("Gradients for the GNN Model***!!!! ")
    # Track gradients (gnn_model) and model parameters in TensorBoard
    for name, param in gnn_model.named_parameters():
        print(f'{name}.grad', param.grad, epoch)
        writer.add_histogram(name, param, epoch)
        if param.grad is not None:
            writer.add_histogram(f'{name}.grad', param.grad, epoch)
    
    # Forward Pass through GNN Layers
    gnn_model.train() # TODO: Do we Need this Training?
    out = gnn_model(data).flatten()
    gnn_loss = gnn_criterion(out[data.train_mask], data.y[data.train_mask])
    # gnn_loss = torch.tensor(random.random(), requires_grad=True)
    # print("random loss %s" % gnn_loss)
    gnn_loss.backward()
    gnn_optimizer.step()
    print("Epoch %d Loss %s" % (epoch, gnn_loss))
       

    print("Gradients for the GNN Model!!!! ")
    # Track gradients (gnn_model) and model parameters in TensorBoard
    for name, param in gnn_model.named_parameters():
        print(f'{name}.grad', param.grad, epoch)
        writer.add_histogram(name, param, epoch)
        if param.grad is not None:
            writer.add_histogram(f'{name}.grad', param.grad, epoch)

    writer.add_scalar('Loss/train', gnn_loss.item(), epoch)

print("Training finished.")

Gradients for the GNN Model***!!!! 
conv_first.bias.grad None 0
conv_first.lin.weight.grad None 0
conv_out.bias.grad None 0
conv_out.lin.weight.grad None 0
Epoch 0 Loss tensor(0.0411, grad_fn=<MseLossBackward0>)
Gradients for the GNN Model!!!! 
conv_first.bias.grad tensor([ 0.0000, -0.1815,  0.0000,  0.2138,  0.0000, -0.0412, -0.0080, -0.0118,
         0.0000,  0.0919, -0.0499,  0.0471,  0.0000,  0.0205,  0.1852,  0.0035]) 0
conv_first.lin.weight.grad tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0255, -0.0535, -0.0121, -0.0261, -0.0335, -0.0372, -0.0382],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0281,  0.0609,  0.0135,  0.0298,  0.0385,  0.0415,  0.0426],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0040, -0.0045, -0.0021, -0.0056, -0.0061, -0.0054, -0.0031],
        [-0.0008, -0.0013, -0.0004, -0.0008, -0.0010, -0.0010, -0.0009],
        [-0.0010, -0.0014, -0.000

KeyboardInterrupt: 

In [ ]:
# Unused code: 

# writer.add_scalar('Loss/train', gnn_loss.item(), epoch)

# Track gradients (downstream_model) and model parameters in TensorBoard
# for name, param in downstream_model.named_parameters():
#     writer.add_histogram(name, param, epoch)
#     if param.grad is not None:
#         writer.add_histogram(f'{name}.grad', param.grad, epoch)
#
# writer.add_scalar('Loss/train', prediction_loss.item(), epoch)
#
# # print("Loss Value for Dim %d: %f" % (i, prediction_loss))
# dim_losses.append(prediction_loss)
# all_losses.append(prediction_loss)
#
# print("Best Dimension is %s and Loss %s" % (dim_losses.index(min(dim_losses)), min(dim_losses)))
# with torch.no_grad():
#     downstream_model.eval()
#     outputs = downstream_model(X_test_tensor)
#     _, predicted = torch.max(outputs, 1)
#     accuracy = torch.sum(predicted == y_test_tensor).item() / len(y_test)
#     print(f'Test Accuracy: {accuracy}')
#     accuracies.append(accuracy)
#
# X = X_sliced[dim_losses.index(min(dim_losses))]
# # X.reset_index(drop=True, inplace=True)
# # y.reset_index(drop=True, inplace=True)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
#
# # Convert to PyTorch Tensors
# X_train_tensor = torch.FloatTensor(X_train_scaled)
# y_train_tensor = torch.LongTensor(y_train.to_numpy())
# X_test_tensor = torch.FloatTensor(X_test_scaled)
# y_test_tensor = torch.LongTensor(y_test.to_numpy())
#
# # Create DataLoader for Training
# train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
#
# for batch_idx, (inputs, labels) in enumerate(train_loader):
#     outputs = downstream_model(inputs)
#     prediction_loss = prediction_criterion(outputs, labels)
#     # print("Outputs %s\nLabels: %s" % (outputs, labels))
#     gnn_loss = gnn_criterion(outputs, labels)
#     print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}, GNN Loss: {gnn_loss.item():.4f}, Phenotype Loss: {prediction_loss.item():.4f}")
#
#
#
#     prediction_loss.backward(retain_graph=True)
#     gnn_loss.backward(retain_graph=True)
#
#     # Gradient clipping
#     torch.nn.utils.clip_grad_norm_(gnn_model.parameters(), 0.01)
#     torch.nn.utils.clip_grad_norm_(downstream_model.parameters(), 0.01)
#
#     gnn_optimizer.step()
#     prediction_optimizer.step()

In [18]:
# Attempting to run with KNN
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
torch.autograd.set_detect_anomaly(True)


# Initialize GNNs
input_dim = dataset.x.shape[1]
output_dim = 1
hidden_dim = 16
layer_num = 2
dropout = 0.5
lr = 0.008658681001095011
weight_decay = 0.001358903786972319

gnn_model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, layer_num=layer_num, dropout=dropout, hidden_dim_name=dataset_name)
# for Debugging; Try to Follow Each Loss Independently
gnn_optimizer = torch.optim.Adam(gnn_model.parameters(), lr=lr, weight_decay=weight_decay)
gnn_criterion = torch.nn.MSELoss()


# wandb.init(project='debuggingnn')
# wandb.watch(downstream_model, criterion, log='all')

num_epochs = 10
all_losses= []
accuracies = []
best_scores = []
best_params = []

for epoch in range(num_epochs):
    gnn_optimizer.zero_grad()

    # Forward Pass through GNN Layers
    gnn_model.train() # TODO: Do we Need this Training?
    out = gnn_model(data).flatten()
    gnn_loss = gnn_criterion(out[data.train_mask], data.y[data.train_mask])
    print(f"Epoch {epoch+1}/{num_epochs}, GNN Loss: {gnn_loss.item():.4f},")

    # Integrate embeddings in the original dataset
    # TODO: Refactor this Code and Consider a Better Way to Pick a Dimension, should we do backpropagate here to get better prediction loss
    dim_losses = []
    X_sliced = integrate_final_embeddings()


    for dim_idx, original_dataset_with_embeddings in enumerate(X_sliced):
        # print("****** Dim %d" % dim_idx)
        original_dataset_with_embeddings_new = pd.DataFrame(original_dataset_with_embeddings)
    
        # original_dataset_with_embeddings_new.columns = original_dataset.columns
        # pca = PCA()
        # pipe = Pipeline([('scaler', StandardScaler()), ('pca', pca)])
        # Xt = pca.fit_transform(original_dataset_with_embeddings_new)
        # PC1 = Xt[:,0]
        # 
        # explained_variance_ratio = pca.explained_variance_ratio_
        # PC1_df = pd.DataFrame(PC1)
        # 
        # PC1_correlation_with_phenotype = PC1_df.corrwith(dataset_associated_phenotype['FEV1pp_utah']).tolist()[0]
        # print("PC1 Correlation with the Phenotype: %s" % PC1_correlation_with_phenotype)
        # print("Percentage of Variance Explained by PC1: %s" % explained_variance_ratio[0])
    
        scaler = MinMaxScaler()
        original_dataset_with_embeddings_new_scaled = scaler.fit_transform(original_dataset_with_embeddings_new)
    
        # Prediction using KNN
        X_train, X_test, y_train, y_test = train_test_split(original_dataset_with_embeddings_new_scaled, complete_original_dataset['finalgold_visit'], test_size=0.3, random_state=0)
    
        grid_params = {'n_neighbors': [5, 7, 9, 11, 13, 15, 19, 21],
                       'weights': ['uniform', 'distance'],
                       'metric': ['minkowski', 'euclidean', 'manhattan'],
                       'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
                       }
        gs = GridSearchCV(KNeighborsClassifier(), grid_params, cv=3, n_jobs = -1)
        g_res = gs.fit(X_train, y_train)
        print("Best Score %s" % g_res.best_score_)
        print("Best Params %s" % g_res.best_params_)
        best_scores.append(g_res.best_score_)
        best_params.append(g_res.best_params_)

        # Create a KNN classifier Best Params {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}
        knn = KNeighborsClassifier(n_neighbors=g_res.best_params_['n_neighbors'], algorithm='auto', metric=g_res.best_params_['metric'], weights=g_res.best_params_['weights'])
        # Train the classifier
        knn.fit(X_train, y_train)
        # Make predictions on the test set
        y_pred = knn.predict(X_test)
        # Evaluate the accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)
        print("Accuracy:", accuracy)
        accuracies.append(accuracy)
        
    

    print("Best Accuracy %s" % max(accuracies))
    X = X_sliced[accuracies.index(max(accuracies))]
    X_train, X_test, y_train, y_test = train_test_split(X, complete_original_dataset['finalgold_visit'], test_size=0.3, random_state=0)
   

    # Create a KNN classifier Best Params {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}
    knn = KNeighborsClassifier(n_neighbors=g_res.best_params_['n_neighbors'], algorithm='auto', metric=g_res.best_params_['metric'], weights=g_res.best_params_['weights'])
    # Train the classifier
    knn.fit(X_train, y_train)
    # Make predictions on the test set
    y_pred = knn.predict(X_test)
    # Evaluate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)    


    total_loss = gnn_loss + 0.5 * (1 - accuracy)
    print(f"Epoch {epoch+1}/{num_epochs}, Total Loss: {total_loss:.4f},")
    total_loss.backward()
    gnn_optimizer.step()
      

print("Training finished.")



Epoch 1/10, GNN Loss: 0.0296,


KeyboardInterrupt: 

In [69]:
max(accuracies)

0.4889705882352941