In [1]:
# Importing Packages and Libraries
import csv
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from progressbar import ProgressBar

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

In [None]:
def evaluate_node_embeddings(subgraph, params):
    """
    Train Node2Vec on ``subgraph`` and score the resulting node embeddings.

    1. Runs the Node2Vec algorithm to generate node embeddings.
    2. Expands the original dataset: every feature column is multiplied by the
       embedding vector of its node, giving ``dimensions`` columns per feature.
    3. Runs PCA on the expanded dataset and reports the correlation of PC1 with
       the phenotype and the share of variance explained by PC1 / the first 10 PCs.

    Besides its arguments, the function reads three notebook-level globals:
    ``subgraph_nodes_dict`` (node -> feature name), ``original_dataset`` and
    ``dataset_associated_phenotype`` (must contain a ``'FEV1pp_utah'`` column).
    It also (over)writes the file ``SubgraphNodesEmbeddings8.embd``.

    :param subgraph: graph handed to Node2Vec; edge weights are read from the
        ``'weight'`` attribute.
    :param params: mapping with the keys ``'dimensions'``, ``'walk_length'``,
        ``'num_walks'``, ``'p'`` and ``'q'``. Any other keys are ignored.
    :return: ``[params, PC1 correlation with the phenotype, variance ratio of PC1,
        variance ratio of the first 10 PCs]``, or ``None`` when fitting or
        evaluation raises (the exception is printed). Errors raised while
        constructing ``Node2Vec`` itself are not caught.
    """
    node2vec = Node2Vec(
        subgraph,
        dimensions=params['dimensions'],
        walk_length=params['walk_length'],
        num_walks=params['num_walks'],
        weight_key='weight',
        p=params['p'],
        q=params['q'],
        quiet=True,
    )
    try:
        model = node2vec.fit()
        model.wv.save_word2vec_format('SubgraphNodesEmbeddings8.embd')
        # skiprows=1 skips the first line of the saved file. Each remaining row is
        # presumably "<node key> v1 ... vD"; with only D names supplied, pandas
        # uses the extra leading field as the index.
        subgraph_nodes_embeddings = pd.read_csv(
            'SubgraphNodesEmbeddings8.embd',
            sep=r'\s+',
            names=range(params['dimensions']),
            skiprows=1,
        )
        # After transposing, each column is one node's embedding vector.
        subgraph_nodes_embeddings = subgraph_nodes_embeddings.T

        # Feature name -> node key, so embeddings can be looked up per dataset column.
        subgraph_nodes_dict_inv = {name: node for node, name in subgraph_nodes_dict.items()}

        embedded_blocks = []
        for column_name in original_dataset.columns:
            # Get the Metabolite or Protein Embedding Created using Node2Vec
            node_embedding = subgraph_nodes_embeddings[subgraph_nodes_dict_inv[column_name]].to_numpy()
            column_values = original_dataset[column_name].to_numpy()
            # Outer product: one row per sample, one column per embedding dimension.
            embedded_blocks.append(pd.DataFrame(column_values[:, None] * node_embedding[None, :]))

        # Concatenate once instead of growing the frame inside the loop.
        original_dataset_with_embeddings_df = pd.concat(embedded_blocks, axis=1)
        original_dataset_with_embeddings_df.columns = range(0, len(original_dataset.columns) * params['dimensions'])

        # PCA is fitted on the unscaled features (PCA centres them itself).
        # NOTE(review): standardising each column first would cancel the embedding
        # magnitudes, because every column is a scalar multiple of one original
        # feature - confirm before introducing a scaler here.
        pca = PCA()
        Xt = pca.fit_transform(original_dataset_with_embeddings_df)
        PC1 = Xt[:, 0]

        explained_variance_ratio = pca.explained_variance_ratio_
        explained_by_first_10 = sum(explained_variance_ratio[:10])
        print("Explained Variance Ratio by PC1 %s" % explained_variance_ratio[0])
        print("Explained Variance Ratio by the First 10 PCs %s" % explained_by_first_10)

        PC1_df = pd.DataFrame(PC1)
        PC1_correlation_with_phenotype = PC1_df.corrwith(dataset_associated_phenotype['FEV1pp_utah']).tolist()[0]
        print("Correlation with the Phenotype: %s" % PC1_correlation_with_phenotype)
        return [params, PC1_correlation_with_phenotype, explained_variance_ratio[0], explained_by_first_10]
    except Exception as exc:
        # Best effort: a failing parameter combination is reported and skipped
        # (callers filter out the None results).
        print("Exception %s" % exc)
        return None

In [None]:
# Node2Vec Hyperparameters Tuning

# Only the hyperparameters that evaluate_node_embeddings actually reads are searched.
# Extra keys would multiply the number of runs without changing any of them, and
# would break the positional mapping onto the result columns below.
param_grid = {
    'dimensions': [2**p for p in range(1, 10)],
    'walk_length': [2**p for p in range(1, 10)],
    'num_walks': [2**p for p in range(1, 10)],
    'p': [0.0001, 0.005, 0.01, 0.4, 2, 10, 100],
    'q': [0.0001, 0.005, 0.01, 0.4, 2, 10, 100],
}
grid = ParameterGrid(param_grid)

# Subgraph Adjacency Matrix
subgraph_adj = pd.read_csv('../Data/trimmed_FEV1_0.55_Adjacency.csv', index_col=0).to_numpy()
# from_numpy_matrix was removed in networkx 3.0; from_numpy_array is its replacement.
subgraph = nx.from_numpy_array(subgraph_adj)

# Reading the Input Dataset
original_dataset = pd.read_csv('../Data/FEV1_X.csv', index_col=0).reset_index(drop=True)
# Reading the Associated Phenotype
dataset_associated_phenotype = pd.read_csv('../Data/FEV1_Y.csv', index_col=0).reset_index(drop=True)

# Extracting the Network Nodes Names
# Node i of the subgraph is named after the i-th column of the dataset.
subgraph_nodes_dict = {
    subgraph_node: original_dataset.columns[subgraph_node]
    for subgraph_node in subgraph.nodes()
}
subgraph_nodes_names = list(subgraph_nodes_dict.values())

# Evaluate every grid point. The loop variable `params` stays defined afterwards
# and is read by the embedding cell further down.
evaluation_results = []
pbar = ProgressBar()
for params in pbar(grid):
    evaluation_results.append(evaluate_node_embeddings(subgraph, params))

# Failed runs come back as None and are dropped.
evaluation_results = [result for result in evaluation_results if result is not None]

# Raw checkpoint of the results (parameter dict + three metrics per row).
with open('TmpFile1.csv', 'w', newline='') as evaluation_results_file:
    csv.writer(evaluation_results_file).writerows(evaluation_results)

# Build the results table by parameter name, not by dict order, so the
# columns cannot drift out of sync with the grid definition.
param_keys = ['dimensions', 'num_walks', 'p', 'q', 'walk_length']
result_columns = [
    'dimensions', 'num_walk', 'p', 'q', 'walk_length',
    'PC1_correlation_with_phenotype', 'explained_variance_ratio_PC1', 'explained_variance_ratio_10_PCs',
]
evaluation_results_list = [
    [run_params[key] for key in param_keys] + list(metrics)
    for run_params, *metrics in evaluation_results
]
evaluation_results_df = pd.DataFrame(evaluation_results_list, columns=result_columns)

# Only the magnitude of the correlation matters, hence abs().
evaluation_results_df = evaluation_results_df.astype(float).abs()
evaluation_results_df.to_csv('TwelvethExp.csv')

In [None]:
# Running Node2Vec to Obtain Node Embedding
# NOTE(review): `params` is not defined in this cell. It is the variable left behind
# by the tuning loop (`for params in pbar(grid)`), i.e. the last grid point that was
# evaluated - confirm that this is the configuration intended here.
node2vec = Node2Vec(
    subgraph,
    dimensions=params['dimensions'],
    walk_length=params['walk_length'],
    num_walks=params['num_walks'],
    weight_key='weight',
    p=params['p'],
    q=params['q'],
    quiet=True,
)

# Reset first so a failed run cannot display embeddings from an earlier execution
# (or raise NameError on a fresh kernel).
subgraph_nodes_embeddings = None
try:
    model = node2vec.fit()
    model.wv.save_word2vec_format('SubgraphNodesEmbeddings8.embd')
    # skiprows=1 skips the first line of the saved file; sep=r'\s+' replaces the
    # deprecated delim_whitespace=True.
    subgraph_nodes_embeddings = pd.read_csv(
        'SubgraphNodesEmbeddings8.embd',
        sep=r'\s+',
        names=range(params['dimensions']),
        skiprows=1,
    )
except Exception as E:
    print("Exception %s" % E)
subgraph_nodes_embeddings