In [2]:
import os
import pandas as pd
import torch
from torch_geometric.data import Data
from torch_geometric.nn import Node2Vec
import sys

# Folder that holds the per-species PPI network files
directory_path = 'D:\\year 4\\semester 1\\BT\\BT 4033\\ppi_part\\sp_vise_ppi_networks\\'

# Collect the names of the regular files in that folder, in the order the
# OS lists them; sub-directories are left out.
file_list = []
for entry_name in os.listdir(directory_path):
    if os.path.isfile(os.path.join(directory_path, entry_name)):
        file_list.append(entry_name)

In [None]:
# Train a Node2Vec model on every PPI network file and append the resulting
# per-protein embeddings to 'node2vec_embeddings.tsv'.
#
# Each input file is read as tab-separated text and must provide the columns
# 'protein1' and 'protein2'; every row is treated as one edge.

# Settings that do not change from file to file are computed once.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedding_dim = 40  # length of each node embedding vector
num_workers = 4 if sys.platform == 'linux' else 0

for file in file_list:
    # os.path.join works whether or not directory_path ends with a separator
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path, sep="\t")

    # A file without interactions would produce an empty loader and make the
    # mean-loss computation in train() divide by zero, so skip it.
    if df.empty:
        print(f"Skipping {file}: no interactions found")
        continue

    # Give every distinct protein an integer index, in order of first appearance
    col1 = df['protein1'].to_list()
    col2 = df['protein2'].to_list()
    full_list = col1 + col2
    unique_list = list(dict.fromkeys(full_list))    # unique proteins in the PPI file

    ind2node = dict(enumerate(unique_list))                             # index -> protein
    node2ind = {protein: index for index, protein in ind2node.items()}  # protein -> index

    df['protein1'] = df['protein1'].map(node2ind)
    df['protein2'] = df['protein2'].map(node2ind)

    # Source and target node indices of every interaction (COO format)
    first_prot = df['protein1'].to_list()
    second_prot = df['protein2'].to_list()

    # Print a short summary instead of the full index lists, which would
    # flood the notebook output for realistically sized networks.
    print(f"{file}: {len(unique_list)} proteins, {len(first_prot)} interactions")

    # edge_index is a 2xN tensor in which each column is one edge (interaction)
    edge_index = torch.tensor([first_prot, second_prot], dtype=torch.long)

    # Wrap the edges in a PyTorch Geometric Data object. Every node occurs in at
    # least one edge, so the node count can be inferred from edge_index.
    data = Data(edge_index=edge_index)

    # Initialize the Node2Vec model
    node2vec = Node2Vec(
        edge_index=data.edge_index.to(device),
        embedding_dim=embedding_dim,
        walk_length=10,     # random walk length
        context_size=10,    # context (window) size used for positive samples
        walks_per_node=10,
        p=1,    # return parameter: likelihood of immediately revisiting a node
        q=2,    # in-out parameter: interpolates between BFS-like and DFS-like walks
        sparse=True).to(device)

    loader = node2vec.loader(batch_size=64, shuffle=True, num_workers=num_workers)
    # sparse=True produces sparse gradients, which SparseAdam is built to handle
    optimizer = torch.optim.SparseAdam(list(node2vec.parameters()), lr=0.01)

    def train():
        """Run one epoch over the random-walk loader and return the mean batch loss."""
        node2vec.train()
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = node2vec.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss / len(loader)

    # Model training. range(1, 100) runs epochs 1..99, i.e. 99 epochs.
    # NOTE(review): if 100 epochs were intended, change the bound to 101.
    for epoch in range(1, 100):
        loss = train()
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

    # Calling the model without arguments returns the full embedding matrix;
    # row i belongs to the protein with index i.
    node_embeddings = node2vec().detach().cpu().numpy()

    # Pair each protein with its embedding row. unique_list is ordered by
    # index, so it lines up with the rows of node_embeddings.
    node_embedding_df = pd.DataFrame({
        'protein': unique_list,
        'embedding': list(node_embeddings),
    })

    # NOTE(review): mode='a' appends, so re-running this cell adds duplicate rows
    # to an existing file. Each embedding is also written as the string form of a
    # numpy array; confirm that downstream readers expect that format.
    node_embedding_df.to_csv('node2vec_embeddings.tsv', sep='\t', index=False, header=False, mode='a')