In [None]:
# Transform gene IDs into gene symbols

import pandas as pd
import numpy as np
import time


def read_protein_info(file_path="./data/Sour_Data/9606.protein.info.v11.5.txt"):
    """Load the gene-ID -> gene-symbol table from a tab-separated text file.

    The first line of the file is treated as a header and discarded. For every
    following non-blank line, the first tab-separated field is taken as the
    gene ID and the second as the gene symbol; any further fields are ignored.

    Parameters
    ----------
    file_path : str, optional
        Path of the tab-separated file. Defaults to the location that was
        previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Gene_ID`` and ``Gene_Symbol``, one row per data line.

    Raises
    ------
    IndexError
        If a non-blank data line contains no tab (fewer than two fields).
    """
    gene_id_array = []
    gene_symbol_array = []
    with open(file_path, "r") as f:
        # Drop the header up front; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            if not line.strip():
                continue
            # Strip the line terminator first so that, in a two-column file,
            # the symbol does not end with "\n".
            fields = line.rstrip("\r\n").split("\t")
            gene_id_array.append(fields[0])
            gene_symbol_array.append(fields[1])

    return pd.DataFrame({'Gene_ID': gene_id_array, 'Gene_Symbol': gene_symbol_array})


def ppi_geneid_to_symbol(data,
                         ppi_path="./data/Sour_Data/9606.protein.physical.links.v11.5.txt.gz",
                         out_path="./data/Hand_Data/ppi_network.csv"):
    """Rewrite a gene-ID interaction table as a gene-symbol interaction table.

    Reads the space-separated interaction file at ``ppi_path`` (columns are
    used by position: first gene ID, second gene ID, score), replaces both ID
    columns by the matching symbols from ``data`` and writes the result as a
    tab-separated file with the columns ``Gene1``, ``Gene2`` and ``Score``.

    Parameters
    ----------
    data : pandas.DataFrame
        Lookup table; column 0 holds gene IDs and column 1 the gene symbols.
        If an ID occurs more than once, its last row is the one used.
    ppi_path : str, optional
        Interaction file to convert. Defaults to the previously hard-coded path.
    out_path : str, optional
        Destination of the converted table. Defaults to the previously
        hard-coded path.

    Returns
    -------
    pandas.DataFrame
        The converted table that was written to ``out_path``.

    Raises
    ------
    KeyError
        If the interaction file references a gene ID that is absent from
        ``data``. Nothing is written in that case.
    """
    # Later rows overwrite earlier ones, so duplicated IDs resolve to the last row.
    id_to_symbol = dict(zip(data.iloc[:, 0], data.iloc[:, 1]))

    df_ppi = pd.read_csv(ppi_path, sep=" ")
    gene1_ids = df_ppi.iloc[:, 0]
    gene2_ids = df_ppi.iloc[:, 1]

    # Series.map would silently yield NaN for unknown IDs; fail loudly instead.
    missing = set(gene1_ids).union(gene2_ids).difference(id_to_symbol)
    if missing:
        raise KeyError(
            "{} gene ID(s) have no symbol in the lookup table, e.g. {}".format(
                len(missing), sorted(missing)[:5]))

    # Column-wise mapping replaces per-row scalar .iloc lookups, which are
    # very slow on a table with many rows.
    df = pd.DataFrame({
        "Gene1": gene1_ids.map(id_to_symbol),
        "Gene2": gene2_ids.map(id_to_symbol),
        "Score": df_ppi.iloc[:, 2],
    })

    print("Writing file...")
    df.to_csv(out_path, sep="\t", index=False)
    print("Completed!")
    return df


if __name__ == "__main__":
    # The interaction table is loaded inside ppi_geneid_to_symbol itself, so
    # it is not read here as well: the former extra read was never used and
    # pointed at a different directory than the one the function reads from.
    df_protein_info = read_protein_info()
    ppi_geneid_to_symbol(df_protein_info)

In [9]:
# Personalized PageRank on the protein-protein interaction network (PPIN)
import pandas as pd
import os, time
import networkx as nx
from collections import defaultdict
from networkx.algorithms.link_analysis import pagerank

def network_propagate(G, biomarker_genes=None,
                      out_path="./data/Result/propagation_score.txt"):
    """Propagate seed-gene signal over ``G`` with personalized PageRank.

    Every node of ``G`` that is listed in ``biomarker_genes`` gets a
    personalization weight of 1, every other node a weight of 0. The resulting
    PageRank scores are written to ``out_path`` as a tab-separated table with
    the columns ``Gene_Symbol`` and ``Propagate_Score``.

    Parameters
    ----------
    G : networkx graph
        Network whose nodes are gene symbols.
    biomarker_genes : iterable of str, optional
        Seed genes. Defaults to CD274, PDCD1, CTLA4, CD8A and CD8B.
    out_path : str, optional
        Destination file. Defaults to the previously hard-coded path.

    Returns
    -------
    pandas.DataFrame
        The score table that was written to ``out_path``.

    Notes
    -----
    NOTE(review): if none of the seed genes is a node of ``G`` every weight is
    0; the networkx documentation requires at least one non-zero
    personalization value -- confirm the seeds are present in the graph.
    """
    if biomarker_genes is None:
        biomarker_genes = ['CD274', 'PDCD1', 'CTLA4', 'CD8A', 'CD8B']
    seed_genes = set(biomarker_genes)

    propagate_input = {node: 1 if node in seed_genes else 0 for node in G.nodes()}
    propagate_scores = pagerank(G, personalization=propagate_input, max_iter=100, tol=1e-06) ## NETWORK PROPAGATION

    # The dict keys must match the column list exactly: a mismatched key is
    # dropped and the requested column comes out entirely NaN.
    out_put_colname = ['Gene_Symbol', 'Propagate_Score']
    out_df = pd.DataFrame(
        data={
            'Gene_Symbol': list(propagate_scores.keys()),
            'Propagate_Score': list(propagate_scores.values()),
        },
        columns=out_put_colname,
    )
    out_df.to_csv(out_path, sep='\t', index=False)
    return out_df
        


def construct_grap(file_path, score_threshold=400):
    """Build the interaction graph and return its largest connected component.

    Parameters
    ----------
    file_path : str
        Tab-separated edge list with a header row. Columns are used by
        position: first node, second node, score.
    score_threshold : int or float, optional
        An edge is kept only when its score is strictly greater than this
        value. Defaults to 400, the cut-off that was previously hard-coded.

    Returns
    -------
    networkx graph
        Subgraph of the filtered network restricted to the nodes of its
        largest connected component.

    Raises
    ------
    ValueError
        If no edge has a score above ``score_threshold``, so that there is no
        component to return.
    """
    print("Constructing Grap...")
    df_ppinet = pd.read_csv(file_path, sep="\t")

    # Filter with a boolean mask instead of testing every row in a Python loop.
    strong_edges = df_ppinet[df_ppinet.iloc[:, 2] > score_threshold]
    tmp_G = nx.Graph()
    tmp_G.add_edges_from(zip(strong_edges.iloc[:, 0], strong_edges.iloc[:, 1]))

    # default=None lets an empty graph be reported with a clear message rather
    # than max()'s generic "empty sequence" error.
    LCC_genes = max(nx.connected_components(tmp_G), key=len, default=None)
    if LCC_genes is None:
        raise ValueError(
            "No edge in {!r} has a score above {}; cannot build a network".format(
                file_path, score_threshold))
    G = tmp_G.subgraph(LCC_genes) ## Largest Connected Componenets
    print("Constrcution completed!")
    print("Network nodes: {}".format(len(G.nodes())))
    print("Network edges: {}".format(len(G.edges())))
    return G


if __name__ == "__main__":
    # Build the graph from the gene-symbol edge list, then run the propagation
    # on it; network_propagate writes its scores to disk.
    G = construct_grap("./data/Hand_Data/ppi_network.csv")
    network_propagate(G)
    print("Network propagation completed")

Constructing Grap...
Constrcution completed!
Network nodes: 14572
Network edges: 208111
Network propagation completed
