In [2]:
import json 
import os 

# Names of the per-protein interaction files inside PPI_indiv.
# Only *.json entries are kept so that stray directory contents (hidden files,
# checkpoint folders) are never opened and parsed as JSON further down, and the
# names are sorted because os.listdir returns them in arbitrary order.
files = sorted(name for name in os.listdir('PPI_indiv') if name.endswith('.json'))

In [3]:
# Peek at a single per-protein file to see how interaction records are laid out.
with open("PPI_indiv/P05067.json") as sample_file:
    file_content = json.load(sample_file)

file_content

{'P05067': [{'entry1': 'P05067',
   'entry2_id': 'Q9NY61',
   'entry2_name': 'AATF',
   'interaction_type': 'binary',
   'nb_exp': 3},
  {'entry1': 'P05067',
   'entry2_id': 'P16112',
   'entry2_name': 'ACAN',
   'interaction_type': 'binary',
   'nb_exp': 3},
  {'entry1': 'P05067',
   'entry2_id': 'P60709',
   'entry2_name': 'ACTB',
   'interaction_type': 'binary',
   'nb_exp': 8},
  {'entry1': 'P05067',
   'entry2_id': 'P61158',
   'entry2_name': 'ACTR3',
   'interaction_type': 'binary',
   'nb_exp': 3},
  {'entry1': 'P05067',
   'entry2_id': 'O14672',
   'entry2_name': 'ADAM10',
   'interaction_type': 'binary',
   'nb_exp': 7},
  {'entry1': 'P05067',
   'entry2_id': 'A0AVL1',
   'entry2_name': 'ADAM9',
   'interaction_type': 'binary',
   'nb_exp': 3},
  {'entry1': 'P05067',
   'entry2_id': 'P18509',
   'entry2_name': 'ADCYAP1',
   'interaction_type': 'binary',
   'nb_exp': 3},
  {'entry1': 'P05067',
   'entry2_id': 'P41586-2',
   'entry2_name': 'ADCYAP1R1',
   'interaction_type': 'bi

In [4]:
# The keys of protein_props.json are used as the reference list of protein
# accessions for everything that follows.
with open("../PCFs/files_for_ml/protein_props.json") as props_file:
    human_proteins = list(json.load(props_file).keys())

print("Total Human proteins from Uniprot:", len(human_proteins))

Total Human proteins from Uniprot: 20434


In [5]:
## Directed PPI graph
# ppi_network maps a source accession to a list of (target accession, type code)
# tuples. The type code is 0 for "binary" and 1 for "xeno"; any other
# interaction_type value is stored unchanged.

# Set copy of the accession list: the membership test below runs once per
# interaction record, and scanning the full list each time is needlessly slow.
human_protein_ids = set(human_proteins)

count = 0  # number of files processed
ppi_network = {}
for file in files:
    with open(f"PPI_indiv/{file}") as f:
        file_content = json.load(f)
    # Only the records stored under the first key of each file are used.
    key = list(file_content.keys())[0]
    entries = file_content[key]
    count += 1
    for entry in entries:
        # Keep only the part before the first "-" (e.g. "P41586-2" -> "P41586");
        # presumably this strips an isoform suffix — TODO confirm.
        entry1 = entry["entry1"].split("-")[0]
        entry2 = entry["entry2_id"].split("-")[0]
        # Edges pointing at accessions outside the reference list are dropped.
        if entry2 not in human_protein_ids:
            continue
        type_interaction = entry["interaction_type"]
        if type_interaction == "binary":
            type_interaction = 0
        elif type_interaction == "xeno":
            type_interaction = 1
        ppi_network.setdefault(entry1, []).append((entry2, type_interaction))

In [6]:
# Number of interaction files read while building ppi_network.
count

20434

In [7]:
# Spot-check: the (target accession, type code) list stored for one protein.
ppi_network["Q9NSA3"]

[('P35222', 0),
 ('Q9BRT9', 0),
 ('Q0VD86', 0),
 ('P14923', 0),
 ('Q8IUG1', 0),
 ('P60409', 0),
 ('P26371', 0),
 ('Q4VC12', 0),
 ('P0DPK4', 0),
 ('Q7Z4N8', 0),
 ('Q04864', 0),
 ('Q86W54', 0),
 ('Q8N6Y0', 0)]

In [8]:
# Number of source proteins that kept at least one interaction after filtering.
len(ppi_network)

12213

#### Properties

In [9]:
# Out-degree of every protein in human_proteins, split by interaction type.
# degree_binary counts edges with type code 0; degree_xeno is everything else
# (total minus binary); degree_all is the total number of stored edges.
# Proteins without an entry in ppi_network get 0 in all three dictionaries.
degree_binary = {}
degree_xeno = {}
degree_all = {}

for protein in human_proteins:
    edges = ppi_network.get(protein, [])
    n_binary = sum(1 for (_, kind) in edges if kind == 0)
    degree_binary[protein] = n_binary
    degree_xeno[protein] = len(edges) - n_binary
    degree_all[protein] = len(edges)


In [11]:
# Average degree of neighbors, per protein and per interaction type.
# - binary: mean degree_binary over neighbours reached through type-0 edges
# - xeno:   mean degree_xeno over neighbours reached through type-1 edges
# - all:    (binary sum + xeno sum) / degree_all of the protein
# NOTE(review): the "all" variant adds up type-specific neighbour degrees rather
# than degree_all of each neighbour — confirm this is the intended definition.
# Edges whose type code is neither 0 nor 1 contribute to neither sum.
avg_degree_nbr_binary = {}
avg_degree_nbr_xeno = {}
avg_degree_nbr_all = {}

for protein in human_proteins:
    if protein not in ppi_network:
        # No stored edges: every average defaults to 0.
        avg_degree_nbr_all[protein] = 0
        avg_degree_nbr_xeno[protein] = 0
        avg_degree_nbr_binary[protein] = 0
        continue

    edges = ppi_network[protein]
    binary_total = sum(degree_binary[nbr] for (nbr, kind) in edges if kind == 0)
    xeno_total = sum(degree_xeno[nbr] for (nbr, kind) in edges if kind == 1)

    n_binary = degree_binary[protein]
    n_xeno = degree_xeno[protein]
    # Guard the per-type averages against a zero denominator.
    avg_degree_nbr_binary[protein] = binary_total / n_binary if n_binary != 0 else 0
    avg_degree_nbr_xeno[protein] = xeno_total / n_xeno if n_xeno != 0 else 0
    avg_degree_nbr_all[protein] = (binary_total + xeno_total) / degree_all[protein]

len(avg_degree_nbr_all), len(avg_degree_nbr_binary), len(avg_degree_nbr_xeno)

(20434, 20434, 20434)

In [14]:
def there_exists(sc, network=None):
    """Return the first member of ``sc`` that is not linked to every other member.

    A member ``x`` is reported when some other member ``y`` (``y != x``) does not
    appear among the targets of ``x``'s outgoing edges. Members are checked in
    the order they appear in ``sc``.

    Parameters
    ----------
    sc : iterable of accessions to check against each other (iterated twice,
        so it should be a list or similar re-iterable container).
    network : optional mapping of accession -> list of (target, type code)
        tuples. Defaults to the notebook-level ``ppi_network``.

    Returns
    -------
    The offending accession, or ``None`` when every member links to all others.
    """
    if network is None:
        network = ppi_network
    for x in sc:
        # Build the neighbour set once per candidate instead of once per pair.
        # A member with no entry in the network simply has no outgoing edges.
        neighbours = {v for (v, _) in network.get(x, ())}
        for y in sc:
            if x != y and y not in neighbours:
                return x
    return None

In [None]:
# connected_component_sizes_all = {}
# count = 0
# for protein in human_proteins:
#     count += 1
#     if protein in ppi_network:
#         sc = [protein]
#         final_sc = []
#         while(len(sc) != 0):
#             print(sc)
#             prot = sc[0]
#             sc.remove(prot)
#             final_sc.append(prot)
#             for (v, it) in ppi_network[prot]:
#                 sc.append(v)
#             for x in sc:
#                 if x in final_sc:
#                     sc.remove(x)
#         connected_component_sizes_all[protein] = len(sc)            
#     else:
#         connected_component_sizes_all[protein] = 1 #default is just that protein
#     print("Protein", protein, "has a connected comp size of", connected_component_sizes_all[protein])
    

In [None]:
# len(connected_component_sizes_all)

In [15]:
# Size of a greedily pruned, mutually linked neighbourhood around each protein.
# Despite the name, this is not a graph-theoretic strongly-connected-component
# computation: the candidate group is the protein plus its direct neighbours,
# and the member reported by there_exists is removed repeatedly until none is
# reported. The number of members left is the stored size.
strongly_connected_component_sizes_all = {}
for protein in human_proteins:
    if protein not in ppi_network:
        strongly_connected_component_sizes_all[protein] = 1 #default is just that protein
        continue
    members = [protein] + [nbr for (nbr, _) in ppi_network[protein]]
    offender = there_exists(members)
    while offender:
        # list.remove drops only the first occurrence of the offender.
        members.remove(offender)
        offender = there_exists(members)
    strongly_connected_component_sizes_all[protein] = len(members)

In [16]:
# Sanity check: the loop above stores one value per entry of human_proteins.
len(strongly_connected_component_sizes_all)

20434

In [24]:
# Create DataFrames from dictionaries
import pandas as pd

# Output column name -> per-protein feature dictionary, in CSV column order.
# connected_component_sizes_all is not included: the cell that would compute
# it is commented out.
feature_maps = {
    'degree_binary': degree_binary,
    'degree_xeno': degree_xeno,
    'degree_all': degree_all,
    'avg_degree_nbr_binary': avg_degree_nbr_binary,
    'avg_degree_nbr_xeno': avg_degree_nbr_xeno,
    'avg_degree_nbr_all': avg_degree_nbr_all,
    'strongly_connected_component_sizes_all': strongly_connected_component_sizes_all,
}

# One single-column frame per feature, indexed by protein accession.
frames = [
    pd.DataFrame.from_dict(values, orient='index', columns=[column])
    for column, values in feature_maps.items()
]

# Combine DataFrames
df = pd.concat(frames, axis=1)

# Save to CSV
df.to_csv('files_for_ml/ppi_network_properties.csv')