# Using the multiscale interactome

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
#!pip install -r requirements.txt 

In [5]:
#!pip install networkx==2.3

Collecting networkx==2.3
  Using cached networkx-2.3-py2.py3-none-any.whl
Installing collected packages: networkx
Successfully installed networkx-2.3


In [2]:
from msi.msi import MSI
from diff_prof.diffusion_profiles import DiffusionProfiles
import multiprocessing
import numpy as np
import pickle
import networkx as nx

from tests.msi import test_msi
from tests.diff_prof import test_diffusion_profiles

In [3]:
import sys
sys.version

'3.8.15 (default, Nov 24 2022, 14:38:14) [MSC v.1916 64 bit (AMD64)]'

# Construct the multiscale interactome

In [4]:
msi = MSI()
msi.load()

In [5]:
# Test against reference
test_msi()

# Calculate diffusion profiles

In [6]:
# Diffusion-profile calculator with the hyperparameters used throughout this notebook.
# num_cores: half of the logical cores minus a reserve of four, clamped to at least one worker.
# Without the clamp the expression evaluates to zero or a negative number on machines
# with eight or fewer logical cores.
dp = DiffusionProfiles(
    alpha = 0.8595436247434408,
    max_iter = 1000,
    tol = 1e-06,
    weights = {'down_biological_function': 4.4863053901688685, 'indication': 3.541889556309463, 'biological_function': 6.583155399238509, 'up_biological_function': 2.09685000906964, 'protein': 4.396695660380823, 'drug': 3.2071696595616364},
    num_cores = max(1, multiprocessing.cpu_count() // 2 - 4),
    save_load_file_path = "results/",
)

In [7]:
dp.calculate_diffusion_profiles(msi)

# Load saved diffusion profiles

In [8]:
dp_saved = DiffusionProfiles(alpha = None, max_iter = None, tol = None, weights = None, num_cores = None, save_load_file_path = "results/")

msi.load_saved_node_idx_mapping_and_nodelist(dp_saved.save_load_file_path)
dp_saved.load_diffusion_profiles(msi.drugs_in_graph + msi.indications_in_graph)

In [9]:
# Diffusion profile for Rosuvastatin (DB01098)
dp_saved.drug_or_indication2diffusion_profile["DB01098"]

array([6.88378645e-04, 3.78321317e-05, 2.92853909e-05, ...,
       6.39679047e-06, 6.44090963e-06, 6.44090963e-06])

In [10]:
# Diffusion profile for Acute Myelocytic Leukemia
AML_1 = dp_saved.drug_or_indication2diffusion_profile["C0023467"]
print(AML_1)

[1.86349951e-03 7.30288162e-05 8.52060373e-05 ... 5.05220476e-06
 3.31453757e-06 3.31453757e-06]


In [11]:
# selecting diffusion values for nodes that are not drugs or indications (i.e. selecting proteins and biological functions)
# node -> visitation frequency in the AML diffusion profile (reused by later cells)
node_diff = dict(zip(msi.nodelist, AML_1))

# Drug and indication nodes are excluded. The exclusion set is built once so that each
# membership test is a hash lookup instead of a scan of a freshly concatenated list.
excluded_nodes = set(msi.drugs_in_graph + msi.indications_in_graph)

# Retained nodes and their AML visitation frequencies, in node_diff order.
probio_nodes = [node for node in node_diff if node not in excluded_nodes]
AML = [node_diff[node] for node in probio_nodes]

In [12]:
AML = np.array(AML)
type(AML)

numpy.ndarray

In [13]:
# Test against reference
test_diffusion_profiles("data/10_top_msi/10_top_msi/", "results/")

In [14]:
len(msi.nodelist)

29959

In [15]:
len(AML)

27458

In [16]:
len(probio_nodes)

27458

In [17]:
# creating a dictionary that connects the nodes of proteins and biological functions to their respective visitation frequency
# in AML
dict_AML = {'nodelist': probio_nodes, 'diff_AML': AML}

In [18]:
import pandas as pd 
df_AML = pd.DataFrame(dict_AML)

In [19]:
df_AML.head()

Unnamed: 0,nodelist,diff_AML
0,90,7.3e-05
1,5371,0.000147
2,6774,0.001794
3,375,1.9e-05
4,226,2.8e-05


In [20]:
# creating a dataframe of the "dict_AML" dictionary to be saved as a csv file 
# the dataframe is saved in descending order 
df_AML_probio = df_AML.sort_values(by ='diff_AML', ascending=False)

In [21]:
df_AML_probio.head()

Unnamed: 0,nodelist,diff_AML
19031,GO:0045944,0.004866
18946,GO:0045893,0.002785
17716,GO:0045892,0.00275
17713,GO:0000122,0.002493
2748,4609,0.00192


In [22]:
#df_AML_probio.to_csv('top_3/AML_probio.csv', index=False)
#df_AML.to_csv('tester.csv', index=False)

In [23]:
# creating a dictionary that connects all the nodes (includes drug and disease nodes) to their respective visitation frequency 
# in AML
dict_nodes_AML = {'nodelist': msi.nodelist, 'diff_AML': AML_1} 

In [24]:
# creating a dataframe of the "dict_nodes_AML" dictionary to be saved as a csv file
# the dataframe is saved in descending order 
df_nodes_AML = pd.DataFrame(dict_nodes_AML).sort_values(by ='diff_AML', ascending=False)

In [25]:
df_nodes_AML.head()

Unnamed: 0,nodelist,diff_AML
3659,C0023467,0.200218
21532,GO:0045944,0.004866
3662,C0036341,0.00298
21447,GO:0045893,0.002785
20217,GO:0045892,0.00275


In [26]:
#df_nodes_AML.to_csv('AML_diff.csv', index=False)

# Testing 5 Baseline Metrics

In [27]:
# order the keys based on its values
def rank_generator(dict_rank, reverse):
    """Return a new dict with the entries of ``dict_rank`` ordered by value.

    Parameters
    ----------
    dict_rank : dict
        Mapping whose values are mutually comparable.
    reverse : bool
        ``False`` orders from smallest to largest value, ``True`` from largest
        to smallest.

    Returns
    -------
    dict
        The same key/value pairs, inserted in sorted order. Entries with equal
        values keep their original relative order.
    """
    ordered_keys = sorted(dict_rank, key=dict_rank.__getitem__, reverse=reverse)
    return {key: dict_rank[key] for key in ordered_keys}

# Reference: https://www.freecodecamp.org/news/sort-dictionary-by-value-in-python/

In [28]:
# saving the results as a csv file
def csv_generator(rank_ordered,foldername,name,header_1,header_2):
    """Write a ranked dictionary to ``<foldername>/rank_list_<name>.csv``.

    Parameters
    ----------
    rank_ordered : dict
        Mapping to save; keys go into the first column and values into the
        second, in the dictionary's iteration order.
    foldername : str
        Destination folder. It is created if it does not exist yet.
    name : str
        Suffix used in the file name ``rank_list_<name>.csv``.
    header_1, header_2 : str
        Column headers for the keys and the values respectively.
    """
    import os  # local import: only needed to create the destination folder

    # Materialise the dict views as lists so the DataFrame constructor gets plain sequences.
    dict_drugs = {
        '{}'.format(header_1): list(rank_ordered.keys()),
        '{}'.format(header_2): list(rank_ordered.values()),
    }
    df_drugs = pd.DataFrame(dict_drugs)

    # to_csv does not create missing folders, so make sure the destination exists first.
    if foldername:
        os.makedirs(foldername, exist_ok=True)
    df_drugs.to_csv('{}/rank_list_{}.csv'.format(foldername,name), index=False)

In [25]:
# L2 norm

# for every drug in the list its diffusion profile is determined 
# a dictionary is created to connect all the nodes to their respective visitation frequency in the drug
# the values of this dictionary are selected to be in a list if the key represents a protein or biological function
# the generated list is compared with the "AML" diffusion profile on the basis of a baseline metric
# the resulting value is added in a dictionary with the key representing the drug

# Positions of the protein / biological-function nodes, computed once instead of once per drug.
# The positions are derived through a dict so that their order (and the treatment of any
# repeated node) matches dict(zip(msi.nodelist, profile)), which is how "AML" was assembled.
excluded_nodes_l2 = set(msi.drugs_in_graph + msi.indications_in_graph)
node_position_l2 = {node: idx for idx, node in enumerate(msi.nodelist)}
probio_idx_l2 = np.array(
    [idx for node, idx in node_position_l2.items() if node not in excluded_nodes_l2],
    dtype=int,
)

rank_list_l2 = {}
for drug in msi.drugs_in_graph:
    drug_diff_l2 = dp_saved.drug_or_indication2diffusion_profile[drug]
    # assumes the profile holds one value per entry of msi.nodelist -- TODO confirm
    drug_diff_updated_l2 = np.asarray(drug_diff_l2)[probio_idx_l2]
    # Euclidean distance between the drug profile and the AML profile
    L2 = np.linalg.norm(drug_diff_updated_l2 - AML, ord=2)
    rank_list_l2[drug] = L2

# Reference:https://numpy.org/doc/stable/reference/generated/numpy.linalg.norm.html

In [48]:
# ordering the drugs based on the drug with the lowest baseline metric value (higher proximity -> higher similarity)
rank_ordered_l2 = rank_generator(rank_list_l2, False)

In [49]:
# the ordered dictionary is saved as a csv file
#csv_generator(rank_ordered_l2,'Baseline_Metrics_Results','l2', 'drugs', 'proximity')

In [39]:
# L1 norm

# Positions of the protein / biological-function nodes, computed once instead of once per drug.
# The positions are derived through a dict so that their order (and the treatment of any
# repeated node) matches dict(zip(msi.nodelist, profile)), which is how "AML" was assembled.
excluded_nodes_l1 = set(msi.drugs_in_graph + msi.indications_in_graph)
node_position_l1 = {node: idx for idx, node in enumerate(msi.nodelist)}
probio_idx_l1 = np.array(
    [idx for node, idx in node_position_l1.items() if node not in excluded_nodes_l1],
    dtype=int,
)

rank_list_l1 = {}
for drug in msi.drugs_in_graph:
    drug_diff_l1 = dp_saved.drug_or_indication2diffusion_profile[drug]
    # assumes the profile holds one value per entry of msi.nodelist -- TODO confirm
    drug_diff_updated_l1 = np.asarray(drug_diff_l1)[probio_idx_l1]
    # sum of absolute differences between the drug profile and the AML profile
    L1 = np.linalg.norm(drug_diff_updated_l1 - AML, ord=1)
    rank_list_l1[drug] = L1

In [40]:
rank_ordered_l1 = rank_generator(rank_list_l1, False)

In [41]:
#csv_generator(rank_ordered_l1,'Baseline_Metrics_Results','l1','drugs', 'proximity')

In [42]:
from scipy.spatial import distance
# canberra distance 

# Positions of the protein / biological-function nodes, computed once instead of once per drug.
# The positions are derived through a dict so that their order (and the treatment of any
# repeated node) matches dict(zip(msi.nodelist, profile)), which is how "AML" was assembled.
excluded_nodes_can = set(msi.drugs_in_graph + msi.indications_in_graph)
node_position_can = {node: idx for idx, node in enumerate(msi.nodelist)}
probio_idx_can = np.array(
    [idx for node, idx in node_position_can.items() if node not in excluded_nodes_can],
    dtype=int,
)

rank_list_can = {}
for drug in msi.drugs_in_graph:
    drug_diff_can = dp_saved.drug_or_indication2diffusion_profile[drug]
    # assumes the profile holds one value per entry of msi.nodelist -- TODO confirm
    drug_diff_updated_can = np.asarray(drug_diff_can)[probio_idx_can]
    # Canberra distance between the drug profile and the AML profile
    can = distance.canberra(drug_diff_updated_can, AML)
    rank_list_can[drug] = can
    
# Reference:https://www.statology.org/canberra-distance-python/

In [43]:
rank_ordered_can = rank_generator(rank_list_can, False)

In [44]:
#csv_generator(rank_ordered_can,'Baseline_Metrics_Results','can','drugs', 'proximity')

In [29]:
# cosine similarity 

# Positions of the protein / biological-function nodes, computed once instead of once per drug.
# The positions are derived through a dict so that their order (and the treatment of any
# repeated node) matches dict(zip(msi.nodelist, profile)), which is how "AML" was assembled.
excluded_nodes_cos = set(msi.drugs_in_graph + msi.indications_in_graph)
node_position_cos = {node: idx for idx, node in enumerate(msi.nodelist)}
probio_idx_cos = np.array(
    [idx for node, idx in node_position_cos.items() if node not in excluded_nodes_cos],
    dtype=int,
)

# The norm of the AML profile does not depend on the drug, so it is computed once.
AML_norm_cos = np.linalg.norm(AML)

rank_list_cos = {}
for drug in msi.drugs_in_graph:
    drug_diff_cos = dp_saved.drug_or_indication2diffusion_profile[drug]
    # assumes the profile holds one value per entry of msi.nodelist -- TODO confirm
    drug_diff_updated_cos = np.asarray(drug_diff_cos)[probio_idx_cos]
    # cosine similarity: dot product divided by the product of the two vector norms
    cos = np.dot(drug_diff_updated_cos, AML)/(np.linalg.norm(drug_diff_updated_cos)*AML_norm_cos)
    rank_list_cos[drug] = cos

# Reference:https://www.geeksforgeeks.org/how-to-calculate-cosine-similarity-in-python/

In [30]:
rank_ordered_cos = rank_generator(rank_list_cos, True)

In [31]:
#csv_generator(rank_ordered_cos,'Baseline_Metrics_Results','cos','drugs', 'proximity')

In [48]:
from scipy.spatial.distance import correlation  
# correlation distance 

# Positions of the protein / biological-function nodes, computed once instead of once per drug.
# The positions are derived through a dict so that their order (and the treatment of any
# repeated node) matches dict(zip(msi.nodelist, profile)), which is how "AML" was assembled.
excluded_nodes_corr = set(msi.drugs_in_graph + msi.indications_in_graph)
node_position_corr = {node: idx for idx, node in enumerate(msi.nodelist)}
probio_idx_corr = np.array(
    [idx for node, idx in node_position_corr.items() if node not in excluded_nodes_corr],
    dtype=int,
)

rank_list_corr = {}
for drug in msi.drugs_in_graph:
    drug_diff_corr = dp_saved.drug_or_indication2diffusion_profile[drug]
    # assumes the profile holds one value per entry of msi.nodelist -- TODO confirm
    drug_diff_updated_corr = np.asarray(drug_diff_corr)[probio_idx_corr]
    # correlation distance between the drug profile and the AML profile
    corr = correlation(drug_diff_updated_corr, AML)
    rank_list_corr[drug] = corr

# Reference:https://www.discoverbits.in/2272/python-calculate-correlation-distance-between-numpy-arrays

In [49]:
rank_ordered_corr = rank_generator(rank_list_corr, False)

In [50]:
#csv_generator(rank_ordered_corr,'Baseline_Metrics_Results', 'corr','drugs', 'proximity')

# Observing Results of Baseline Metrics

In [29]:
# observing the top-ranked drugs generated by L2 norm
# The path is relative to the notebook's working directory and uses the folder and file name
# given in the csv_generator call for this metric, so the notebook is not tied to one machine.
df_l2 = pd.read_csv('Baseline_Metrics_Results/rank_list_l2.csv')

In [30]:
df_l2.head(20)

Unnamed: 0,drugs,proximity
0,DB12010,0.023352
1,DB01593,0.026248
2,DB00142,0.032783
3,DB00157,0.035658
4,DB00114,0.035731
5,DB00898,0.040186
6,DB00334,0.040322
7,DB00143,0.04115
8,DB00543,0.041597
9,DB00201,0.042141


In [31]:
# observing the top-ranked drugs generated by L1 norm
# The path is relative to the notebook's working directory and uses the folder and file name
# given in the csv_generator call for this metric, so the notebook is not tied to one machine.
df_l1 = pd.read_csv('Baseline_Metrics_Results/rank_list_l1.csv')

In [32]:
df_l1.head(20)

Unnamed: 0,drugs,proximity
0,DB12010,0.645531
1,DB01169,0.671125
2,DB01593,0.685979
3,DB06616,0.697358
4,DB02709,0.69851
5,arsenic-trioxide,0.707135
6,DB00945,0.707553
7,DB00675,0.707915
8,DB01254,0.708285
9,DB00852,0.723267


In [33]:
# observing the top-ranked drugs generated by canberra distance
# The path is relative to the notebook's working directory and uses the folder and file name
# given in the csv_generator call for this metric, so the notebook is not tied to one machine.
df_can = pd.read_csv('Baseline_Metrics_Results/rank_list_can.csv')

In [34]:
df_can.head(20)

Unnamed: 0,drugs,proximity
0,DB12010,5719.216362
1,DB06616,6213.844531
2,DB01593,6316.660137
3,DB04272,6324.698902
4,DB00945,6373.940095
5,DB02709,6411.336769
6,DB01169,6414.249132
7,DB01196,6519.086541
8,DB00675,6557.420769
9,DB01254,6559.59395


In [35]:
# observing the top-ranked drugs generated by cosine similarity
# The path is relative to the notebook's working directory and uses the folder and file name
# given in the csv_generator call for this metric, so the notebook is not tied to one machine.
df_cos = pd.read_csv('Baseline_Metrics_Results/rank_list_cos.csv')

In [36]:
df_cos.head(20)

Unnamed: 0,drugs,proximity
0,DB12010,0.286733
1,DB01593,0.187858
2,DB00852,0.171707
3,DB06616,0.144363
4,DB02709,0.139082
5,DB01254,0.136954
6,DB01268,0.135717
7,DB00755,0.134618
8,DB06202,0.133704
9,DB00675,0.127056


In [37]:
# observing the top-ranked drugs generated by correlation distance
# The path is relative to the notebook's working directory and uses the folder and file name
# given in the csv_generator call for this metric, so the notebook is not tied to one machine.
df_corr = pd.read_csv('Baseline_Metrics_Results/rank_list_corr.csv')

In [38]:
df_corr.head(20)

Unnamed: 0,drugs,proximity
0,DB12010,0.751854
1,DB00852,0.843909
2,DB01593,0.849449
3,DB06202,0.870464
4,DB06616,0.872956
5,DB01268,0.874881
6,DB02709,0.879739
7,DB01254,0.881903
8,DB00755,0.882047
9,DB06803,0.882639


# Selecting Frequently Occurring Drug Nodes

In [39]:
# for every drug in the list, it is checked if it is present in the specified range of the ranked results for each baseline metric
# if the drug is present, a value of 1 is added to the respective key representing the drug in a dictionary
# if the drug is not present, a value of 0 is added 

def drug_counter(slice1,slice2):
    """Count, for every drug, in how many baseline-metric rankings it appears within a row range.

    Parameters
    ----------
    slice1, slice2 : int
        First and last row label of the range inspected in each ranking.
        The range is taken with ``.loc`` and therefore includes both ends.

    Returns
    -------
    dict
        Maps each drug of ``msi.drugs_in_graph`` to the number of rankings
        (``df_l2``, ``df_l1``, ``df_can``, ``df_cos``, ``df_corr``) whose
        selected rows contain it, i.e. an integer from 0 to 5.
    """
    ranked_frames = (df_l2, df_l1, df_can, df_cos, df_corr)
    # The selected rows do not depend on the drug, so each range is extracted once
    # and stored as a set for constant-time membership tests.
    top_sets = [set(frame.loc[slice1:slice2, 'drugs']) for frame in ranked_frames]
    return {
        drug: sum(1 for top in top_sets if drug in top)
        for drug in msi.drugs_in_graph
    }

In [40]:
# counting the presence of drugs in the top 10 range of the ranked results for each baseline metric
# ordering the drugs based on the count
top_10_drugs = drug_counter(0,9)
drug_count10 = rank_generator(top_10_drugs,True)

In [41]:
# counting the presence of drugs in the top 20 range of the ranked results for each baseline metric
top_20_drugs = drug_counter(0,19)
drug_count20 = rank_generator(top_20_drugs,True)

In [42]:
# counting the presence of drugs in the top 50 range of the ranked results for each baseline metric
top_50_drugs = drug_counter(0,49)
drug_count50 = rank_generator(top_50_drugs,True)


In [43]:
drug_count50

{'DB12010': 5,
 'DB08896': 5,
 'DB00852': 5,
 'DB01254': 5,
 'DB02709': 5,
 'DB06616': 5,
 'DB00945': 5,
 'DB01593': 5,
 'DB00675': 4,
 'DB05294': 4,
 'DB01229': 4,
 'DB00255': 4,
 'DB01406': 3,
 'DB00188': 3,
 'DB00755': 3,
 'DB11619': 3,
 'DB06603': 3,
 'DB09371': 3,
 'DB01248': 3,
 'DB05015': 3,
 'DB01169': 3,
 'DB04272': 3,
 'DB00162': 3,
 'DB08901': 3,
 'DB00304': 2,
 'DB00317': 2,
 'estropipate': 2,
 'DB00139': 2,
 'DB00031': 2,
 'DB00764': 2,
 'DB00459': 2,
 'DB08604': 2,
 'DB00361': 2,
 'DB01393': 2,
 'DB09559': 2,
 'chlormadinone-acetate': 2,
 'DB00717': 2,
 'DB00947': 2,
 'DB01394': 2,
 'DB00619': 2,
 'DB01179': 2,
 'megestrol-acetate': 2,
 'DB00142': 2,
 'DB06589': 2,
 'DB00823': 2,
 'DB09123': 2,
 'DB00570': 2,
 'DB00378': 2,
 'DB06803': 2,
 'DB01196': 2,
 'DB06202': 2,
 'DB01431': 2,
 'DB00396': 2,
 'DB00834': 2,
 'DB00412': 2,
 'DB08867': 2,
 'DB00025': 2,
 'DB04845': 2,
 'DB00603': 2,
 'DB00070': 2,
 'DB00957': 2,
 'DB13999': 2,
 'DB00398': 2,
 'DB00136': 2,
 'DB06401': 

# Visitation Frequency of Selected Drug Nodes in Diffusion Profile of AML

In [44]:
# the list of drugs selected from analyzing the results from the baseline metrics
# the drug is selected if the count is more than 1 

# Each drug_codeNN / drug_nameNN pair is a set of parallel lists:
# position i of the name list is the readable name of the code at position i.

drug_code10 = ['DB12010','DB01593','DB06616','DB02709','DB00945','DB01169',
               'DB00675','DB01254','DB00852','DB01268','DB06202']

drug_name10 = ['fostamatinib', 'zinc', 'bosutinib', 'resveratrol', 'aspirin', 
               'arsenic trioxide', 'tamoxifen', 'dasatinib','pseudoephedrine','sunitinib','lasofoxifene']



drug_code20 = ['DB12010', 'DB01593', 'DB06616', 'DB02709', 'DB00945', 'DB01169',
               'DB00675', 'DB01254', 'DB00852','DB01268','DB06202', 'DB05294', 'DB04272', 'DB00570', 
               'DB01064', 'DB11619', 'DB00459','DB00957','DB00603','DB00304','DB00823','DB01393']

drug_name20 = ['fostamatinib', 'zinc', 'bosutinib', 'resveratrol', 'aspirin', 
               'arsenic trioxide', 'tamoxifen', 'dasatinib', 'pseudoephedrine', 'sunitinib','lasofoxifene',
               'vandetanib', 'citric acid','vinblastine','isoprenaline','gestrinone', 'acitretin', 'norgestimate',
               'medroxyprogesterone acetate','desogestrel','ethynodiol-diacetate','bezafibrate']

drug_code50 = ['DB12010', 'DB01593', 'DB06616', 'DB02709', 'DB00945', 'DB01169',
               'DB00675', 'DB01254', 'DB00852','DB01268','DB06202','DB05294', 'DB04272', 'DB00570', 
               'DB01064', 'DB11619','DB00459','DB00957','DB00603','DB00304','DB00823','DB01393', 'DB00139','DB00755',
               'DB00126','DB00162','DB08896','DB00188','DB06401','estropipate','DB04938','DB00947',
               'DB01394','DB01179','DB01406','DB06176','DB01196','DB00070',
               'DB13999','DB00025','DB00020','DB00361','DB04845','DB09559',
               'DB00317','DB05015','DB06603','DB00255','DB08901','DB00031',
               'DB01229','DB06803','DB00398','DB00619','DB00142','DB00834','DB06589','DB08867','DB00396','DB00412',
               'DB00136','DB00717','chlormadinone-acetate','DB00378','DB01431','DB08604','DB00764','DB00351',
               'DB09123','DB09371','DB01248']

drug_name50 = ['fostamatinib', 'zinc', 'bosutinib', 'resveratrol', 'aspirin', 
               'arsenic trioxide', 'tamoxifen', 'dasatinib', 'pseudoephedrine','sunitinib','lasofoxifene', 
               'vandetanib', 'citric acid','vinblastine','isoprenaline','gestrinone', 'acitretin', 'norgestimate',
               'medroxyprogesterone acetate','desogestrel','ethynodiol-diacetate','bezafibrate',
               'succinic acid','tretinoin','vitamin C','retinol','regorafenib',
               'bortezomib','bazedoxifene','estropipate','ospemifene','fulvestrant',
               'colchicine','podophyllotoxin','danazol','romidepsin','estramustine',
               'hyaluronidase (ovine)','moroctocog alfa',
               'antihemophilic factor, human recombinant','sargramostim','vinorelbine',
               'ixabepilone','necitumumab','gefitinib','belinostat','panobinostat',
               'diethylstilbestrol','ponatinib','tenecteplase','paclitaxel','niclosamide','sorafenib','imatinib',
               'L-glutamic acid','mifepristone','pazopanib','ulipristal','progesterone','rosiglitazone','calcitriol',
               'norethindrone','chlormadinone-acetate','dydrogesterone','allylestrenol','triclosan','mometasone',
               'megestrol-acetate','dienogest', 'noretynodrel','docetaxel']

# Later cells pair the lists by position, so a missing or extra entry would silently
# attach values to the wrong drug name. Fail fast if a pair ever gets out of step.
assert len(drug_code10) == len(drug_name10), "drug_code10 and drug_name10 must have the same length"
assert len(drug_code20) == len(drug_name20), "drug_code20 and drug_name20 must have the same length"
assert len(drug_code50) == len(drug_name50), "drug_code50 and drug_name50 must have the same length"



In [45]:
# using the "node_diff" dictionary from "Load saved diffusion profiles" section to isolate specific visitation frequencies
# in the AML diffusion profile into a new dictionary

def drug_frequency(drug_code,drug_name):
    """Look up the value stored in ``node_diff`` for each drug code, keyed by the drug's name.

    Parameters
    ----------
    drug_code : list
        Node identifiers to look up in the module-level ``node_diff`` dictionary.
    drug_name : list
        Names used as keys of the result; ``drug_name[i]`` labels ``drug_code[i]``.

    Returns
    -------
    dict
        ``{drug_name[i]: node_diff[drug_code[i]]}`` for every position ``i`` of ``drug_code``.
    """
    frequency_by_name = {}
    for position, code in enumerate(drug_code):
        frequency = node_diff[code]
        frequency_by_name[drug_name[position]] = frequency
    return frequency_by_name

In [46]:
# creating a dictionary of visitation frequencies in AML for the drugs that were selected from the top 10 range
# ordering the drugs based on the drug with the highest visitation frequency 
selected_drug_freq10 = drug_frequency(drug_code10,drug_name10)

order_selected_drug_freq10 = rank_generator(selected_drug_freq10, True)
print(order_selected_drug_freq10)

{'fostamatinib': 0.0018634995105531482, 'zinc': 0.0008000071629013131, 'lasofoxifene': 0.0001942667488805765, 'sunitinib': 0.00014113346636793504, 'dasatinib': 0.00012046650657495685, 'resveratrol': 0.0001090195629960484, 'bosutinib': 9.938982185924182e-05, 'aspirin': 8.825419430207057e-05, 'arsenic trioxide': 8.520603732889221e-05, 'tamoxifen': 8.35055898154411e-05, 'pseudoephedrine': 6.620425436692788e-05}


In [47]:
# creating a dictionary of visitation frequencies in AML for the drugs that were selected from the top 20 range 
selected_drug_freq20 = drug_frequency(drug_code20,drug_name20)

order_selected_drug_freq20 = rank_generator(selected_drug_freq20, True)
print(order_selected_drug_freq20)

{'fostamatinib': 0.0018634995105531482, 'zinc': 0.0008000071629013131, 'lasofoxifene': 0.0001942667488805765, 'acitretin': 0.00014354674446212813, 'sunitinib': 0.00014113346636793504, 'dasatinib': 0.00012046650657495685, 'resveratrol': 0.0001090195629960484, 'bosutinib': 9.938982185924182e-05, 'aspirin': 8.825419430207057e-05, 'arsenic trioxide': 8.520603732889221e-05, 'tamoxifen': 8.35055898154411e-05, 'citric acid': 7.716029073555645e-05, 'vinblastine': 7.002662258327722e-05, 'pseudoephedrine': 6.620425436692788e-05, 'isoprenaline': 6.524080220737557e-05, 'vandetanib': 6.354319486281367e-05, 'bezafibrate': 4.012207466527589e-06, 'gestrinone': 3.680067206619602e-06, 'norgestimate': 2.9960354798530793e-06, 'medroxyprogesterone acetate': 2.4138425884740407e-06, 'desogestrel': 2.4138425884740407e-06, 'ethynodiol-diacetate': 2.4138425884740407e-06}


In [48]:
# creating a dictionary of visitation frequencies in AML for the drugs that were selected from the top 50 range 
selected_drug_freq50 = drug_frequency(drug_code50,drug_name50)

order_selected_drug_freq50 = rank_generator(selected_drug_freq50, True)
print(order_selected_drug_freq50)

{'fostamatinib': 0.0018634995105531482, 'zinc': 0.0008000071629013131, 'L-glutamic acid': 0.0006080107990023996, 'tenecteplase': 0.00031516407642539785, 'lasofoxifene': 0.0001942667488805765, 'acitretin': 0.00014354674446212813, 'sunitinib': 0.00014113346636793504, 'niclosamide': 0.00013925806100771127, 'dasatinib': 0.00012046650657495685, 'imatinib': 0.00011135695424135698, 'resveratrol': 0.0001090195629960484, 'ponatinib': 0.00010326346636503278, 'bosutinib': 9.938982185924182e-05, 'pazopanib': 9.72552798796233e-05, 'aspirin': 8.825419430207057e-05, 'vitamin C': 8.610771474299346e-05, 'sorafenib': 8.570597673166342e-05, 'arsenic trioxide': 8.520603732889221e-05, 'tamoxifen': 8.35055898154411e-05, 'colchicine': 8.229283694935743e-05, 'paclitaxel': 7.952371895102533e-05, 'vinorelbine': 7.864713190579412e-05, 'docetaxel': 7.767908485564744e-05, 'citric acid': 7.716029073555645e-05, 'bortezomib': 7.552047127870017e-05, 'ixabepilone': 7.27989621405753e-05, 'regorafenib': 7.046236404338804

In [49]:
#csv_generator(order_selected_drug_freq10, 'visit_freq_drug_disease','drug_in_disease_10','drug','visitation_freq')

In [50]:
#csv_generator(order_selected_drug_freq20, 'visit_freq_drug_disease','drug_in_disease_20','drug','visitation_freq')

In [51]:
#csv_generator(order_selected_drug_freq50, 'visit_freq_drug_disease','drug_in_disease_50','drug','visitation_freq')

# Visitation Frequency of Disease Node in Diffusion Profile of the Selected Drugs

In [52]:
# observing the visitation frequency of the AML indication
# for every drug in the list, the entry belonging to the AML node is read from that drug's diffusion profile
# and stored in a new dictionary under the drug's name

def AML_frequency(drug_code,drug_name,indication='C0023467'):
    """Collect the visitation frequency of an indication node in each drug's diffusion profile.

    Parameters
    ----------
    drug_code : list
        Keys of ``dp_saved.drug_or_indication2diffusion_profile`` to inspect.
    drug_name : list
        Names used as keys of the result; ``drug_name[i]`` labels ``drug_code[i]``.
    indication : str, optional
        Node whose visitation frequency is read from each profile.
        Defaults to ``'C0023467'``, the identifier this notebook uses for
        Acute Myelocytic Leukemia.

    Returns
    -------
    dict
        Maps each drug name to the value of ``indication`` in that drug's profile.

    Raises
    ------
    KeyError
        If ``indication`` is not in ``msi.nodelist`` or a drug code has no saved profile.
    """
    AML_freq = {}
    if len(drug_code) == 0:
        return AML_freq

    # The position of the indication in the node list is the same for every drug, so it is
    # resolved once instead of rebuilding a node -> value dictionary per drug.
    # A dict is used so that a repeated node resolves to its last position, exactly as
    # dict(zip(msi.nodelist, profile)) would.
    node_position = {node: idx for idx, node in enumerate(msi.nodelist)}[indication]

    for i, code in enumerate(drug_code):
        drug_diffusion = dp_saved.drug_or_indication2diffusion_profile[code]
        # assumes the profile holds one value per entry of msi.nodelist -- TODO confirm
        AML_freq[drug_name[i]] = drug_diffusion[node_position]
    return AML_freq

In [53]:
# a dictionary of AML visitation frequencies in the drugs that were selected from the top 10 range
# ordering the drugs based on the drug with the highest visitation frequency 
AML_freq10 = AML_frequency(drug_code10,drug_name10)

order_AML_freq10 = rank_generator(AML_freq10, True)
print(order_AML_freq10)

{'lasofoxifene': 0.0023207584787101783, 'sunitinib': 0.0016211112204296575, 'zinc': 0.00036870371331334205, 'fostamatinib': 0.0003132134139001645, 'dasatinib': 0.00025247495484174737, 'pseudoephedrine': 0.0002516226190809841, 'arsenic trioxide': 0.00024074594290653546, 'tamoxifen': 0.00023078294472109377, 'aspirin': 0.0002154890139908415, 'bosutinib': 0.00021180071462059577, 'resveratrol': 0.0001975928538665396}


In [54]:
# a dictionary of AML visitation frequencies in the drugs that were selected from the top 20 range 
AML_freq20 = AML_frequency(drug_code20,drug_name20)

order_AML_freq20 = rank_generator(AML_freq20, True)
print(order_AML_freq20)

{'lasofoxifene': 0.0023207584787101783, 'vinblastine': 0.0021063197944846818, 'sunitinib': 0.0016211112204296575, 'isoprenaline': 0.000581491634767292, 'citric acid': 0.0005402107110398322, 'zinc': 0.00036870371331334205, 'acitretin': 0.0003544511009444701, 'fostamatinib': 0.0003132134139001645, 'gestrinone': 0.00025479117141002754, 'dasatinib': 0.00025247495484174737, 'pseudoephedrine': 0.0002516226190809841, 'arsenic trioxide': 0.00024074594290653546, 'norgestimate': 0.00023733919197040007, 'tamoxifen': 0.00023078294472109377, 'aspirin': 0.0002154890139908415, 'bosutinib': 0.00021180071462059577, 'medroxyprogesterone acetate': 0.00020999806265926363, 'desogestrel': 0.00020999806265926363, 'ethynodiol-diacetate': 0.00020999806265926363, 'vandetanib': 0.00020385940103471734, 'resveratrol': 0.0001975928538665396, 'bezafibrate': 0.000194307601138483}


In [55]:
# a dictionary of AML visitation frequencies in the drugs that were selected from the top 50 range
AML_freq50 = AML_frequency(drug_code50,drug_name50)

order_AML_freq50 = rank_generator(AML_freq50, True)
print(order_AML_freq50)

{'podophyllotoxin': 0.0029031831127784772, 'lasofoxifene': 0.0023207584787101783, 'vinblastine': 0.0021063197944846818, 'docetaxel': 0.0020600003087816758, 'paclitaxel': 0.0018400487383100094, 'sunitinib': 0.0016211112204296575, 'niclosamide': 0.001425712510859475, 'tenecteplase': 0.0013211567765072977, 'L-glutamic acid': 0.0012536450960489408, 'imatinib': 0.0012048357857752452, 'ixabepilone': 0.0011228354895298337, 'vinorelbine': 0.0010614653206164268, 'colchicine': 0.0010210736140456513, 'pazopanib': 0.0009045566448521088, 'sorafenib': 0.0008182529579709132, 'ponatinib': 0.000666602406058952, 'isoprenaline': 0.000581491634767292, 'citric acid': 0.0005402107110398322, 'calcitriol': 0.00046130985515168007, 'zinc': 0.00036870371331334205, 'acitretin': 0.0003544511009444701, 'fostamatinib': 0.0003132134139001645, 'progesterone': 0.00029783614259632076, 'hyaluronidase (ovine)': 0.0002896035175060024, 'sargramostim': 0.0002889546898681727, 'romidepsin': 0.0002821238009349946, 'regorafenib'

In [56]:
#csv_generator(order_AML_freq10, 'visit_freq_drug_disease','disease_in_drug_10','drug','visitation_freq_of_AML')

In [57]:
#csv_generator(order_AML_freq20, 'visit_freq_drug_disease','disease_in_drug_20','drug','visitation_freq_of_AML')

In [58]:
#csv_generator(order_AML_freq50, 'visit_freq_drug_disease','disease_in_drug_50','drug','visitation_freq_of_AML')

# Determining ri(d)xrj(c)

In [59]:
# for each node in the inputted list, its respective visitation frequency in AML and the visitation frequency of AML in the 
# node is multiplied 
# the result is added into a new dictionary with the key identifying that node 

def d_c_multiplication(drug_name,selected_drug_freq,AML_freq):
    """Multiply, per drug, the two visitation frequencies stored under its name.

    Parameters
    ----------
    drug_name : iterable
        Keys to process; each must be present in both dictionaries.
    selected_drug_freq : dict
        First factor for every key.
    AML_freq : dict
        Second factor for every key.

    Returns
    -------
    dict
        ``{name: selected_drug_freq[name] * AML_freq[name]}`` in the order of ``drug_name``.
    """
    return {name: selected_drug_freq[name] * AML_freq[name] for name in drug_name}
    

In [60]:
# a dictionary of values from multiplying the visitation frequency of the drug in the AML diffusion profile with the 
# visitation frequency of AML in the drug diffusion profile for the selected drugs from the top 10 range
# ordering the drugs based on the drug with the highest product value 
dc_10 = d_c_multiplication(drug_name10,selected_drug_freq10,AML_freq10)

order_dc10 = rank_generator(dc_10, True)
print(order_dc10)

{'fostamatinib': 5.836730435016371e-07, 'lasofoxifene': 4.508462045960589e-07, 'zinc': 2.949656116389859e-07, 'sunitinib': 2.287930459071912e-07, 'dasatinib': 3.0414775807455296e-08, 'resveratrol': 2.1541486579672202e-08, 'bosutinib': 2.105083529580113e-08, 'arsenic trioxide': 2.0513007798073614e-08, 'tamoxifen': 1.9271665918279272e-08, 'aspirin': 1.9017809310709328e-08, 'pseudoephedrine': 1.665848787811007e-08}


In [61]:
# a dictionary of values from multiplying the visitation frequency of the drug in the AML diffusion profile with the 
# visitation frequency of AML in the drug diffusion profile for the selected drugs from the top 20 range

dc_20 = d_c_multiplication(drug_name20,selected_drug_freq20,AML_freq20)

order_dc20 = rank_generator(dc_20, True)
print(order_dc20)

{'fostamatinib': 5.836730435016371e-07, 'lasofoxifene': 4.508462045960589e-07, 'zinc': 2.949656116389859e-07, 'sunitinib': 2.287930459071912e-07, 'vinblastine': 1.4749846128806486e-07, 'acitretin': 5.0880301611595835e-08, 'citric acid': 4.168281552229513e-08, 'isoprenaline': 3.7936980729096366e-08, 'dasatinib': 3.0414775807455296e-08, 'resveratrol': 2.1541486579672202e-08, 'bosutinib': 2.105083529580113e-08, 'arsenic trioxide': 2.0513007798073614e-08, 'tamoxifen': 1.9271665918279272e-08, 'aspirin': 1.9017809310709328e-08, 'pseudoephedrine': 1.665848787811007e-08, 'vandetanib': 1.2953877644565522e-08, 'gestrinone': 9.376486344422362e-10, 'bezafibrate': 7.796024080908861e-10, 'norgestimate': 7.110766399029797e-10, 'medroxyprogesterone acetate': 5.069022671439707e-10, 'desogestrel': 5.069022671439707e-10, 'ethynodiol-diacetate': 5.069022671439707e-10}


In [62]:
# a dictionary of values from multiplying the visitation frequency of the drug in the AML diffusion profile with the 
# visitation frequency of AML in the drug diffusion profile for the selected drugs from the top 50 range

dc_50 = d_c_multiplication(drug_name50,selected_drug_freq50,AML_freq50)

order_dc50 = rank_generator(dc_50, True)
print(order_dc50)

{'L-glutamic acid': 7.622297565141564e-07, 'fostamatinib': 5.836730435016371e-07, 'lasofoxifene': 4.508462045960589e-07, 'tenecteplase': 4.1638115528107823e-07, 'zinc': 2.949656116389859e-07, 'sunitinib': 2.287930459071912e-07, 'niclosamide': 1.9854195981672596e-07, 'docetaxel': 1.6001893878851172e-07, 'podophyllotoxin': 1.5505754850608772e-07, 'vinblastine': 1.4749846128806486e-07, 'paclitaxel': 1.4632751872155395e-07, 'imatinib': 1.3416684346492336e-07, 'pazopanib': 8.797290966206486e-08, 'colchicine': 8.40270444339499e-08, 'vinorelbine': 8.348120308394617e-08, 'ixabepilone': 8.17412582923767e-08, 'sorafenib': 7.012916897646986e-08, 'ponatinib': 6.88356751369185e-08, 'acitretin': 5.0880301611595835e-08, 'citric acid': 4.168281552229513e-08, 'isoprenaline': 3.7936980729096366e-08, 'dasatinib': 3.0414775807455296e-08, 'resveratrol': 2.1541486579672202e-08, 'bosutinib': 2.105083529580113e-08, 'vitamin C': 2.1034409591529435e-08, 'arsenic trioxide': 2.0513007798073614e-08, 'regorafenib':

In [63]:
#csv_generator(order_dc10, 'visit_freq_drug_disease','d_c_multiplication_10','drug','multiplication_result')

In [64]:
#csv_generator(order_dc20, 'visit_freq_drug_disease','d_c_multiplication_20','drug','multiplication_result')

In [65]:
#csv_generator(order_dc50, 'visit_freq_drug_disease','d_c_multiplication_50','drug','multiplication_result')

# Determining ri(d)xrj(c) without the influence of Baseline Metrics 

In [66]:
# Product of the visitation frequency of each drug in the AML diffusion profile with the
# visitation frequency of AML in that drug's diffusion profile, for every drug in the graph
# (no pre-selection of drugs).
# msi.drugs_in_graph is passed for both arguments of each helper, so the results are keyed by the
# graph's drug node ids (e.g. 'DB00056' in the output further down) rather than by drug name.

AML_frequency_total = AML_frequency(msi.drugs_in_graph,msi.drugs_in_graph) 
drug_frequency_total = drug_frequency(msi.drugs_in_graph,msi.drugs_in_graph)
d_c_total = d_c_multiplication(msi.drugs_in_graph,drug_frequency_total,AML_frequency_total)

In [67]:
# Rank every drug by its product value; the recorded output lists the highest product first
# (the True flag presumably requests descending order -- TODO confirm in rank_generator).
order_dc_total = rank_generator(d_c_total, True)
print(order_dc_total)

{'DB00056': 1.5778308390603061e-06, 'DB01078': 9.378050182994979e-07, 'DB00142': 7.622297565141564e-07, 'DB00470': 6.3877985387072e-07, 'DB12010': 5.836730435016371e-07, 'DB06202': 4.508462045960589e-07, 'DB00031': 4.1638115528107823e-07, 'DB00309': 3.357571651102861e-07, 'DB01593': 2.949656116389859e-07, 'DB00541': 2.436399934116303e-07, 'DB00643': 2.420513001442182e-07, 'DB01268': 2.287930459071912e-07, 'DB06803': 1.9854195981672596e-07, 'DB01109': 1.8502981219829e-07, 'DB01248': 1.6001893878851172e-07, 'DB01179': 1.5505754850608772e-07, 'DB00570': 1.4749846128806486e-07, 'DB01229': 1.4632751872155395e-07, 'DB00619': 1.3416684346492336e-07, 'DB11363': 1.0294561438925959e-07, 'hyaluronic-acid': 1.0256890601646676e-07, 'DB06589': 8.797290966206486e-08, 'DB08818': 8.655544483413146e-08, 'DB01394': 8.40270444339499e-08, 'DB00361': 8.348120308394617e-08, 'DB04845': 8.17412582923767e-08, 'DB08875': 7.276762472371881e-08, 'DB00398': 7.012916897646986e-08, 'DB08901': 6.88356751369185e-08, 'D

In [68]:
# the names of the top 3 drugs are derived from the Supplementary Data 1 file provided by the paper 
# "Identification of disease treatment mechanisms through the multiscale interactome"

# DB00056 - Gemtuzumab ozogamicin
# DB01078 - deslanoside
# DB00142 - L-Glutamic Acid

# Ranking Visitation Frequency of Proteins and Biological Functions for Top 3 Drugs

In [71]:
# Drugs that occupy a top-3 position in the ri(d)xrj(c) results for top10, top20 or top50.
# The union of those positions is four drugs; each name maps to the drug's node id in the graph.
top_3_drugs = dict([
    ('fostamatinib', 'DB12010'),
    ('lasofoxifene', 'DB06202'),
    ('zinc', 'DB01593'),
    ('L-glutamic acid', 'DB00142'),
])



In [72]:
# For every selected drug, keep only the protein / biological-function entries of its diffusion
# profile (i.e. drop the drug and indication nodes) and store them as an array in a dictionary
# keyed by the drug name.

# Built once as a set: the membership test below is then O(1) per node instead of concatenating
# the two lists again for every node of every drug.
drug_or_indication_nodes = set(msi.drugs_in_graph) | set(msi.indications_in_graph)

top_3_diffusions = {}
for drug, drug_id in top_3_drugs.items():
    drug_diff = dp_saved.drug_or_indication2diffusion_profile[drug_id]
    # The per-drug mapping gets its own name on purpose: a later cell reads the global
    # `node_diff` as the AML node -> frequency mapping, and reusing that name here would
    # silently replace it with the last drug's profile.
    drug_node_diff = dict(zip(msi.nodelist, drug_diff))
    top_3_diffusions[drug] = np.array(
        [freq for node, freq in drug_node_diff.items() if node not in drug_or_indication_nodes]
    )


In [73]:
# Show the filtered (protein / biological-function only) profile arrays, keyed by drug name
print( top_3_diffusions)

{'fostamatinib': array([7.49452371e-04, 8.35848009e-05, 2.36383085e-04, ...,
       7.40116592e-06, 2.69612970e-06, 2.69612970e-06]), 'lasofoxifene': array([5.25918192e-05, 6.62055758e-05, 1.98486194e-04, ...,
       2.52806871e-06, 1.56651171e-06, 1.56651171e-06]), 'zinc': array([5.45178362e-05, 7.86467697e-05, 8.95335338e-05, ...,
       3.57654189e-06, 2.50118665e-06, 2.50118665e-06]), 'L-glutamic acid': array([4.34057819e-05, 1.35991519e-04, 6.29291463e-05, ...,
       3.54793425e-06, 2.82870249e-06, 2.82870249e-06])}


In [74]:
# One column per selected drug plus a 'nodes' column holding the node id of each row.
# Assumes probio_nodes lists the protein / biological-function nodes in the same order as the
# filtered profile arrays -- TODO confirm where probio_nodes is built.
top_3_data = pd.DataFrame(top_3_diffusions).assign(nodes=probio_nodes)
top_3_data.head()

Unnamed: 0,fostamatinib,lasofoxifene,zinc,L-glutamic acid,nodes
0,0.000749,5.3e-05,5.5e-05,4.3e-05,90
1,8.4e-05,6.6e-05,7.9e-05,0.000136,5371
2,0.000236,0.000198,9e-05,6.3e-05,6774
3,1.8e-05,1.1e-05,1.5e-05,2e-05,375
4,3.4e-05,2.4e-05,0.001655,8.5e-05,226


In [75]:
# fostamatinib profile together with its node ids, highest visitation frequency first
data_fostamatinib = top_3_data[['nodes', 'fostamatinib']].sort_values('fostamatinib', ascending=False)

In [76]:
#data_fostamatinib.to_csv('top_3/fostamatinib_diff.csv', index=False)

In [77]:
# lasofoxifene profile together with its node ids, highest visitation frequency first
data_lasofoxifene = top_3_data[['nodes', 'lasofoxifene']].sort_values('lasofoxifene', ascending=False)

In [78]:
#data_lasofoxifene.to_csv('top_3/lasofoxifene_diff.csv', index=False)

In [79]:
# zinc profile together with its node ids, highest visitation frequency first
data_zinc = top_3_data[['nodes', 'zinc']].sort_values('zinc', ascending=False)

In [80]:
#data_zinc.to_csv('top_3/zinc_diff.csv', index=False)

In [81]:
# L-glutamic acid profile together with its node ids, highest visitation frequency first
data_L_glutamic = top_3_data[['nodes', 'L-glutamic acid']].sort_values('L-glutamic acid', ascending=False)

In [82]:
#data_L_glutamic.to_csv('top_3/L_glutamic_diff.csv', index=False)

# Ranking Visitation Frequency of all drugs in the diffusion profile of AML

In [119]:
# Visitation frequency of every drug node in the AML (C0023467) diffusion profile.
# The node -> frequency mapping is rebuilt here from the saved AML profile instead of being read
# from the global `node_diff`: that name is reassigned inside the per-drug loop of the
# "Top 3 Drugs" section, so on a fresh top-to-bottom run it would hold the last drug's profile
# rather than AML's.
aml_node_freq = dict(zip(msi.nodelist, dp_saved.drug_or_indication2diffusion_profile["C0023467"]))
drug_freq = {drug_id: aml_node_freq[drug_id] for drug_id in msi.drugs_in_graph}

In [120]:
# Rank all drugs by their visitation frequency in the AML diffusion profile, highest first
# (the True flag presumably requests descending order -- TODO confirm in rank_generator).
order_drug_freq = rank_generator(drug_freq, True)

In [121]:
#csv_generator(order_drug_freq, 'visit_freq_drug_disease','all_drugs_frequency','drug','frequncy in AML')

# Computing the treatment importance of genes 

In [86]:
'''We define the treatment importance (TI) of gene as the product of the visitation
frequency of the corresponding protein in the drug and disease diffusion profiles. 
(Source:"Identification of disease treatment mechanisms through the multiscale interactome")'''

# Protein rows of the four selected drugs' profiles: every row whose node id does not start
# with 'GO' (the 'GO:...' ids are the biological functions).
# na=True treats a missing match result as "matches", so such rows are dropped exactly as the
# former `== False` comparison dropped them.
top_3_data_by_proteins = top_3_data.loc[~top_3_data['nodes'].str.match('GO', na=True)]
top_3_data_by_proteins.head()


Unnamed: 0,fostamatinib,lasofoxifene,zinc,L-glutamic acid,nodes
0,0.000749,5.3e-05,5.5e-05,4.3e-05,90
1,8.4e-05,6.6e-05,7.9e-05,0.000136,5371
2,0.000236,0.000198,9e-05,6.3e-05,6774
3,1.8e-05,1.1e-05,1.5e-05,2e-05,375
4,3.4e-05,2.4e-05,0.001655,8.5e-05,226


In [87]:
# AML profile restricted to rows whose node id does not start with 'GO', i.e. without the
# biological functions. (If df_AML also contains drug or indication nodes they are kept by this
# filter -- TODO confirm against the cell that builds df_AML.)
# na=True drops rows with a missing match result, as the former `== False` comparison did.
AML_by_proteins = df_AML.loc[~df_AML['nodelist'].str.match('GO', na=True)]
AML_by_proteins.head()

Unnamed: 0,nodelist,diff_AML
0,90,7.3e-05
1,5371,0.000147
2,6774,0.001794
3,375,1.9e-05
4,226,2.8e-05


In [88]:
# Reference: https://www.geeksforgeeks.org/how-to-add-column-from-another-dataframe-in-pandas/
# Attach the four drug columns to the AML protein rows.
# The join is keyed on the node id (AML 'nodelist' against the drug frame's 'nodes') rather than
# on the frames' row index, so a row can never pick up another node's frequencies if the two
# frames are not indexed identically. The result keeps AML_by_proteins' rows, index and column
# order; a node that is absent from the drug frame gets NaN in the drug columns.
diff_top_proteins = top_3_data_by_proteins.loc[:,['fostamatinib','lasofoxifene','zinc','L-glutamic acid']]
top3_AML = AML_by_proteins.join(
    diff_top_proteins.set_index(top_3_data_by_proteins['nodes']),
    on='nodelist',
)
top3_AML.head()

Unnamed: 0,nodelist,diff_AML,fostamatinib,lasofoxifene,zinc,L-glutamic acid
0,90,7.3e-05,0.000749,5.3e-05,5.5e-05,4.3e-05
1,5371,0.000147,8.4e-05,6.6e-05,7.9e-05,0.000136
2,6774,0.001794,0.000236,0.000198,9e-05,6.3e-05
3,375,1.9e-05,1.8e-05,1.1e-05,1.5e-05,2e-05
4,226,2.8e-05,3.4e-05,2.4e-05,0.001655,8.5e-05


In [89]:
# treatment importance for fostamatinib: AML frequency times fostamatinib frequency, row by row
top3_AML['protein_product_fost'] = top3_AML['diff_AML'].mul(top3_AML['fostamatinib'])

In [90]:
# treatment importance for lasofoxifene: AML frequency times lasofoxifene frequency, row by row
top3_AML['protein_product_laso'] = top3_AML['diff_AML'].mul(top3_AML['lasofoxifene'])

In [91]:
# treatment importance for zinc: AML frequency times zinc frequency, row by row
top3_AML['protein_product_zinc'] = top3_AML['diff_AML'].mul(top3_AML['zinc'])

In [92]:
# treatment importance for L-glutamic acid: AML frequency times L-glutamic acid frequency, row by row
top3_AML['protein_product_glut'] = top3_AML['diff_AML'].mul(top3_AML['L-glutamic acid'])
top3_AML.head()

Unnamed: 0,nodelist,diff_AML,fostamatinib,lasofoxifene,zinc,L-glutamic acid,protein_product_fost,protein_product_laso,protein_product_zinc,protein_product_glut
0,90,7.3e-05,0.000749,5.3e-05,5.5e-05,4.3e-05,5.473162e-08,3.840718e-09,3.981373e-09,3.169873e-09
1,5371,0.000147,8.4e-05,6.6e-05,7.9e-05,0.000136,1.23074e-08,9.748403e-09,1.15803e-08,2.002399e-08
2,6774,0.001794,0.000236,0.000198,9e-05,6.3e-05,4.240589e-07,3.560738e-07,1.606185e-07,1.128916e-07
3,375,1.9e-05,1.8e-05,1.1e-05,1.5e-05,2e-05,3.305954e-10,2.077925e-10,2.804629e-10,3.785501e-10
4,226,2.8e-05,3.4e-05,2.4e-05,0.001655,8.5e-05,9.785764e-10,6.688813e-10,4.694732e-08,2.400212e-09


In [93]:
#top3_AML.to_csv('top_3/treatment_importance.csv', index=False)

In [94]:
# Treatment importance per drug, largest value first.

# fostamatinib
TI_fostamatinib = top3_AML[['nodelist', 'protein_product_fost']].sort_values('protein_product_fost', ascending=False)
#TI_fostamatinib.to_csv('top_3/TI_fostamatinib.csv', index=False)


In [95]:
# lasofoxifene: treatment importance, largest value first
TI_lasofoxifene = top3_AML[['nodelist', 'protein_product_laso']].sort_values('protein_product_laso', ascending=False)
#TI_lasofoxifene.to_csv('top_3/TI_lasofoxifene.csv', index=False)

In [96]:
# zinc: treatment importance, largest value first
TI_zinc = top3_AML[['nodelist', 'protein_product_zinc']].sort_values('protein_product_zinc', ascending=False)
#TI_zinc.to_csv('top_3/TI_zinc.csv', index=False)

In [97]:
# L-glutamic acid: treatment importance, largest value first
TI_L_glutamic = top3_AML[['nodelist', 'protein_product_glut']].sort_values('protein_product_glut', ascending=False)
#TI_L_glutamic.to_csv('top_3/TI_L_glutamic.csv', index=False)

# Visualizing Networks

In [4]:
# Load the pickled multiscale graph from the project's "results/" folder, the same relative
# location the rest of the notebook uses for saved results. A relative path keeps the notebook
# runnable on other machines; the previous absolute path pointed into one user's Downloads folder.
# NOTE: read_gpickle unpickles the file, so only load a graph.pkl that you produced yourself.
multiscale_graph = nx.read_gpickle("results/graph.pkl")

In [99]:
# Sanity check of the loaded object (a networkx DiGraph in the recorded output)
type(multiscale_graph)

networkx.classes.digraph.DiGraph

In [100]:
# Fostamatinib / AML subgraph built from the drug node, the drug's top-20 proteins and
# biological functions, the AML node and AML's top-20 proteins and biological functions.
sub = multiscale_graph.subgraph([
    # drug node
    'DB12010',
    # top 20 of fostamatinib
    'GO:0006468', 'GO:0046777', 'GO:0035556', 'GO:0045944', 'GO:0018108',
    'GO:0045893', 'GO:0016310', '3320', 'GO:0043066', 'GO:0018105',
    'GO:0006464', 'GO:0000122', '1956', 'GO:0043484', '4750',
    'GO:0045892', 'GO:0010628', '120892', '4914', '351',
    # disease node (AML)
    'C0023467',
    # top 20 of AML
    'GO:0045944', 'GO:0045893', 'GO:0045892', 'GO:0000122', '4609',
    '864', '6774', '3417', '302', '2623',
    '4869', '6688', '11168', '2624', '3845',
    '64324', '2672', '3315', '10919', '861',
])



In [101]:
# Lasofoxifene / AML subgraph built from the drug node, the drug's top-20 proteins and
# biological functions, the AML node and AML's top-20 proteins and biological functions.
sub2 = multiscale_graph.subgraph([
    # drug node
    'DB06202',
    # top 20 of lasofoxifene
    '2099', '2100', '1269', '2769', 'GO:0007204',
    'GO:0000122', 'GO:0045893', 'GO:0051091', 'GO:0045944', 'GO:0006366',
    'GO:0051480', 'GO:0071392', 'GO:0010524', 'GO:0010629', 'GO:0060402',
    'GO:0043433', 'GO:0051482', 'GO:0045899', 'GO:0045892', 'GO:1903799',
    # disease node (AML)
    'C0023467',
    # top 20 of AML
    'GO:0045944', 'GO:0045893', 'GO:0045892', 'GO:0000122', '4609',
    '864', '6774', '3417', '302', '2623',
    '4869', '6688', '11168', '2624', '3845',
    '64324', '2672', '3315', '10919', '861',
])

In [102]:
# Zinc / AML subgraph built from the drug node, the drug's top-20 proteins and biological
# functions, the AML node and AML's top-20 proteins and biological functions.
sub3 = multiscale_graph.subgraph([
    # drug node
    'DB01593',
    # top 20 of zinc
    '1369', '2335', '335', '718', '3507',
    '351', '1191', '727', '7157', '731',
    'GO:0006956', '2', '333', '348', '3827',
    'GO:0018149', '733', '732', 'GO:0045944', '388697',
    # disease node (AML)
    'C0023467',
    # top 20 of AML
    'GO:0045944', 'GO:0045893', 'GO:0045892', 'GO:0000122', '4609',
    '864', '6774', '3417', '302', '2623',
    '4869', '6688', '11168', '2624', '3845',
    '64324', '2672', '3315', '10919', '861',
])

In [103]:
# L-glutamic acid / AML subgraph built from the drug node, the drug's top-20 proteins and
# biological functions, the AML node and AML's top-20 proteins and biological functions.
sub4 = multiscale_graph.subgraph([
    # drug node
    'DB00142',
    # top 20 of L-glutamic acid
    '2902', '2898', '2806', '6505', '2891',
    '79751', '2805', '2903', '124454', '6507',
    '2747', '2746', '6506', 'GO:0097553', '2744',
    'GO:0045471', '2895', '6898', '2356', '2917',
    # disease node (AML)
    'C0023467',
    # top 20 of AML
    'GO:0045944', 'GO:0045893', 'GO:0045892', 'GO:0000122', '4609',
    '864', '6774', '3417', '302', '2623',
    '4869', '6688', '11168', '2624', '3845',
    '64324', '2672', '3315', '10919', '861',
])

In [6]:
# saving the generated networks as graphml for cytoscape
#nx.write_graphml_lxml(sub,"networks/fostamatinib.graphml") 
#nx.write_graphml_lxml(sub2,"networks/lasofoxifene.graphml")
#nx.write_graphml_lxml(sub3,"networks/zinc.graphml") 
#nx.write_graphml_lxml(sub4,"networks/L_glutamic .graphml")
#nx.write_graphml_lxml(multiscale_graph,"networks/graph.graphml")


In [105]:
# Top-20 rows of each sorted diffusion profile, kept for export to cytoscape.
# fostamatinib
top20_fost = data_fostamatinib.head(20)
#top20_fost.to_csv('top_3/top20_fost.csv',index=False)

In [106]:
# first 20 rows of the sorted lasofoxifene profile
top20_lasofoxifene = data_lasofoxifene.head(20)
#top20_lasofoxifene.to_csv('top_3/top20_laso.csv', index=False)

In [107]:
# first 20 rows of the sorted zinc profile
top20_zinc = data_zinc.head(20)
#top20_zinc.to_csv('top_3/top20_zinc.csv', index=False)

In [108]:
# first 20 rows of the sorted L-glutamic acid profile
top20_L_glutamic = data_L_glutamic.head(20)
#top20_L_glutamic.to_csv('top_3/top20_glut.csv', index=False)

In [109]:
# first 20 rows of df_AML_probio (presumably already sorted by AML frequency -- TODO confirm
# against the cell that builds df_AML_probio)
top20_AML = df_AML_probio.head(20)
#top20_AML.to_csv('top_3/top20_AML.csv', index=False)

### Obtaining diffusion profile for top 20 proteins and biological functions of the drug in AML and vice versa

In [110]:
# write a dictionary to <foldername>/<name>.csv as a two-column table
def csv_generator_2(dictionary,foldername,name,header_1,header_2):
    """Save a dictionary as a two-column CSV file.

    The keys go into a column titled ``header_1`` and the values into a column titled
    ``header_2``; the file is written to ``<foldername>/<name>.csv`` without an index column.
    The folder must already exist.
    """
    columns = {f'{header_1}': dictionary.keys(), f'{header_2}': dictionary.values()}
    target = f'{foldername}/{name}.csv'
    pd.DataFrame(columns).to_csv(target, index=False)

In [111]:
# diffusion profile of top 20 proteins and biological functions of fostamatinib in AML
# Reference: https://sparkbyexamples.com/pandas/pandas-extract-column-value-based-on-another-column/

# node ids of fostamatinib's top-20 proteins and biological functions
fost20 = [
    'GO:0006468', 'GO:0046777', 'GO:0035556', 'GO:0045944', 'GO:0018108',
    'GO:0045893', 'GO:0016310', '3320', 'GO:0043066', 'GO:0018105',
    'GO:0006464', 'GO:0000122', '1956', 'GO:0043484', '4750',
    'GO:0045892', 'GO:0010628', '120892', '4914', '351',
]


def diffinAML(drug, aml_profile=None):
    """Look up the AML visitation frequency of each given node.

    Parameters
    ----------
    drug : iterable
        Node ids to look up. Despite the name this is a list of node ids (for example a
        drug's top-20 proteins / biological functions), not a drug.
    aml_profile : pandas.DataFrame, optional
        Frame with a 'nodelist' column of node ids and a 'diff_AML' column of frequencies.
        Defaults to the notebook-level ``df_AML_probio`` so existing calls keep working.

    Returns
    -------
    dict
        node id -> 'diff_AML' value of the first row whose 'nodelist' equals that node id.

    Raises
    ------
    IndexError
        If a node id has no row in the frame (same exception type as before, now naming the node).
    """
    if aml_profile is None:
        # resolved at call time so the function can be defined before the global exists
        aml_profile = df_AML_probio
    inAML_diff = {}
    for prot_biol in drug:
        matches = aml_profile.loc[aml_profile['nodelist'] == prot_biol, 'diff_AML']
        if matches.empty:
            raise IndexError("node {!r} not found in the AML profile".format(prot_biol))
        inAML_diff[prot_biol] = matches.iloc[0]
    return inAML_diff

# AML visitation frequency of each node in fost20, as a dict node id -> frequency
fostinAML_diff  =  diffinAML(fost20)  
#csv_generator_2(fostinAML_diff,'top_3','fostinAML_diff','nodes','fostinAML_diff')

In [112]:
# diffusion profile of top 20 proteins and biological functions of AML in fostamatinib

# node ids of AML's top-20 proteins and biological functions
AML20 = [
    'GO:0045944', 'GO:0045893', 'GO:0045892', 'GO:0000122', '4609',
    '864', '6774', '3417', '302', '2623',
    '4869', '6688', '11168', '2624', '3845',
    '64324', '2672', '3315', '10919', '861',
]


def diffindrug(drug, nodes=None):
    """Look up the visitation frequency of selected nodes in one drug's diffusion profile.

    Parameters
    ----------
    drug : pandas.DataFrame
        A drug's profile frame with a 'nodes' column of node ids; the frequencies are read
        from the frame's second column (position 1), whatever its name.
    nodes : iterable, optional
        Node ids to look up. Defaults to the notebook-level ``AML20`` list so existing calls
        keep working.

    Returns
    -------
    dict
        node id -> value, in the second column, of the first row whose 'nodes' equals that id.

    Raises
    ------
    IndexError
        If a node id has no row in the frame (same exception type as before, now naming the node).
    """
    if nodes is None:
        # resolved at call time so the function can be defined before the global exists
        nodes = AML20
    value_column = drug.columns[1]
    inDRUG_diff = {}
    for prot_biol in nodes:
        matches = drug.loc[drug['nodes'] == prot_biol, value_column]
        if matches.empty:
            raise IndexError("node {!r} not found in the drug profile".format(prot_biol))
        inDRUG_diff[prot_biol] = matches.iloc[0]
    return inDRUG_diff

# fostamatinib visitation frequency of each node in AML20, as a dict node id -> frequency
AMLinfost_diff =  diffindrug(data_fostamatinib)
#csv_generator_2(AMLinfost_diff,'top_3','AMLinfost_diff','nodes','AMLinfost_diff')

In [113]:
# AML visitation frequency of lasofoxifene's top-20 proteins and biological functions

# node ids of lasofoxifene's top 20
laso20 = [
    '2099', '2100', '1269', '2769', 'GO:0007204',
    'GO:0000122', 'GO:0045893', 'GO:0051091', 'GO:0045944', 'GO:0006366',
    'GO:0051480', 'GO:0071392', 'GO:0010524', 'GO:0010629', 'GO:0060402',
    'GO:0043433', 'GO:0051482', 'GO:0045899', 'GO:0045892', 'GO:1903799',
]
lasoinAML_diff = diffinAML(laso20)
#csv_generator_2(lasoinAML_diff,'top_3','lasoinAML_diff','nodes','lasoinAML_diff')

In [114]:
# lasofoxifene visitation frequency of each node in AML20 (AML's top-20 proteins and
# biological functions), as a dict node id -> frequency
AMLinlaso_diff = diffindrug(data_lasofoxifene)
#csv_generator_2(AMLinlaso_diff,'top_3','AMLinlaso_diff','nodes','AMLinlaso_diff')

In [115]:
# AML visitation frequency of zinc's top-20 proteins and biological functions

# node ids of zinc's top 20
zinc20 = [
    '1369', '2335', '335', '718', '3507',
    '351', '1191', '727', '7157', '731',
    'GO:0006956', '2', '333', '348', '3827',
    'GO:0018149', '733', '732', 'GO:0045944', '388697',
]
zincinAML_diff = diffinAML(zinc20)
#csv_generator_2(zincinAML_diff,'top_3','zincinAML_diff','nodes','zincinAML_diff')

In [116]:
# zinc visitation frequency of each node in AML20 (AML's top-20 proteins and biological
# functions), as a dict node id -> frequency
AMLinzinc_diff = diffindrug(data_zinc)
#csv_generator_2(AMLinzinc_diff,'top_3','AMLinzinc_diff','nodes','AMLinzinc_diff')

In [117]:
# AML visitation frequency of L-glutamic acid's top-20 proteins and biological functions
# node ids of L-glutamic acid's top 20
glut20 = [
    '2902', '2898', '2806', '6505', '2891',
    '79751', '2805', '2903', '124454', '6507',
    '2747', '2746', '6506', 'GO:0097553', '2744',
    'GO:0045471', '2895', '6898', '2356', '2917',
]
glutinAML_diff = diffinAML(glut20)
#csv_generator_2(glutinAML_diff,'top_3','glutinAML_diff','nodes','glutinAML_diff')

In [118]:
# L-glutamic acid visitation frequency of each node in AML20 (AML's top-20 proteins and
# biological functions), as a dict node id -> frequency
AMLinglut_diff = diffindrug(data_L_glutamic)
#csv_generator_2(AMLinglut_diff,'top_3','AMLinglut_diff','nodes','AMLinglut_diff')