# importing stuff and initial cleaning

In [3]:
import networkx as nx
import markov_clustering as mc
import random
import numpy as np 
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import math
import pickle

In [4]:
# Load the edge list; read_weighted_edgelist stores the third column of each
# line as the 'weight' attribute of the edge.
g = nx.read_weighted_edgelist('4932.protein.links.v12.0.txt', comments="#", nodetype=str)

# remove some edges: everything with weight below 500.
# The edges are collected first and removed afterwards so the graph is never
# modified while one of its own views is being iterated.
low_weight_edges = [(u, v) for u, v, w in g.edges(data='weight') if w < 500]
g.remove_edges_from(low_weight_edges)

# remove weights: the attribute dicts yielded by g.edges(data=True) are the
# graph's own, so popping the key here makes the graph unweighted.
for _, _, attrs in g.edges(data=True):
    attrs.pop('weight', None)

# Adjacency matrix and the node order that its rows/columns follow.
matrix = nx.to_numpy_array(g)
node_list = list(g.nodes)
related_proteins = ['4932.YMR190C','4932.YNL088W','4932.YLR234W','4932.YPL024W','4932.YMR167W' ]

# functions

In [61]:
def mcl(mtx, inflation_parameter):
    """Run Markov clustering on ``mtx`` and name the members of each cluster.

    Args:
        mtx: matrix handed to ``mc.run_mcl``.
        inflation_parameter: value passed as the ``inflation`` argument.

    Returns:
        The tuple ``(inflation_parameter, result, clusters)``: ``result`` is
        what ``mc.run_mcl`` returned, and ``clusters`` is a list of tuples in
        which every index reported by ``mc.get_clusters`` has been replaced by
        the entry at that position in the module-level ``node_list``.
    """
    result = mc.run_mcl(mtx, inflation=inflation_parameter)

    # relabelling: index -> node name
    clusters = [
        tuple(node_list[member] for member in index_cluster)
        for index_cluster in mc.get_clusters(result)
    ]

    return inflation_parameter, result, clusters

def clu_to_adj_mtx(cluster):
    """Return the adjacency sub-matrix of ``cluster`` taken from ``matrix``.

    Each member of ``cluster`` is looked up in the module-level ``node_list``
    to get its row/column position; the result keeps the members' order.
    """
    positions = [node_list.index(member) for member in cluster]

    # select rows and columns in one step
    return matrix[np.ix_(positions, positions)]
    
def adj_mtx_to_graph(mat, name):
    """Build a graph from adjacency matrix ``mat`` and rename its nodes.

    ``name`` is passed to ``nx.relabel_nodes`` as the mapping applied to the
    nodes created by ``nx.from_numpy_array``.
    """
    return nx.relabel_nodes(nx.from_numpy_array(mat), name)

def graph_to_cent_meas(graph):
    """Compute several centrality rankings for ``graph``.

    Returns:
        A dict keyed by ``'degree'``, ``'eigenvector'``, ``'closeness'`` and
        ``'betweenness'``. Each value is a list of ``(node, score)`` pairs
        sorted from the highest score to the lowest.
    """
    measures = (
        ('degree', nx.degree_centrality),
        ('eigenvector', nx.eigenvector_centrality),
        # ('katz', nx.katz_centrality) is currently switched off
        ('closeness', nx.closeness_centrality),
        ('betweenness', nx.betweenness_centrality),
    )

    def by_score(pair):
        return pair[1]

    return {
        label: sorted(compute(graph).items(), key=by_score, reverse=True)
        for label, compute in measures
    }

def important_nodes(cent_meas_of_clus, n_of_nodes):
    """Print the first ``n_of_nodes`` entries of every centrality ranking.

    Args:
        cent_meas_of_clus: dict mapping a measure name to its ranked list of
            entries (the structure built by ``graph_to_cent_meas``).
        n_of_nodes: how many entries to print per measure. If a ranking is
            shorter than this, all of its entries are printed.
    """
    # A negative count prints nothing, as with range(0, n_of_nodes).
    limit = max(n_of_nodes, 0)
    for measure, ranked in cent_meas_of_clus.items():
        print(measure+':')
        # Slicing (rather than indexing 0..n-1) avoids an IndexError on
        # clusters with fewer than n_of_nodes members.
        for entry in ranked[:limit]:
            print(entry)
        print('\n')

# clusters must be a list of tuples/lists; the names must be strings
def similarity_of_clusterings(clusters1,name1, clusters2,name2):
    """Score every community of ``clusters1`` against those of ``clusters2``.

    The score of two communities is
    ``2 * (number of common nodes) / (len(comm1) + len(comm2))``.

    Both clusterings are first sorted from the largest community to the
    smallest, so the numbers in the labels refer to the sorted order. Remove
    the two ``sorted`` lines below to keep the caller's ordering instead.

    Args:
        clusters1: list of communities (tuples or lists of nodes).
        name1: prefix used to label the communities of ``clusters1``.
        clusters2: list of communities to compare against.
        name2: prefix used to label the communities of ``clusters2``.

    Returns:
        One list per community of ``clusters1``. Its first element is the
        label ``'<name1><index> <size>:'``; the remaining elements are
        ``('<name2><index> <size>:', score)`` tuples sorted by decreasing
        score. Pairs with a score of 0 are left out, so a community with no
        overlap at all yields a list that holds only its label.
    """
    clusters1 = sorted(clusters1,key = len, reverse = True)
    clusters2 = sorted(clusters2,key = len, reverse = True)

    def _lookup(community):
        # Set membership is O(1); fall back to the sequence itself if its
        # members turn out not to be hashable.
        try:
            return frozenset(community)
        except TypeError:
            return community

    # Built once, outside the pairwise loop.
    lookups2 = [_lookup(community) for community in clusters2]

    sim_list = []
    for i, first in enumerate(clusters1):
        similarity = []
        for j, (second, members2) in enumerate(zip(clusters2, lookups2)):
            total = len(first) + len(second)
            if total == 0:
                # Two empty communities: no overlap, and nothing to divide by.
                continue
            common = sum(1 for node in first if node in members2)
            score = 2*common/total
            if score > 0:
                similarity.append((str(name2)+str(j)+' '+str(len(second))+':',score))
        similarity.sort(key = lambda pair: pair[1], reverse = True)
        similarity.insert(0,str(name1)+str(i)+' '+str(len(first))+':')
        sim_list.append(similarity)
    return sim_list

def large_comm(cluster, threshold):
    """Return the communities in ``cluster`` with at least ``threshold`` members.

    The relative order of the communities is kept.
    """
    return [community for community in cluster if len(community) >= threshold]

# clusterings output 

It is time-consuming to rerun the algorithm every time, so we run it once and save the output.

In [42]:
# 1.4 is the inflation parameter that maximises modularity
inflation_parameter = 1.4

# mcl() returns the tuple (inflation_parameter, result, clusters)
markov_clustering_14 = mcl(matrix,inflation_parameter)

In [47]:
# Save the (inflation_parameter, result, clusters) tuple returned by mcl()
# to a pickle file so the clustering does not have to be recomputed.
with open('markov_clustering_14.pkl', 'wb') as file:
    pickle.dump(markov_clustering_14, file)

# retrieving clustering from pickle file 

In [6]:
# NOTE: pickle.load can run arbitrary code while loading; only open pickle
# files that were written by this notebook.
with open('markov_clustering_14.pkl', 'rb') as file:
    markov_clustering_14 = pickle.load(file)

# third element of the tuple returned by mcl(): the clusters as tuples of node names
clusters = markov_clustering_14[2]

#sort the cluster based on size
clusters = sorted(clusters, key=len, reverse=True)

# clusters with at least 20 nodes
large_clusters = [i for i in clusters if len(i) >= 20]


# weight=None (the value, not the string 'None') counts every edge as 1
# instead of looking for an edge attribute that happens to be called 'None'.
modularity = nx.community.modularity(g, clusters, weight=None, resolution=1)

In [7]:
# one dict per cluster, mapping position within the cluster -> node name
cluster_nodename_dict_list = [dict(enumerate(members)) for members in clusters]

In [8]:
# adjacency sub-matrix of every cluster
clusters_adj_mtx = [clu_to_adj_mtx(members) for members in clusters]

# one graph per cluster, relabelled with that cluster's position -> name dict
clusters_graph = [
    adj_mtx_to_graph(adjacency, names)
    for adjacency, names in zip(clusters_adj_mtx, cluster_nodename_dict_list)
]

# centrality rankings of every cluster graph
clusters_cent_meas = [graph_to_cent_meas(subgraph) for subgraph in clusters_graph]

# sgs1 cluster and its centrality measure 

In [70]:
# find the position of the cluster that contains SGS1 ('4932.YMR190C')
for position, members in enumerate(clusters):
    if '4932.YMR190C' in members:
        sgs1_clu_index = position

In [72]:
sgs1_clu_index

1

In [84]:
# looking specifically at the SGS1 cluster

# number of nodes in the cluster
print(len(clusters[sgs1_clu_index]))

# centrality rankings of that cluster (dict: measure name -> sorted list of (node, score))
sgs1_cluster_cent_meas = clusters_cent_meas[sgs1_clu_index]

880


In [None]:
#drawing them 
nx.draw(clusters_graph[sgs1_clu_index])

In [132]:
# For every centrality measure, print where SGS1 and the related proteins
# rank inside the SGS1 cluster, as [name, rank, (name, score)].
for measure, ranked in clusters_cent_meas[sgs1_clu_index].items():
    print(measure+':')
    # rank of every node in this measure (0 = highest score)
    position_of = {node: pos for pos, (node, _) in enumerate(ranked)}
    # Names are matched exactly; a substring test would also pick up any node
    # whose name merely contains the protein's name.
    ranking = sorted(
        (
            [protein, position_of[protein], ranked[position_of[protein]]]
            for protein in related_proteins
            if protein in position_of
        ),
        key=lambda row: row[1],
    )
    for row in ranking:
        print(row)
    print('\n')

degree:
['4932.YMR190C', 8, ('4932.YMR190C', 0.21501706484641636)]
['4932.YNL088W', 32, ('4932.YNL088W', 0.15244596131968144)]
['4932.YLR234W', 115, ('4932.YLR234W', 0.10807736063708759)]
['4932.YMR167W', 123, ('4932.YMR167W', 0.10352673492605233)]
['4932.YPL024W', 209, ('4932.YPL024W', 0.07963594994311718)]


eigenvector:
['4932.YMR190C', 6, ('4932.YMR190C', 0.10679821320995969)]
['4932.YNL088W', 23, ('4932.YNL088W', 0.08646876957883082)]
['4932.YLR234W', 66, ('4932.YLR234W', 0.06428752482548797)]
['4932.YMR167W', 99, ('4932.YMR167W', 0.05445876652636618)]
['4932.YPL024W', 135, ('4932.YPL024W', 0.047717226242775926)]


closeness:
['4932.YMR190C', 13, ('4932.YMR190C', 0.5257177033492823)]
['4932.YNL088W', 23, ('4932.YNL088W', 0.5107495642068565)]
['4932.YLR234W', 117, ('4932.YLR234W', 0.47564935064935066)]
['4932.YMR167W', 358, ('4932.YMR167W', 0.4327917282127031)]
['4932.YPL024W', 368, ('4932.YPL024W', 0.43215339233038347)]


betweenness:
['4932.YMR190C', 16, ('4932.YMR190C', 0.012125

In [152]:
#first 10 nodes sorted by centrality
important_nodes(sgs1_cluster_cent_meas,10)

degree
('4932.YBR010W', 0.3811149032992036)
('4932.YNL031C', 0.32195676905574516)
('4932.YBR009C', 0.3174061433447099)
('4932.YBR160W', 0.2832764505119454)
('4932.YDR224C', 0.2707622298065984)
('4932.YER095W', 0.2502844141069397)
('4932.YNL030W', 0.22866894197952217)
('4932.YBR136W', 0.2229806598407281)
('4932.YMR190C', 0.21501706484641636)
('4932.YML032C', 0.21274175199089873)


eigenvector
('4932.YBR010W', 0.1482960111600498)
('4932.YNL031C', 0.1304573035229128)
('4932.YER095W', 0.120144037957283)
('4932.YBR009C', 0.11525875900007806)
('4932.YBR136W', 0.11163289456637947)
('4932.YBR160W', 0.11059798230011285)
('4932.YMR190C', 0.10679821320995969)
('4932.YPL153C', 0.10520570883202644)
('4932.YML032C', 0.10351118956891829)
('4932.YDR224C', 0.10211511612207437)


closeness
('4932.YBR010W', 0.6078838174273858)
('4932.YNL031C', 0.5856095936042638)
('4932.YBR009C', 0.5798153034300791)
('4932.YBR160W', 0.5700389105058365)
('4932.YDR224C', 0.5584498094027954)
('4932.YER095W', 0.5419235511713

# other clusters and their "important" nodes

Small communities and singletons are not very informative, so they are removed for the following analysis: specifically, any community with fewer than 20 nodes. We then take the top few nodes under each centrality measure for each remaining community. It might also be worthwhile to look at them according to community size.

In [12]:
# keep only the clusters whose 'degree' ranking has at least 20 entries
large_clusters_cent_meas = [
    cent_meas for cent_meas in clusters_cent_meas
    if len(cent_meas['degree']) >= 20
]

In [13]:
len(large_clusters_cent_meas)

36

In [190]:
# print the first n nodes of every ranking, one numbered section per large cluster
nodes_to_print = 10

for counter, cent_meas in enumerate(large_clusters_cent_meas):
    print(counter,'~~~~~~~~~~~~~~~~~~~~')
    important_nodes(cent_meas, nodes_to_print)

0 ~~~~~~~~~~~~~~~~~~~~
degree:
('4932.YLL013C', 0.19596354166666666)
('4932.YGL026C', 0.16796875)
('4932.YBR196C', 0.12369791666666666)
('4932.YBL099W', 0.123046875)
('4932.YDR050C', 0.12044270833333333)
('4932.YCR012W', 0.1171875)
('4932.YKL211C', 0.11653645833333333)
('4932.YLL041C', 0.11328125)
('4932.YKL192C', 0.111328125)
('4932.YER069W', 0.107421875)


eigenvector:
('4932.YDR050C', 0.12172156501784563)
('4932.YBR196C', 0.12146130361943822)
('4932.YNR001C', 0.11945201156953499)
('4932.YPL262W', 0.11779783238672428)
('4932.YCR012W', 0.11400826860998146)
('4932.YGL026C', 0.11176724477178801)
('4932.YLR304C', 0.11151186138834446)
('4932.YCR005C', 0.1102609390837423)
('4932.YBL099W', 0.10840472235330212)
('4932.YKL085W', 0.10779431384183395)


closeness:
('4932.YLL013C', 0.5106382978723404)
('4932.YBL099W', 0.471889400921659)
('4932.YCR012W', 0.47116564417177914)
('4932.YGL026C', 0.4692942254812099)
('4932.YKL192C', 0.4668693009118541)
('4932.YGR192C', 0.4620938628158845)
('4932.YDR05

# comparison with clustering using other algorithm

Import the pickle or JSON file, then calculate the maximum similarity between pairs of communities. The idea is to take twice the number of common nodes and normalise it by the total number of nodes in both communities:

$$\text{similarity} = \frac{2* \text{number of common nodes}}{\text{number of nodes in comm 1}+\text{number of nodes in comm 2}}$$ 

In [73]:
# fast label prop - Rocky
# NOTE: pickle.load can run arbitrary code; only open trusted files.
with open('Fast_PP_1.pkl', 'rb') as file:
    label_prop_1 = pickle.load(file)
# largest community first
label_prop_1 = sorted(label_prop_1, key=len,reverse = True)


In [23]:
# info map - Yuan
with open('infomap_output.pkl', 'rb') as file:
    infomap = pickle.load(file)

# collect the community stored under every key, largest community first
infomap_list = sorted((infomap[key] for key in infomap), key=len, reverse=True)

[['4932.YML056C',
  '4932.YDL063C',
  '4932.YOL023W',
  '4932.YGR081C',
  '4932.YDR101C',
  '4932.Q0140',
  '4932.YHR075C',
  '4932.YBR031W',
  '4932.YPR143W',
  '4932.YLR185W',
  '4932.YGR128C',
  '4932.YDR500C',
  '4932.YMR173W',
  '4932.YBL091C',
  '4932.YLR106C',
  '4932.YJR119C',
  '4932.YCR028C-A',
  '4932.YGR130C',
  '4932.YPL239W',
  '4932.YBR122C',
  '4932.YDR012W',
  '4932.YPL226W',
  '4932.YJR153W',
  '4932.YKL067W',
  '4932.YOR061W',
  '4932.YKL081W',
  '4932.YPL048W',
  '4932.YDR051C',
  '4932.YOR253W',
  '4932.YDR104C',
  '4932.YKL078W',
  '4932.YPL193W',
  '4932.YOR207C',
  '4932.YNL014W',
  '4932.YDR520C',
  '4932.YLR150W',
  '4932.YIL035C',
  '4932.YLR244C',
  '4932.YLR249W',
  '4932.YMR176W',
  '4932.YHR059W',
  '4932.YMR014W',
  '4932.YDR361C',
  '4932.YLR366W',
  '4932.YPR080W',
  '4932.YBR118W',
  '4932.YBR146W',
  '4932.YNL177C',
  '4932.YDR226W',
  '4932.YLR388W',
  '4932.YJR101W',
  '4932.YEL050C',
  '4932.YIL070C',
  '4932.YPL118W',
  '4932.YOR133W',
  '4932.YB

In [65]:
for i in clusters: 
    print(len(i))

1537
880
722
598
213
136
136
123
115
90
88
85
77
74
72
54
47
47
45
41
41
38
38
37
37
36
32
30
30
27
27
23
22
22
21
20
19
19
18
18
18
18
16
16
15
14
13
13
13
12
12
12
12
11
11
11
11
10
9
9
9
9
9
9
9
8
8
7
7
6
6
5
5
5
5
5
5
5
4
4
4
4
4
4
3
3
3
3
3
2
2
2
2
2
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [69]:
# similarity of infomap and markov clustering (communities with >= 30 nodes)
sim_lim_lmc = similarity_of_clusterings(large_comm(infomap_list,30),'im',large_comm(clusters,30),'mc')

for i in sim_lim_lmc:
    # i[0] is the community label; i[1], when present, is its best match.
    # A community with no overlapping match holds only the label, so the
    # length is checked before i[1] is read.
    if len(i) > 1 and i[1][1]>=0.5:
        print(i[0],i[1])
    else:
        print(i[0],'no suitable comm')

im0 713: ('mc2 722:', 0.9449477351916377)
im1 441: no suitable comm
im2 263: no suitable comm
im3 237: no suitable comm
im4 210: no suitable comm
im5 208: ('mc4 213:', 0.9216152019002375)
im6 196: no suitable comm
im7 194: no suitable comm
im8 144: no suitable comm
im9 119: ('mc5 136:', 0.9098039215686274)
im10 117: ('mc8 115:', 0.9137931034482759)
im11 111: no suitable comm
im12 107: ('mc6 136:', 0.8065843621399177)
im13 103: no suitable comm
im14 97: ('mc7 123:', 0.8818181818181818)
im15 89: no suitable comm
im16 89: ('mc11 85:', 0.9540229885057471)
im17 88: no suitable comm
im18 83: no suitable comm
im19 72: no suitable comm
im20 69: ('mc10 88:', 0.8152866242038217)
im21 67: no suitable comm
im22 65: ('mc12 77:', 0.9014084507042254)
im23 58: ('mc14 72:', 0.8307692307692308)
im24 55: ('mc13 74:', 0.8372093023255814)
im25 53: no suitable comm
im26 45: no suitable comm
im27 43: ('mc15 54:', 0.865979381443299)
im28 42: no suitable comm
im29 41: ('mc24 37:', 0.8205128205128205)
im30 40: 

In [17]:
# fluid percolation - Luka
# NOTE: pickle.load can run arbitrary code; only open trusted files.
with open('Fluid_Percolation.pkl', 'rb') as file:
    fluid_perc = pickle.load(file)
# NOTE(review): the disabled line below would sort label_prop_1, not fluid_perc
#fluid_perc = sorted(label_prop_1, key=len,reverse = True)

with open('Fluid_Percolation_36.pkl', 'rb') as file:
    fluid_perc_36 = pickle.load(file)

In [75]:
# similarity of label prop and markov clustering (communities with >= 30 nodes)
sim_llp_lmc = similarity_of_clusterings(large_comm(label_prop_1,30),'lp',large_comm(clusters,30),'mc')
for i in sim_llp_lmc:
    # i[0] is the community label; i[1], when present, is its best match.
    # A community with no overlapping match holds only the label, so the
    # length is checked before i[1] is read.
    if len(i) > 1 and i[1][1]>=0.5:
        print(i[0],i[1])
    else:
        # include the label so each line shows which community it refers to
        print(i[0],'no suitable comm')

lp0 1030: ('mc2 722:', 0.7682648401826484)
lp1 696: ('mc1 880:', 0.7461928934010152)
no suitable comm
no suitable comm
no suitable comm
lp5 116: ('mc5 136:', 0.8809523809523809)
no suitable comm
no suitable comm
lp8 98: ('mc4 213:', 0.6045016077170418)
no suitable comm
lp10 84: ('mc4 213:', 0.5050505050505051)
no suitable comm
lp12 77: ('mc11 85:', 0.8765432098765432)
no suitable comm
lp14 64: ('mc10 88:', 0.75)
no suitable comm
lp16 51: ('mc12 77:', 0.765625)
lp17 48: ('mc19 41:', 0.7415730337078652)
no suitable comm
lp19 42: ('mc14 72:', 0.7017543859649122)
no suitable comm
no suitable comm
lp22 36: ('mc13 74:', 0.6363636363636364)
lp23 34: ('mc15 54:', 0.75)
lp24 32: ('mc21 38:', 0.9142857142857143)
no suitable comm
no suitable comm
lp27 31: ('mc25 36:', 0.8059701492537313)


In [23]:
# similarity of fluid perc and markov clustering
similarity_of_clusterings(fluid_perc,'fp',clusters,'mc')

[['fp0:',
  ('mc2', 0.9206989247311828),
  ('mc0', 0.04081632653061224),
  ('mc9', 0.03271028037383177),
  ('mc90', 0.005208333333333333),
  ('mc3', 0.002932551319648094),
  ('mc76', 0.0025940337224383916),
  ('mc61', 0.0025806451612903226),
  ('mc55', 0.002574002574002574),
  ('mc42', 0.0025575447570332483),
  ('mc35', 0.002544529262086514),
  ('mc34', 0.0025412960609911056),
  ('mc32', 0.0025380710659898475),
  ('mc30', 0.0025220680958385876),
  ('mc23', 0.0024906600249066002),
  ('mc19', 0.0024783147459727386),
  ('mc18', 0.002466091245376079),
  ('mc16', 0.0024600246002460025),
  ('mc13', 0.002380952380952381),
  ('mc12', 0.002372479240806643),
  ('mc5', 0.0022172949002217295),
  ('mc1', 0.001215066828675577)],
 ['fp1:',
  ('mc4', 0.647985989492119),
  ('mc12', 0.2574712643678161),
  ('mc15', 0.1650485436893204),
  ('mc21', 0.12626262626262627),
  ('mc0', 0.025329815303430078),
  ('mc82', 0.016574585635359115),
  ('mc63', 0.01634877384196185),
  ('mc27', 0.015463917525773196),
  ('

In [21]:
similarity_of_clusterings(fluid_perc_36,'fp',clusters,'mc')

[['fp0:',
  ('mc2', 0.6765475152571927),
  ('mc34', 0.04484304932735426),
  ('mc0', 0.01834862385321101),
  ('mc90', 0.00468384074941452),
  ('mc76', 0.004651162790697674),
  ('mc55', 0.0045871559633027525),
  ('mc51', 0.004576659038901602),
  ('mc42', 0.0045351473922902496),
  ('mc35', 0.0044943820224719105),
  ('mc19', 0.004291845493562232),
  ('mc16', 0.00423728813559322),
  ('mc11', 0.00392156862745098)],
 ['fp1:',
  ('mc2', 0.5511961722488038),
  ('mc75', 0.018292682926829267),
  ('mc0', 0.01827956989247312),
  ('mc33', 0.011594202898550725),
  ('mc51', 0.005970149253731343),
  ('mc43', 0.0058997050147492625),
  ('mc37', 0.005847953216374269),
  ('mc34', 0.005813953488372093),
  ('mc30', 0.005714285714285714),
  ('mc13', 0.005037783375314861),
  ('mc9', 0.004842615012106538),
  ('mc7', 0.004484304932735426),
  ('mc5', 0.004357298474945534),
  ('mc6', 0.004357298474945534),
  ('mc1', 0.0033250207813798837),
  ('mc3', 0.002171552660152009)],
 ['fp2:',
  ('mc7', 0.514745308310992),
 