# importing stuff and initial cleaning

In [37]:
import networkx as nx
import markov_clustering as mc
import random
import numpy as np 
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import math
import pickle

In [38]:
# Edges whose weight is below this value are discarded.
MIN_EDGE_WEIGHT = 500

g = nx.read_weighted_edgelist('4932.protein.links.v12.0.txt', comments="#", nodetype=str)

# Collect the low-weight edges first and delete them in a single call, so the
# graph is never modified while one of its edge views is being iterated.
weak_edges = [(u, v) for u, v, weight in g.edges(data='weight') if weight < MIN_EDGE_WEIGHT]
g.remove_edges_from(weak_edges)

# Remove the weight attribute from the remaining edges; the dicts yielded by
# g.edges(data=True) are the graph's own attribute dicts, so popping from them
# edits the graph itself. Downstream steps then treat every edge as weight 1.
for _, _, attrs in g.edges(data=True):
    attrs.pop('weight', None)

# Adjacency matrix of the filtered graph; rows/columns follow the order of node_list.
matrix = nx.to_numpy_array(g)
node_list = list(g.nodes)

# Proteins of interest; the first entry (4932.YMR190C) is the one referred to as SGS1 below.
related_proteins = ['4932.YMR190C','4932.YNL088W','4932.YLR234W','4932.YPL024W','4932.YMR167W' ]

# functions

In [186]:
def mcl(mtx,inflation_parameter):
    """Run Markov clustering on `mtx` and express the clusters as node names.

    Parameters
    ----------
    mtx : adjacency matrix handed to mc.run_mcl. Its row/column order is
        assumed to match the module-level `node_list`, because cluster member
        indices are looked up in that list.
    inflation_parameter : value forwarded as the `inflation` argument.

    Returns
    -------
    tuple
        (inflation_parameter, matrix returned by mc.run_mcl, list of clusters),
        where every cluster is a tuple of node names.
    """
    flow_matrix = mc.run_mcl(mtx, inflation=inflation_parameter)
    index_clusters = mc.get_clusters(flow_matrix)

    # translate every member index into the corresponding node name
    named_clusters = [
        tuple(node_list[member] for member in members)
        for members in index_clusters
    ]

    return inflation_parameter, flow_matrix, named_clusters

def clu_to_adj_mtx(cluster):
    """Return the adjacency sub-matrix of the global `matrix` induced by `cluster`.

    `cluster` is an iterable of node names; each name is located in the
    module-level `node_list` and the matching rows and columns are selected,
    in the order the names appear in `cluster`.
    """
    positions = [node_list.index(member) for member in cluster]

    # select the cluster's rows first, then the same positions as columns
    return matrix[positions, :][:, positions]
    
def adj_mtx_to_graph(mat,name):
    """Build a graph from adjacency matrix `mat` and relabel its nodes.

    `name` is the mapping passed to nx.relabel_nodes; the callers in this
    notebook supply a dict from matrix position to node name.
    """
    return nx.relabel_nodes(nx.from_numpy_array(mat), name)

def graph_to_cent_meas(graph):
    """Compute centrality rankings for every node of `graph`.

    Returns a dict keyed by measure name ('degree', 'eigenvector',
    'closeness', 'betweenness'); each value is a list of (node, score)
    pairs sorted from the highest score to the lowest.
    """
    def ranked(scores):
        # order the (node, score) pairs by score, highest first
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    return {
        'degree': ranked(nx.degree_centrality(graph)),
        'eigenvector': ranked(nx.eigenvector_centrality(graph)),
        # Katz centrality is deliberately left disabled:
        #'katz': ranked(nx.katz_centrality(graph)),
        'closeness': ranked(nx.closeness_centrality(graph)),
        'betweenness': ranked(nx.betweenness_centrality(graph)),
    }

def important_nodes(cent_meas_of_clus, n_of_nodes):
    """Print the leading entries of every centrality ranking of one cluster.

    Parameters
    ----------
    cent_meas_of_clus : dict
        Maps a measure name to an already-sorted list of entries, as
        produced by graph_to_cent_meas.
    n_of_nodes : int
        Maximum number of entries printed per measure. Rankings shorter
        than this are printed in full instead of raising IndexError.

    Output per measure: the measure name followed by ':', one entry per
    line, then a blank separator.
    """
    # a negative request prints nothing instead of slicing from the end
    limit = max(n_of_nodes, 0)
    for measure, ranked_nodes in cent_meas_of_clus.items():
        print(measure+':')
        for entry in ranked_nodes[:limit]:
            print(entry)
        print('\n')
        

# clusterings output 

It's time-consuming to rerun the algorithm every time, so we run it once and save the output to a pickle file.

In [42]:
# 1.4 is the inflation parameter that maximises modularity
inflation_parameter = 1.4

# mcl returns a tuple: (inflation parameter, matrix returned by mc.run_mcl,
# list of clusters as tuples of node names)
markov_clustering_14 = mcl(matrix,inflation_parameter)

In [47]:
# Save the clustering result to a pickle file so the MCL run above does not
# have to be repeated; the file is opened in binary write mode and overwritten.
with open('markov_clustering_14.pkl', 'wb') as file:
    pickle.dump(markov_clustering_14, file)

# retrieving clustering from pickle file 

In [None]:
# NOTE: pickle.load can execute arbitrary code, so only load a file that was
# written by this notebook.
with open('markov_clustering_14.pkl', 'rb') as file:
    markov_clustering_14 = pickle.load(file)

# the stored tuple is (inflation parameter, MCL result matrix, clusters)
clusters = markov_clustering_14[2]

# sort the clusters by size, largest first
clusters = sorted(clusters, key=len, reverse=True)

# weight=None requests the unweighted modularity explicitly. The previous
# weight='None' named an edge attribute called "None", which only worked
# because no edge carries such an attribute.
modularity = nx.community.modularity(g, clusters, weight=None, resolution=1)

In [63]:
# For every cluster, map each member's position in the cluster to its node name.
cluster_nodename_dict_list = [dict(enumerate(members)) for members in clusters]

In [67]:
# adjacency sub-matrix of every cluster, in the same order as `clusters`
clusters_adj_mtx = [clu_to_adj_mtx(members) for members in clusters]

# one graph per cluster, with nodes renamed through that cluster's position -> name dict
clusters_graph = [
    adj_mtx_to_graph(sub_matrix, cluster_nodename_dict_list[position])
    for position, sub_matrix in enumerate(clusters_adj_mtx)
]

# centrality rankings of every cluster graph
clusters_cent_meas = [graph_to_cent_meas(cluster_graph) for cluster_graph in clusters_graph]

# sgs1 cluster and its centrality measure 

In [70]:
# Find the index (in the size-sorted cluster list) of the cluster containing SGS1.
# Starting from None and checking afterwards avoids silently reusing a stale
# value, or hitting a NameError later, when the protein is in no cluster.
sgs1_clu_index = None
for cluster_position, members in enumerate(clusters):
    if '4932.YMR190C' in members:
        # no break: if the protein appears in several clusters the last one wins
        sgs1_clu_index = cluster_position

if sgs1_clu_index is None:
    raise ValueError('SGS1 (4932.YMR190C) was not found in any cluster')

In [72]:
# show the position of the SGS1 cluster in the size-sorted cluster list
sgs1_clu_index

1

In [84]:
# looking specifically at the SGS1 cluster

# number of nodes in the cluster
print(len(clusters[sgs1_clu_index]))

# centrality rankings of the cluster: a dict from measure name to a list of
# (node, score) pairs sorted from highest to lowest score
sgs1_cluster_cent_meas = clusters_cent_meas[sgs1_clu_index]

880


In [None]:
# draw the graph of the SGS1 cluster with networkx's default drawing settings
nx.draw(clusters_graph[sgs1_clu_index])

In [132]:
# Rank (0-based position in the sorted list) and score of SGS1 and the related
# proteins within the SGS1 cluster, for every centrality measure.
for measure, ranked_nodes in clusters_cent_meas[sgs1_clu_index].items():
    print(measure+':')
    # one row per related protein found in the cluster: [protein, rank, (node, score)]
    # node names are compared for equality so that a protein ID can never match
    # a longer ID that merely contains it
    ranking = [
        [protein, rank, entry]
        for protein in related_proteins
        for rank, entry in enumerate(ranked_nodes)
        if entry[0] == protein
    ]
    # best-ranked protein first
    ranking.sort(key=lambda row: row[1])
    for row in ranking:
        print(row)
    print('\n')

degree:
['4932.YMR190C', 8, ('4932.YMR190C', 0.21501706484641636)]
['4932.YNL088W', 32, ('4932.YNL088W', 0.15244596131968144)]
['4932.YLR234W', 115, ('4932.YLR234W', 0.10807736063708759)]
['4932.YMR167W', 123, ('4932.YMR167W', 0.10352673492605233)]
['4932.YPL024W', 209, ('4932.YPL024W', 0.07963594994311718)]


eigenvector:
['4932.YMR190C', 6, ('4932.YMR190C', 0.10679821320995969)]
['4932.YNL088W', 23, ('4932.YNL088W', 0.08646876957883082)]
['4932.YLR234W', 66, ('4932.YLR234W', 0.06428752482548797)]
['4932.YMR167W', 99, ('4932.YMR167W', 0.05445876652636618)]
['4932.YPL024W', 135, ('4932.YPL024W', 0.047717226242775926)]


closeness:
['4932.YMR190C', 13, ('4932.YMR190C', 0.5257177033492823)]
['4932.YNL088W', 23, ('4932.YNL088W', 0.5107495642068565)]
['4932.YLR234W', 117, ('4932.YLR234W', 0.47564935064935066)]
['4932.YMR167W', 358, ('4932.YMR167W', 0.4327917282127031)]
['4932.YPL024W', 368, ('4932.YPL024W', 0.43215339233038347)]


betweenness:
['4932.YMR190C', 16, ('4932.YMR190C', 0.012125

In [152]:
# print the first 10 entries of every centrality ranking of the SGS1 cluster
# (the rankings are sorted from highest to lowest score)
important_nodes(sgs1_cluster_cent_meas,10)

degree
('4932.YBR010W', 0.3811149032992036)
('4932.YNL031C', 0.32195676905574516)
('4932.YBR009C', 0.3174061433447099)
('4932.YBR160W', 0.2832764505119454)
('4932.YDR224C', 0.2707622298065984)
('4932.YER095W', 0.2502844141069397)
('4932.YNL030W', 0.22866894197952217)
('4932.YBR136W', 0.2229806598407281)
('4932.YMR190C', 0.21501706484641636)
('4932.YML032C', 0.21274175199089873)


eigenvector
('4932.YBR010W', 0.1482960111600498)
('4932.YNL031C', 0.1304573035229128)
('4932.YER095W', 0.120144037957283)
('4932.YBR009C', 0.11525875900007806)
('4932.YBR136W', 0.11163289456637947)
('4932.YBR160W', 0.11059798230011285)
('4932.YMR190C', 0.10679821320995969)
('4932.YPL153C', 0.10520570883202644)
('4932.YML032C', 0.10351118956891829)
('4932.YDR224C', 0.10211511612207437)


closeness
('4932.YBR010W', 0.6078838174273858)
('4932.YNL031C', 0.5856095936042638)
('4932.YBR009C', 0.5798153034300791)
('4932.YBR160W', 0.5700389105058365)
('4932.YDR224C', 0.5584498094027954)
('4932.YER095W', 0.5419235511713

# other clusters and their "important" nodes

I think it is fairly pointless to look at small communities and singletons, so they are removed for the following analysis: specifically, any community with fewer than 20 nodes is dropped. We then take the top few nodes for each centrality measure in each remaining community. I think it might also be worthwhile to look at them by community size.

In [176]:
# keep only the clusters with at least 20 nodes (every ranking has one entry
# per node, so the length of the 'degree' ranking is the cluster size)
large_clusters_cent_meas = [
    measures for measures in clusters_cent_meas
    if len(measures['degree']) >= 20
]

In [190]:
# number of top-ranked nodes shown per centrality measure
nodes_to_print = 10

# print a numbered separator followed by the leading nodes of every large cluster
for counter, cluster_measures in enumerate(large_clusters_cent_meas):
    print(counter,'~~~~~~~~~~~~~~~~~~~~')
    important_nodes(cluster_measures,nodes_to_print)

0 ~~~~~~~~~~~~~~~~~~~~
degree:
('4932.YLL013C', 0.19596354166666666)
('4932.YGL026C', 0.16796875)
('4932.YBR196C', 0.12369791666666666)
('4932.YBL099W', 0.123046875)
('4932.YDR050C', 0.12044270833333333)
('4932.YCR012W', 0.1171875)
('4932.YKL211C', 0.11653645833333333)
('4932.YLL041C', 0.11328125)
('4932.YKL192C', 0.111328125)
('4932.YER069W', 0.107421875)


eigenvector:
('4932.YDR050C', 0.12172156501784563)
('4932.YBR196C', 0.12146130361943822)
('4932.YNR001C', 0.11945201156953499)
('4932.YPL262W', 0.11779783238672428)
('4932.YCR012W', 0.11400826860998146)
('4932.YGL026C', 0.11176724477178801)
('4932.YLR304C', 0.11151186138834446)
('4932.YCR005C', 0.1102609390837423)
('4932.YBL099W', 0.10840472235330212)
('4932.YKL085W', 0.10779431384183395)


closeness:
('4932.YLL013C', 0.5106382978723404)
('4932.YBL099W', 0.471889400921659)
('4932.YCR012W', 0.47116564417177914)
('4932.YGL026C', 0.4692942254812099)
('4932.YKL192C', 0.4668693009118541)
('4932.YGR192C', 0.4620938628158845)
('4932.YDR05

# comparison with clustering using other algorithm

Import pickle or json file, then calculate the maximum similarity between the two communities. My idea is that we count how many nodes are different and normalise based on number of nodes in a comm