# importing stuff and initial cleaning

In [1]:
import networkx as nx
import markov_clustering as mc
import random
import numpy as np 
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import math

In [2]:
# Load the protein-interaction network as a weighted, undirected graph.
# Lines starting with '#' in the file are skipped.
g = nx.read_weighted_edgelist('4932.protein.links.v12.0.txt', comments="#", nodetype=str)

# Remove edges whose weight is below 500.
# The edges to drop are collected first and removed in one call: deleting
# edges while iterating over the live `g.edges` view mutates the structure
# being iterated, which is fragile and can fail with
# "dictionary changed size during iteration".
low_weight_edges = [(u, v) for u, v, weight in g.edges(data='weight') if weight < 500]
g.remove_edges_from(low_weight_edges)

# Remove the weight attribute from the remaining edges so every edge counts
# the same from here on. `edges(data=True)` yields the graph's own attribute
# dicts, so popping the key edits the graph in place.
for _, _, attrs in g.edges(data=True):
    attrs.pop('weight', None)

# `node_list[i]` is the node behind row/column i of `matrix`; passing it as
# `nodelist` makes that correspondence explicit.
node_list = list(g.nodes)
matrix = nx.to_numpy_array(g, nodelist=node_list)

# SGS1 (4932.YMR190C) and the related proteins looked up later on.
related_proteins = ['4932.YMR190C','4932.YNL088W','4932.YLR234W','4932.YPL024W','4932.YMR167W' ]

# functions

In [None]:
def mcl(mtx, inflation_parameter):
    """Run Markov clustering on ``mtx`` and return the clusters by node name.

    Args:
        mtx: Adjacency matrix handed to ``mc.run_mcl``. Its row/column order
            is assumed to match the module-level ``node_list`` -- the indices
            reported by the clustering are looked up there.
        inflation_parameter: Value forwarded as the ``inflation`` argument of
            ``mc.run_mcl``.

    Returns:
        A 3-tuple ``(inflation_parameter, result, clusters)`` where ``result``
        is whatever ``mc.run_mcl`` returned and ``clusters`` is a list of
        tuples of node names, one tuple per cluster.
    """
    result = mc.run_mcl(mtx, inflation=inflation_parameter)

    # The clusters come back as tuples of matrix indices; translate every
    # index into the corresponding node name.
    clusters = [
        tuple(node_list[member] for member in index_cluster)
        for index_cluster in mc.get_clusters(result)
    ]

    return inflation_parameter, result, clusters

def clu_to_adj_mtx(cluster):
    """Return the adjacency sub-matrix of the nodes in ``cluster``.

    Args:
        cluster: Iterable of node names; every name must occur in the
            module-level ``node_list`` (``list.index`` raises ``ValueError``
            otherwise).

    Returns:
        The square block of the module-level ``matrix`` restricted to the
        cluster's nodes, with rows and columns in the order of ``cluster``.
    """
    positions = [node_list.index(member) for member in cluster]

    # Pick the cluster's rows first, then the matching columns.
    cluster_rows = matrix[positions, :]
    return cluster_rows[:, positions]
    
def adj_mtx_to_graph(mat, name):
    """Build a graph from an adjacency matrix and rename its nodes.

    Args:
        mat: Adjacency matrix passed to ``nx.from_numpy_array``.
        name: Relabelling passed to ``nx.relabel_nodes`` as its mapping
            argument (old node label -> new node label).

    Returns:
        The relabelled graph returned by ``nx.relabel_nodes``.
    """
    index_graph = nx.from_numpy_array(mat)
    return nx.relabel_nodes(index_graph, name)

def graph_to_cent_meas(graph):
    """Compute several centrality rankings for ``graph``.

    Args:
        graph: Graph handed to the networkx centrality functions.

    Returns:
        A dict with the keys ``'degree'``, ``'eigenvector'``, ``'closeness'``
        and ``'betweenness'`` (in that order). Each value is a list of
        ``(node, score)`` pairs sorted by score, highest first.
    """
    def ranked(scores):
        # Highest-scoring nodes first.
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    # NOTE: Katz centrality (nx.katz_centrality) is not computed here.
    measures = (
        ('degree', nx.degree_centrality),
        ('eigenvector', nx.eigenvector_centrality),
        ('closeness', nx.closeness_centrality),
        ('betweenness', nx.betweenness_centrality),
    )

    return {label: ranked(compute(graph)) for label, compute in measures}

# clustering

In [None]:
# 1.4 is the inflation parameter that maximises modularity
inflation_parameter = 1.4

# Use the variable rather than repeating the literal, so the value only has
# to be changed in one place.
markov_clustering_14 = mcl(matrix, inflation_parameter)
clusters = markov_clustering_14[2]  # third element: clusters as tuples of node names

# sort the clusters by size, largest first
clusters = sorted(clusters, key=len, reverse=True)

# weight=None treats every edge as weight 1. The previous weight='None' was
# the *string* 'None', i.e. the name of an edge attribute that does not
# exist; it only worked because missing attributes also default to 1.
modularity = nx.community.modularity(g, clusters, weight=None, resolution=1)

In [None]:
# Adjacency sub-matrix of every cluster (same order as `clusters`).
clusters_adj_mtx = [clu_to_adj_mtx(cluster) for cluster in clusters]

# One graph per cluster. adj_mtx_to_graph needs a relabelling as its second
# argument (calling it with the matrix alone raises TypeError). Row i of a
# cluster's sub-matrix belongs to the i-th protein of that cluster, so
# position -> protein name is the mapping that gives the graph nodes their
# protein IDs.
clusters_graph = [
    adj_mtx_to_graph(adj_mtx, dict(enumerate(cluster)))
    for adj_mtx, cluster in zip(clusters_adj_mtx, clusters)
]

# Centrality rankings of every cluster graph.
clusters_cent_meas = [graph_to_cent_meas(graph) for graph in clusters_graph]

# sgs1 cluster and its centrality measure 

In [None]:
# find the index of the cluster that contains SGS1 (4932.YMR190C)
# Start from None so a stale value from an earlier run of this cell can never
# be picked up silently when the protein is not found.
sgs1_clu_index = None
for index, cluster in enumerate(clusters):
    if '4932.YMR190C' in cluster:
        # no break: if several clusters contain the protein, the last match is kept
        sgs1_clu_index = index

if sgs1_clu_index is None:
    raise ValueError("protein '4932.YMR190C' was not found in any cluster")

In [None]:
# Inspect the cluster that contains SGS1.
sgs1_cluster = clusters[sgs1_clu_index]
sgs1_graph = clusters_graph[sgs1_clu_index]

# Size of the cluster (number of nodes).
print(len(sgs1_cluster))

# Plot the cluster's graph.
nx.draw(sgs1_graph)

# Centrality rankings for every node in the cluster (kept as the last
# expression of the cell so the notebook displays it).
clusters_cent_meas[sgs1_clu_index]

In [None]:
# position (0 = most central) of SGS1 and the related proteins in each
# centrality ranking of the SGS1 cluster
for measure, ranking in clusters_cent_meas[sgs1_clu_index].items():
    print(measure)

    # Each ranking is a list of (node, score) pairs, so a protein name has to
    # be matched against the first element of each pair; calling
    # ranking.index(name) directly never finds it and raises ValueError.
    rank_of = {node: rank for rank, (node, _) in enumerate(ranking)}

    for protein in related_proteins:
        # A related protein may sit in a different cluster; report that
        # instead of failing.
        print(protein, rank_of.get(protein, 'not in cluster'))
    print('')

In [None]:
# first 6 nodes sorted by centrality, for each centrality measure
for measure, ranking in clusters_cent_meas[sgs1_clu_index].items():
    print(measure)
    # Slicing prints fewer entries when the cluster has under 6 nodes,
    # where indexing 0..5 would raise IndexError.
    for entry in ranking[:6]:
        print(entry)
    print('')

# other clusters and their "important" nodes

Small communities and singletons are not very informative to look at, so they are removed from the following analysis; specifically, any community with fewer than 20 nodes is dropped. We then take the top few nodes for each centrality measure in each remaining community. It might also be worthwhile to look at them based on community size.

In [None]:
# Keep only the clusters with at least 20 nodes. The length of the 'degree'
# ranking is used as the cluster size, since it holds one entry per node.
large_clusters_cent_meas = [
    cent_meas
    for cent_meas in clusters_cent_meas
    if len(cent_meas['degree']) >= 20
]

In [None]:
# prints the first 6 nodes of every centrality ranking of every large cluster
for cent_meas in large_clusters_cent_meas:
    for measure, ranking in cent_meas.items():
        print(measure)
        # Slice instead of indexing 0..5 (the original inner `for` line was
        # also missing its colon, which is a SyntaxError).
        for entry in ranking[:6]:
            print(entry)
        print('')

# comparison with clustering using other algorithm

Import the pickle or JSON file, then calculate the maximum similarity between the two communities. The idea is to count how many nodes differ and normalise by the number of nodes in a cluster.