Leiden followed by C means clustering of regulome

First make a proof of concept using igraph

Questions:
- How to measure distance
- How to measure membership
- How to treat small clusters - leave and expand or remove and add nodes to new?

If it works:
- make faster implementation

In [1]:
# Import packages

import igraph as ig
import pandas as pd
import numpy as np
import seaborn as sns
from compress_pickle import load, dump
import dask

def open_pickle(file):
    with open(path+file, 'rb') as pickle_file:
        return load(path=pickle_file, compression='infer')

path = './files/'

In [2]:
# load regulome
regulome_network_edges = pd.read_csv(filepath_or_buffer=path+'/human_regulome_pd.gz', compression='infer')
regulome_network_edges = regulome_network_edges.astype({'ProteinAid': 'str', 'ProteinBid':'str'})

regulome_graph = ig.Graph.DataFrame(regulome_network_edges, directed=False, use_vids=False)
msigdb_c3_tft_dict = open_pickle('msigdb_c3_tft_dict.pkl')

In [3]:
def leiden_clusters(graph, res, b, n_iter):
    return graph.community_leiden(objective_function='modularity',
                            weights='PPV',
                            resolution=res,
                            beta=b,
                            n_iterations=n_iter)

clusters = leiden_clusters(regulome_graph, 4, 0.1, 10)

clusters_dict = {n:set(cluster) for (n, cluster) in enumerate(clusters)}
proteins = pd.concat([regulome_network_edges['ProteinAid'], regulome_network_edges['ProteinBid']]).unique()

In [None]:
regulome_network_edges

In [14]:
import itertools as it
from itertools import chain

distances_dict = dict()
memberships_dict = dict()
graph = regulome_graph
clusters = clusters_dict
m = 0.5

optimize = True


def calc_distances(node):
    distances = dict()
    neighbors = set(graph.neighbors(node))
    for (cluster_id, cluster) in clusters.items():
        connected_nodes = neighbors.intersection(cluster)
        if len(connected_nodes) == 0:
            distances[cluster_id] = 1
            continue
 
        weights = [graph.es(graph.get_eid(node, connected_node))['PPV'] for connected_node in connected_nodes]
        weights_sum = sum(chain.from_iterable(weights))

        if len(connected_nodes) == 0: # maybe keep in case somehow in other appplications weights can be 0
            distances[cluster_id] = 1
            continue

        distances[cluster_id] = 1 / weights_sum # maybe make max distance 1?
    return distances


def calc_membership(distances):
    memberships = dict()
    fuzzy_exp = 2 / (m - 1)

    for (cluster_id, dij) in distances.items():
        denominator = sum([(dij/dik)**fuzzy_exp for dik in distances.values()])
        memberships[cluster_id] = 1 / denominator
    return memberships


def node_J_contribution(distances, memberships):
    Ji = 0
    for (distance, membership) in zip(distances.values(), memberships.values()):
        Ji += (membership ** m) * (distance ** 2)
    return Ji


def network_c_means():
    # maybe filter clusters first - sort of cluster QC
    J = 0
    n_nodes = graph.vcount()
    for i, node in enumerate(graph.vs):
        node = node['name']
        print(f'{i}/{n_nodes}')
        distances_dict[node] = calc_distances(node)
        memberships_dict[node] = calc_membership(distances_dict[node])

        if optimize:
            J += node_J_contribution(distances_dict[node], memberships_dict[node])

    print(J)

network_c_means()

0/15041
1/15041
2/15041
3/15041
4/15041
5/15041
6/15041
7/15041
8/15041
9/15041
10/15041
11/15041
12/15041
13/15041
14/15041
15/15041
16/15041
17/15041
18/15041
19/15041
20/15041
21/15041
22/15041
23/15041
24/15041
25/15041
26/15041
27/15041
28/15041
29/15041
30/15041
31/15041
32/15041
33/15041
34/15041
35/15041
36/15041
37/15041
38/15041
39/15041
40/15041
41/15041
42/15041
43/15041
44/15041
45/15041
46/15041
47/15041
48/15041
49/15041
50/15041
51/15041
52/15041
53/15041
54/15041
55/15041
56/15041
57/15041
58/15041
59/15041
60/15041
61/15041
62/15041
63/15041
64/15041
65/15041
66/15041
67/15041
68/15041
69/15041
70/15041
71/15041
72/15041
73/15041
74/15041
75/15041
76/15041
77/15041
78/15041
79/15041
80/15041
81/15041
82/15041
83/15041
84/15041
85/15041
86/15041
87/15041
88/15041
89/15041
90/15041
91/15041
92/15041
93/15041
94/15041
95/15041
96/15041
97/15041
98/15041
99/15041
100/15041
101/15041
102/15041
103/15041
104/15041
105/15041
106/15041
107/15041
108/15041
109/15041
110/15041


In [4]:
import itertools as it
from itertools import chain
from dask import delayed
from dask.distributed import LocalCluster, Client
from dask.diagnostics import ProgressBar
import joblib
from tqdm import tqdm

distances_dict = dict()
memberships_dict = dict()
cores = 2
memory_limit = '8GB'
J = 0
optimize = True
m = 0.5


# graph = regulome_graph
# clusters = clusters_dict


dask_cluster = LocalCluster(scheduler_port='41395')
client = Client(n_workers=2, threads_per_worker=1, memory_limit=memory_limit)

graph_future = client.scatter(regulome_graph, broadcast=True)
clusters_future = client.scatter(clusters_dict, broadcast=True)

def calc_distances(node, graph, clusters):
    distances = dict()
    neighbors = set(graph.neighbors(node))
    for (cluster_id, cluster) in clusters.items():
        connected_nodes = neighbors.intersection(cluster)
        if len(connected_nodes) == 0:
            distances[cluster_id] = 1
            continue
 
        weights = [graph.es(graph.get_eid(node, connected_node))['PPV'] for connected_node in connected_nodes]
        weights_sum = sum(chain.from_iterable(weights))

        if len(connected_nodes) == 0: # maybe keep in case somehow in other appplications weights can be 0
            distances[cluster_id] = 1
            continue

        distances[cluster_id] = 1 / weights_sum # maybe make max distance 1?
    return distances


def calc_membership(distances, m):
    
    memberships = dict()
    fuzzy_exp = 2 / (m - 1)

    for (cluster_id, dij) in distances.items():
        denominator = sum([(dij/dik)**fuzzy_exp for dik in distances.values()])
        memberships[cluster_id] = 1 / denominator
    return memberships


def node_J_contribution(distances, memberships, m):
    Ji = 0
    for (distance, membership) in zip(distances.values(), memberships.values()):
        Ji += (membership ** m) * (distance ** 2)

# @delayed
def node_iteration(node, graph, clusters, m=0.5):
    distances = calc_distances(node, graph, clusters)
    memberships = calc_membership(distances, m)
    if optimize:
        return node_J_contribution(distances, memberships, m)
    return 

def create_delayed_batches(nodes):
    batch_size = int(len(nodes)/2)
    batches = list()
    for i in range(0, len(nodes), batch_size):
        batches.append(nodes[i:i+batch_size])
    return batches

def network_c_means():
    # maybe filter clusters first - sort of cluster QC
    nodes = [node['name'] for node in graph_future.result().vs]    
    batches = create_delayed_batches(nodes)
    results = list()
    for batch in batches:
        task = [delayed(node_iteration)(node, graph_future, clusters_future, m) for node in batch]
        results.append(client.compute(task))

    client.close()

    print(J)

network_c_means()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 44009 instead
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
2024-11-29 14:16:45,277 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/home/vinic/miniconda3/envs/regulome/lib/python3.13/site-packages/distributed/comm/tcp.py", line 225, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/vinic/miniconda3/envs/regulome/lib/python3.13/site-packages/distributed/wo

0


In [6]:
client.close()

In [None]:
from itertools import chain

def network_c_means(graph, clusters):
    membership_dict = dict()

    # maybe remove small clusters before


    for node in graph.vs:
        node_membership = dict()
        node = node['name']
        neighbors = set(graph.neighbors(node))
        # maybe filter the clusters to which the node is conencted
        for (cluster_id, cluster) in clusters.items():
            connected_nodes = set(neighbors).intersection(cluster)
            
            if len(connected_nodes) == 0:
                node_membership[cluster_id] = 1
                continue

            weights = ([graph.es(graph.get_eid(node, connected_node))['PPV'] for connected_node in connected_nodes])
            weights_sum = sum(chain.from_iterable(weights))
            
            if weights_sum == 0: # dont need this myself, but a thing to look at if want to ship out later
                node_membership[cluster_id] = 1
                continue

            node_membership[cluster_id] = 1 / weights_sum
        
        membership_dict[node] = node_membership 
    return pd.DataFrame.from_dict(membership_dict, orient='columns')
            


network_c_means(regulome_graph, clusters_dict)

In [None]:


def network_c_means(clusters, size_lim, fuzzy):
     