In [4]:
# import packages
import pandas as pd
import igraph as ig
import seaborn as sns
from statistics import mean, median, stdev
from matplotlib.pyplot import subplots
import pickle
import itertools

In [5]:
# Read the tab-separated regulome edge list from disk.
# (For interactive visualisation, Cytoscape or Graphistry can be used instead.)
regulome_network = pd.read_csv("./files/humanRegulomeNetwork", sep="\t")

# Keep the two endpoint columns first, followed by the two score columns;
# ig.Graph.DataFrame reads the first two columns as edge endpoints and
# stores the remaining ones as edge attributes.
edge_columns = ['0:ProteinA', '1:ProteinB', '5:PPV', '18:LLR_GRG']
regulome_network_edges = regulome_network[edge_columns]

# Build the directed iGraph network; use_vids=False means the endpoint
# columns hold vertex names rather than integer vertex ids.
regulome_network_graph = ig.Graph.DataFrame(
    regulome_network_edges,
    directed=True,
    use_vids=False,
)

**Network Properties**

In [None]:
# Node sets. '0:ProteinA' is the source column of each directed edge and
# '1:ProteinB' the target column; this notebook treats every source as a TF.
TFs = set(regulome_network_edges['0:ProteinA'])
targets = set(regulome_network_edges['1:ProteinB'])
non_TF_targets = targets - TFs  # targets that never appear as a source

# Total (in + out) degree of every vertex, ignoring self-loops.
degree_distr = regulome_network_graph.degree(mode='all', loops=False)

# Clustering coefficients. The average local coefficient is weighted by PPV and
# counts vertices with fewer than two neighbours as 0 (mode='zero'); the global
# coefficient is unweighted.
# TODO: decide which mode to use for the average local coefficient - 'nan' or 'zero'?
avglocal_clusteringcoeff = regulome_network_graph.transitivity_avglocal_undirected(mode='zero', weights='5:PPV')
global_cluseringcoeff = regulome_network_graph.transitivity_undirected(mode='nan')

# All-pairs shortest path lengths are expensive to compute, so they are cached
# on disk: load the cache when it exists, otherwise compute it once and save it.
try:
    with open('./files/path_lengths.pkl', 'rb') as lengths_file:
        # NOTE: pickle.load can execute arbitrary code - only load a cache file
        # that was produced by this notebook.
        path_lengths = pickle.load(lengths_file)
except FileNotFoundError:
    # PPV-weighted distances with edge direction ignored (mode='all').
    distance_matrix = regulome_network_graph.distances(source=None, target=None, weights='5:PPV', mode='all')
    # Flatten the matrix and drop zero distances (a vertex to itself) as well as
    # infinite ones (unreachable pairs), which would otherwise make the mean
    # and the diameter infinite.
    path_lengths = [
        length
        for length in itertools.chain.from_iterable(distance_matrix)
        if 0 < length < float('inf')
    ]
    with open('./files/path_lengths.pkl', 'wb') as lengths_file:
        pickle.dump(path_lengths, lengths_file)


In [None]:
# Sizes of the node sets
print(f"Number of TFs: {len(TFs)}")
print(f"Number of targets: {len(targets)}")
print(f"Number of non-TF targets: {len(non_TF_targets)}")

# Degree distribution summary.
# A handful of extremely well-connected nodes inflate the standard deviation
# and show up as outliers at the top of the range.
degree_report = (
    f"mean degree: {mean(degree_distr)}"
    f"\nmedian degree: {median(degree_distr)}"
    f"\nsd degree: {stdev(degree_distr)}"
    f"\nhighest degree: {max(degree_distr)}"
    f"\nlowest degree: {min(degree_distr)}"
)
print(degree_report)

# Clustering coefficients.
# Large average-local but small global coefficient: will this generate many
# smaller clusters? Few connections between TF-target groups?
clustering_report = (
    f"\nClustering Coefficients:"
    f"\nAverage local: {avglocal_clusteringcoeff}"
    f"\nGlobal: {global_cluseringcoeff}"
)
print(clustering_report)

# Path lengths
path_report = (
    f"\nPath lengths:"
    f"\nAverage: {mean(path_lengths)}"
    f"\nDiameter (longest): {max(path_lengths)}"
)
print(path_report)

# Two side-by-side histograms: degrees on the left, path lengths on the right.
fig, axes = subplots(1, 2, figsize=(12, 4))
degree_ax, path_ax = axes

# Degree histogram (looks like a power-law distribution); bins cover 0-250 only.
sns.histplot(degree_distr, binwidth=5, binrange=(0, 250), ax=degree_ax)
degree_ax.set_title("Degree distribution")

sns.histplot(path_lengths, binwidth=1, ax=path_ax)
path_ax.set_title("path lengths")



In [None]:
# Visualize a small portion of the network: the high-confidence edges that
# start at protein P20226 or Q16254.

# Both masks are built from the full edge table so that they share its index.
# Filtering the already-subset frame with a mask taken from the full frame
# makes pandas reindex the mask and emit a "Boolean Series key will be
# reindexed" warning.
mask = regulome_network_edges['0:ProteinA'].isin(['P20226', 'Q16254'])
high_confidence = regulome_network_edges['5:PPV'] > 0.99

P20226_network = regulome_network_edges[mask & high_confidence]
P20226_network_graph = ig.Graph.DataFrame(P20226_network, directed=True, use_vids = False)
# summary() only returns a string; print it so it shows up even though it is
# not the last expression of the cell.
print(P20226_network_graph.summary())

# Label every vertex with its protein identifier.
labels = P20226_network_graph.vs['name']

ig.plot(P20226_network_graph, vertex_label=labels)



In [None]:
# Just for fun: look at the surroundings of the most highly connected target.

# Number of edges pointing at each target. value_counts() sorts in descending
# order, so the first index label is the target with the most edges.
target_degrees = regulome_network_edges['1:ProteinB'].value_counts()
highest_degree_target = target_degrees.index[0]

# Every vertex within two steps of that target, ignoring edge direction,
# followed by the sub-graph induced on those vertices.
# NOTE(review): the variable is named after Q8IV63, presumably the top target
# when this was written - confirm it still matches highest_degree_target.
two_step_neighbourhood = regulome_network_graph.neighborhood(
    vertices=highest_degree_target,
    order=2,
    mode='all',
)
shared_Q8IV63 = regulome_network_graph.induced_subgraph(two_step_neighbourhood)
ig.plot(shared_Q8IV63)