In [None]:
import pathlib
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from networkx.drawing.nx_agraph import graphviz_layout

In [None]:
def create_graph_from_interactions(filename, sheet, source, target):
    """Build a graph from one worksheet of an Excel interaction table.

    Parameters
    ----------
    filename : path of the Excel workbook, handed to ``pd.read_excel``.
    sheet : name of the worksheet to read (one interaction per row).
    source, target : names of the two columns holding the edge endpoints.

    Returns
    -------
    The graph produced by ``nx.from_pandas_edgelist`` with its default
    settings; one edge is added per row of the worksheet.
    """
    edge_table = pd.read_excel(filename, sheet_name=sheet)
    return nx.from_pandas_edgelist(edge_table, source=source, target=target)

def draw_net(G, ax=None, labels=True):
    """Draw ``G`` in a silver colour scheme using a Graphviz ``fdp`` layout.

    Parameters
    ----------
    G : graph to draw.
    ax : matplotlib Axes to draw on; defaults to the current Axes.
    labels : when true, node labels are drawn in a green rounded box.

    Returns
    -------
    The Axes that was drawn on.
    """
    if ax is None:
        ax = plt.gca()

    pos = graphviz_layout(G, prog='fdp')

    # Each networkx drawing helper gets only the keywords it understands.
    # Passing one mixed dict to all three (as was done before) raises
    # TypeError on networkx releases whose helpers no longer swallow
    # unknown keyword arguments.
    node_style = {
        'node_color': 'silver',
        'edgecolors': 'k',
        'node_size': 150,
    }
    edge_style = {
        'edge_color': 'silver',
        'node_size': 150,
    }
    label_style = {
        'font_color': 'k',
        'bbox': dict(facecolor="mediumseagreen", edgecolor='black', boxstyle='round, pad=0.2', alpha=1),
    }

    nx.draw_networkx_nodes(G, ax=ax, pos=pos, **node_style)
    nx.draw_networkx_edges(G, ax=ax, pos=pos, **edge_style)

    if labels:
        nx.draw_networkx_labels(G, ax=ax, pos=pos, **label_style)

    return ax
    
def net_info(G):
    """Return a one-line summary of the node and edge counts of ``G``."""
    node_count = len(G)
    edge_count = len(G.edges)
    return "{} nodes and {} edges".format(node_count, edge_count)

def highlight_subgraph(G, subgraph, ax=None, color='red', labels=True):
    """Draw ``G`` and overlay ``subgraph`` on top of it in ``color``.

    Parameters
    ----------
    G : full graph; drawn first via ``draw_net``.
    subgraph : graph whose nodes are all present in ``G``; its nodes, edges
        and (optionally) labels are redrawn in the highlight colour.
    ax : matplotlib Axes to draw on; defaults to the current Axes.
    color : highlight colour for nodes, edges and label boxes. It was
        previously accepted but ignored in favour of a hard-coded ``'red'``.
    labels : whether to draw labels, for both the base network and the
        highlighted subgraph. It was previously only honoured for the
        subgraph; the base network was always labelled.

    Returns
    -------
    The Axes that was drawn on (mirrors ``draw_net``).
    """
    if ax is None:
        ax = plt.gca()

    draw_net(G, labels=labels, ax=ax)
    # draw_net computes its layout internally and does not expose it, so the
    # same layout call is repeated here on the full graph to position the
    # overlay. NOTE(review): this relies on fdp returning identical positions
    # for identical input - confirm.
    pos = graphviz_layout(G, prog='fdp')

    # Separate keyword sets per drawing helper: networkx releases whose
    # helpers no longer swallow unknown keywords reject a shared, mixed dict.
    node_style = {
        'node_color': color,
        'edgecolors': 'k',
        'node_size': 150,
    }
    edge_style = {
        'edge_color': color,
        'node_size': 150,
    }
    label_style = {
        'font_color': 'k',
        'bbox': dict(facecolor=color, edgecolor='black', boxstyle='round, pad=0.2', alpha=1),
    }

    nx.draw_networkx_nodes(subgraph, pos=pos, ax=ax, **node_style)
    nx.draw_networkx_edges(subgraph, pos=pos, ax=ax, **edge_style)
    if labels:
        nx.draw_networkx_labels(subgraph, ax=ax, pos=pos, **label_style)

    return ax

### Create graph from Y2H interactions

In [None]:
# Arguments are, in order: workbook path, worksheet, source column, target column.
G_Y2H = create_graph_from_interactions(
    "../data/publications/journal.pgen.1003398.s008.xlsx",
    "PPIs from Y2H screen Fig1B,C",
    "Entrez gene ID A human",
    "Entrez gene ID B human",
)

print(net_info(G_Y2H))
# Optional visual check (disabled): plt.figure(figsize=(13, 8)); draw_net(G_Y2H)

### Create graph from UniHI MAN interactions

In [None]:
# Arguments are, in order: workbook path, worksheet, source column, target column.
G_unihi_man = create_graph_from_interactions(
    "../data/publications/journal.pgen.1003398.s008.xlsx",
    "Enrichment UniHI MAN Fig2",
    "Entrez gene ID A human",
    "Entrez gene ID B human",
)

print(net_info(G_unihi_man))
# Optional visual check (disabled): plt.figure(figsize=(13, 8)); draw_net(G_unihi_man)

### Create graph from extended interactions

In [None]:
# Arguments are, in order: workbook path, worksheet, source column, target column.
# Note that this sheet's ID columns carry no " human" suffix, unlike the other two sheets.
G_extension = create_graph_from_interactions(
    "../data/publications/journal.pgen.1003398.s008.xlsx",
    "Extension interactions Fig2",
    "Entrez gene ID A",
    "Entrez gene ID B",
)

print(net_info(G_extension))
# Optional visual check (disabled): plt.figure(figsize=(13, 8)); draw_net(G_extension)


### Combine the three networks together

In [None]:
# Tag used in the names of the output files written further down.
graph_name = 'full'

# Build the network in layers: Y2H core, then + UniHI MAN, then + extension.
G_1 = G_Y2H  # same object as G_Y2H, not a copy
G_2 = nx.compose(G_Y2H, G_unihi_man)
G_3 = nx.compose(G_2, G_extension)

print(net_info(G_3))
# Recorded sizes, for reference:
#   paper:                134 nodes with 625 edges
#   Y2H network:           39 nodes and 150 edges
#   Y2H & UniHI network:   46 nodes and 212 edges
#   Full network:         134 nodes with 624 edges

### Highlight the nodes that come from the core Y2H interactions

In [None]:
# Full network with the Y2H core overlaid in the highlight colour.
fig = plt.figure(figsize=(20, 15))
ax = fig.add_subplot(1, 1, 1)
highlight_subgraph(G_3, G_Y2H, ax=ax)
plt.show()

### Combine the two lists of expected proteins

In [None]:
# Both protein lists are reduced to the same four columns so they can be stacked.
# The core sheet keeps its first four columns; the extension sheet keeps columns 1-4 (0-based).
proteins_core = pd.read_excel(
    "../data/publications/journal.pgen.1003398.s008.xlsx",
    sheet_name="46 circadian components Fig1",
).iloc[:, 0:4]
proteins_core.columns = ['entrez_human', 'entrez_mouse', 'symbol_human', 'symbol_mouse']

proteins_extension = pd.read_excel(
    "../data/publications/journal.pgen.1003398.s008.xlsx",
    sheet_name="Extension proteins Fig2",
).iloc[:, 1:5]
proteins_extension.columns = ['entrez_human', 'entrez_mouse', 'symbol_human', 'symbol_mouse']

# Expected to hold 134 unique proteins (no de-duplication is performed here).
proteins = pd.concat([proteins_core, proteins_extension], ignore_index=True)
expected_proteins = list(proteins['entrez_human'].values)

### For every protein that's actually involved in an interaction, add data from the 'expected proteins' table

In [None]:
# One row per node of the combined network, annotated from the expected-protein table.
# Left join: nodes missing from `proteins` are kept, with NaN in the annotation columns.
interacting_proteins = pd.DataFrame({'entrez_human': list(G_3.nodes)})
interacting_proteins = pd.merge(interacting_proteins, proteins, how='left', on='entrez_human')

# Human Entrez ID -> human gene symbol, used later to relabel the graph.
entrez_to_symbol = dict(
    zip(interacting_proteins['entrez_human'], interacting_proteins['symbol_human'])
)

### Check that all expected proteins are involved in an interaction, and vice versa

In [None]:
# Expected proteins that never appear as a node of the combined network.
proteins_not_in_any_interaction = [
    expected for expected in expected_proteins if not G_3.has_node(expected)
]

# Network nodes that are absent from the expected-protein list.
unexpected_interacting_proteins = [
    node for node in G_3.nodes if node not in expected_proteins
]

# Both lists should print empty if the two sources agree.
print(sorted(unexpected_interacting_proteins))
print(sorted(proteins_not_in_any_interaction))

### Merge Affy IDs with Entrez IDs

In [None]:
# Expression matrix: one row per Affymetrix probe (index), one column per sample.
temporal_node_data = pd.read_csv('../data/temporal_data/circadian/GSE11923_series_matrix.txt', sep='\t', header=63, index_col=0, skipfooter=1, engine='python')
affy_to_entrez = pd.read_csv('../data/genes/DAVID_affy_to_entrez.txt', sep='\t', index_col=0)

# Attach the Entrez ID to every probe (NaN where the mapping has no entry) and
# keep the probe ID as an ordinary column so it survives the merge below.
temporal_node_data = temporal_node_data.merge(affy_to_entrez['entrez'], how='left', left_index=True, right_index=True)
temporal_node_data['affy'] = temporal_node_data.index

# pandas treats null merge keys as equal to each other, so a protein without a
# mouse Entrez ID would be joined to every probe without an Entrez mapping.
# Rows with a missing key are dropped on both sides before the inner join.
temporal_node_data = interacting_proteins.dropna(subset=['entrez_mouse']).merge(
    temporal_node_data.dropna(subset=['entrez']),
    how='inner',
    left_on='entrez_mouse',
    right_on='entrez',
)

# Put the identifier columns first, drop the redundant 'entrez' join key, and
# keep the sample columns in their original order.
id_columns = ['affy', 'entrez_mouse', 'entrez_human', 'symbol_mouse', 'symbol_human']
time_columns = [
    column for column in temporal_node_data.columns
    if column not in id_columns and column != 'entrez'
]
temporal_node_data = temporal_node_data[id_columns + time_columns]

# Rename the sample columns to consecutive integers 1..n.
columns = id_columns + list(range(1, len(time_columns) + 1))
temporal_node_data.columns = columns
temporal_node_data = temporal_node_data.sort_values('entrez_mouse')

### Restrict to genes of a certain cycle length

In [None]:
# Recorded counts from earlier runs, for reference:
#
# 130 genes in Y2H & UniHI network
#   24 hour genes: 3667 in list, of which 32 are in static network
#
# 107 genes in Y2H & UniHI network
#   24 hour genes: 3667 in list, of which 28 are in static network
#
# 357 genes in full static network
#    8 hour genes: 63 in list, of which 0 are in static network
#   12 hour genes: 260 in list, of which 7 are in static network
#   24 hour genes: 3667 in list, of which 58 are in static network

cyclic_proteins_filename = '../data/genes/24_hour_genes.XLS'
cyclic_proteins = pd.read_excel(cyclic_proteins_filename, sheet_name='Sheet1')

# Keep only the probes that also appear in the cyclic-gene list.
cyclic_temporal_node_data = pd.merge(
    temporal_node_data,
    cyclic_proteins,
    how='inner',
    left_on='affy',
    right_on='Probe Set ID',
)

# Prefix inserted into output file names; derived from the gene-list file name.
# Use cycle = '' instead to omit the prefix.
cycle = pathlib.Path(cyclic_proteins_filename).stem + '_'

### Plot subgraph induced by cyclic genes

In [None]:
# Work on a relabelled copy (Entrez ID -> gene symbol); G_3 itself is left untouched.
symbolic_G_3 = nx.relabel_nodes(G_3, entrez_to_symbol, copy=True)

# Subgraph induced by the symbols of the cyclic genes.
cyclic_symbols = cyclic_temporal_node_data['symbol_human'].values
cyclic_genes_subgraph = symbolic_G_3.subgraph(cyclic_symbols)
print(net_info(cyclic_genes_subgraph))

fig = plt.figure(figsize=(20, 15))
ax = fig.add_subplot(1, 1, 1)
highlight_subgraph(symbolic_G_3, cyclic_genes_subgraph, ax=ax)
fig.savefig(f'../data/output/{graph_name}_circadian_network_with_24h_genes_subgraph.png', dpi=300)


### Create a table of temporal node data using the mean

IMPORTANT: Affy probe IDs do not map one-to-one onto Entrez IDs, so a single Entrez ID often corresponds to
several time series. The cell below averages all time series that share an Entrez ID.

In [None]:
max_time = 48

# Per-symbol series are collected in dicts and turned into DataFrames once at
# the end; inserting columns one by one into a DataFrame fragments it and
# triggers pandas performance warnings for networks with many nodes.
mean_by_symbol = {}
normalised_by_symbol = {}

# NOTE(review): rows are taken from `temporal_node_data`, not from
# `cyclic_temporal_node_data`, although the output file names carry the
# `cycle` tag - confirm that the unrestricted table is what is intended.
for entrez_human in G_3.nodes:
    affy_proteins = temporal_node_data.loc[temporal_node_data['entrez_human'] == entrez_human]
    if affy_proteins.empty:
        print(f'Human entrez ID {entrez_human} either has no temporal data or no affy ID')
        continue

    symbol = affy_proteins['symbol_human'].values[0]
    # The first 5 columns are identifiers; the next `max_time` are time points.
    # Transposed so that rows are time points and columns are probes.
    series = affy_proteins.iloc[:, 5:5 + max_time].T
    # Average over all probes that map to this Entrez ID.
    mean = series.mean(axis=1)
    mean_by_symbol[symbol] = mean

    # Min-max scale to [0, 1]; a constant series has zero range and yields NaN.
    difference = mean.max() - mean.min()
    normalised_by_symbol[symbol] = (mean - mean.min()) / difference

temporal_node_data_mean = pd.DataFrame(mean_by_symbol)
temporal_node_data_mean_normalised = pd.DataFrame(normalised_by_symbol)

temporal_node_data_mean.to_csv(f'../data/temporal_data/circadian/circadian_temporal_node_data_{cycle}mean_{graph_name}.csv', sep='\t')
temporal_node_data_mean_normalised.to_csv(f'../data/temporal_data/circadian/circadian_temporal_node_data_{cycle}mean_normalised_{graph_name}.csv', sep='\t')

### Create a table of temporal node data by just picking the first corresponding time series

In [None]:
# temporal_node_data_first = pd.DataFrame()
# temporal_node_data_first_normalised = pd.DataFrame()
# max_time = 48
#
# for entrez_human in G_3.nodes:
#     affy_proteins = temporal_node_data.loc[temporal_node_data['entrez_human'] == entrez_human]
#     if not affy_proteins.empty:
#         first_series = affy_proteins[affy_proteins.columns[5:5 + max_time]].iloc[0].T
#         temporal_node_data_first[entrez_human] = first_series
#         temporal_node_data_first_normalised[entrez_human] = first_series / first_series.max()
#
# temporal_node_data_first.to_csv(f'../data/temporal_data/circadian_temporal_node_data_{pathlib.Path(cyclic_proteins_filename).stem}_first_{max_time}.csv', sep='\t')
# temporal_node_data_first_normalised.to_csv(f'../data/temporal_data/circadian_temporal_node_data_{pathlib.Path(cyclic_proteins_filename).stem}_first_{max_time}_normalised.csv', sep='\t')

### Plot all of the temporal data for each Entrez ID

In [None]:
# max_time = 48
#
# # Plot temporal data for each Affymetrix ID for each protein, as well as mean and normalised versions of this plot.
# for entrez_human in G_3.nodes:
#     affy_proteins = temporal_node_data.loc[temporal_node_data['entrez_human'] == entrez_human]
#     if affy_proteins.empty:
#         print(f'No data for {entrez_human}')
#     else:
#         series = affy_proteins[affy_proteins.columns[5:5 + max_time]].T
#         mean = series.apply(lambda row: row.mean(), axis=1)
#         normalised = series.copy()
#         for column in list(normalised.columns):
#             minimum = normalised[column].min()
#             difference = normalised[column].max() - minimum
#             normalised[column] = (normalised[column] - minimum) / difference
#
#         symbol = affy_proteins["symbol_mouse"].values[0]
#         affy_ids = list(affy_proteins['affy'].values)
#         fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(8, 3*3))
#
#         series.plot(ax=ax1)
#         ax1.legend(affy_ids)
#         ax1.set_title(f'{symbol} Actual')
#
#         mean.plot(ax=ax2, legend=False)
#         ax2.set_title(f'{symbol} Mean')
#
#         normalised.plot(ax=ax3)
#         ax3.legend(affy_ids)
#         ax3.set_title(f'{symbol} Normalised')
#
#
#         fig.tight_layout()
#         fig.savefig(f'../data/output/all_genes/{symbol}.png')

In [None]:
# Save edge list to a file
# Save the edge list, with gene symbols as node labels, to a file.
# A relabelled copy is written instead of relabelling G_3 in place: earlier
# cells look nodes up by Entrez ID, so mutating G_3 here would make them
# silently produce empty results if re-run after this cell.
G_3_symbolic = nx.relabel_nodes(G_3, entrez_to_symbol, copy=True)
nx.write_edgelist(G_3_symbolic, f"../data/static_networks/circadian_{graph_name}.edgelist", delimiter=', ')
