In [None]:
import pathlib
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from utils.graphs import create_graph_from_interactions, graph_size_info, draw_graph, highlight_subgraphs

### Create graph from Y2H interactions

In [None]:
# Build the Y2H network from the "PPIs from Y2H screen Fig1B,C" sheet of the
# supplementary workbook, passing the two human Entrez ID columns as source/target.
G_Y2H = create_graph_from_interactions(
    filename="../data/publications/journal.pgen.1003398.s008.xlsx",
    sheet="PPIs from Y2H screen Fig1B,C",
    source="Entrez gene ID A human",
    target="Entrez gene ID B human",
)

print(graph_size_info(G_Y2H))

# Optional: uncomment the two lines below to draw the network.
# fig = plt.figure(figsize=(13, 8))
# draw_graph(G_Y2H)

### Create graph from UniHI MAN interactions

We use Entrez IDs as the graph nodes because they are more reliable than gene names/symbols. For displaying
the graph, however, gene names/symbols are preferable, so later on we create a 'symbolic' version of the graph
by replacing the Entrez IDs with gene symbols.

In [None]:
# Build the UniHI MAN network from the "Enrichment UniHI MAN Fig2" sheet of the
# supplementary workbook, passing the two human Entrez ID columns as source/target.
G_UniHI = create_graph_from_interactions(
    filename="../data/publications/journal.pgen.1003398.s008.xlsx",
    sheet="Enrichment UniHI MAN Fig2",
    source="Entrez gene ID A human",
    target="Entrez gene ID B human",
)

print(graph_size_info(G_UniHI))

# Optional: uncomment the two lines below to draw the network.
# fig = plt.figure(figsize=(13, 8))
# draw_graph(G_UniHI)

### Create graph from extended interactions

In [None]:
# Build the extension network from the "Extension interactions Fig2" sheet.
# Note that this sheet's ID columns are named without the " human" suffix used by the other two sheets.
G_extension = create_graph_from_interactions(
    filename="../data/publications/journal.pgen.1003398.s008.xlsx",
    sheet="Extension interactions Fig2",
    source="Entrez gene ID A",
    target="Entrez gene ID B",
)

print(graph_size_info(G_extension))

# Draw the extension network on a 13x8 inch figure.
fig = plt.figure(figsize=(13, 8))
draw_graph(G_extension)


### Combine the three networks together

In [None]:
# Build the networks cumulatively: Y2H, then Y2H + UniHI, then Y2H + UniHI + extension.
G_Y2H_UniHI = nx.compose(G_Y2H, G_UniHI)
G_full = nx.compose(G_Y2H_UniHI, G_extension)

# For the remainder of this notebook, decide which of the three nested networks to focus on.
# That network is called G, and graph_name is the label used for it in output file names.
G, graph_name = G_full, 'full'

### Combine the two lists of expected proteins

In [None]:
# Core circadian components: keep the first four columns of the sheet and give them uniform names.
proteins_core = pd.read_excel(
    "../data/publications/journal.pgen.1003398.s008.xlsx",
    sheet_name="46 circadian components Fig1",
).iloc[:, 0:4]
proteins_core.columns = ['entrez_human', 'entrez_mouse', 'symbol_human', 'symbol_mouse']

# Extension proteins: the four wanted columns sit at positions 1-4 of this sheet (column 0 is skipped).
proteins_extension = pd.read_excel(
    "../data/publications/journal.pgen.1003398.s008.xlsx",
    sheet_name="Extension proteins Fig2",
).iloc[:, 1:5]
proteins_extension.columns = ['entrez_human', 'entrez_mouse', 'symbol_human', 'symbol_mouse']

# Stack both tables into one, with a fresh 0..n-1 index.
proteins = pd.concat([proteins_core, proteins_extension], ignore_index=True)

### For every protein that's actually involved in an interaction, add data from the 'proteins' table

In [None]:
# One row per node of G; the nodes are the human Entrez IDs.
interacting_proteins = pd.DataFrame(G.nodes)
interacting_proteins.columns = ['entrez_human']

# Left merge: every node is kept, and nodes absent from 'proteins' get NaN in the added columns.
interacting_proteins = interacting_proteins.merge(proteins, how='left', on='entrez_human')

### Use the entrez and symbol columns from this dataframe to create a symbolic version of the network

In [None]:
# Map each human Entrez ID to its gene symbol.
# interacting_proteins comes from a LEFT merge, so a node missing from the 'proteins' table has
# symbol_human == NaN. Such entries are skipped: relabel_nodes accepts a partial mapping and leaves
# unmapped nodes unchanged, so those nodes keep their Entrez ID instead of being renamed to NaN
# (which could also collapse several distinct nodes into one).
entrez_to_symbol = {
    entrez: symbol
    for entrez, symbol in zip(interacting_proteins['entrez_human'], interacting_proteins['symbol_human'])
    if pd.notna(symbol)
}

# Symbolic versions of the three nested networks, plus the one selected as G.
symbolic_G_Y2H = nx.relabel_nodes(G_Y2H, entrez_to_symbol)
symbolic_G_Y2H_UniHI = nx.relabel_nodes(G_Y2H_UniHI, entrez_to_symbol)
symbolic_G_full = nx.relabel_nodes(G_full, entrez_to_symbol)
symbolic_G = nx.relabel_nodes(G, entrez_to_symbol)

# Save graph's edge list to a file
nx.write_edgelist(symbolic_G, f"../data/static_networks/circadian_{graph_name}.edgelist", delimiter=', ')

### Highlight the nodes that come from the Y2H, UniHI and extension interactions

In [None]:
# Graphs and colours are paired by position, listed from the largest network to the smallest.
symbolic_graphs = [symbolic_G_full, symbolic_G_Y2H_UniHI, symbolic_G_Y2H]
symbolic_graph_colors = ['mediumseagreen', 'blue', 'red']

# Draw onto one large axes and save the current figure.
fig, ax = plt.subplots(figsize=(20, 15))
highlight_subgraphs(symbolic_graphs, symbolic_graph_colors, ax=ax)
plt.savefig('../data/output/graphs/circadian_full_partition.png', dpi=250)

### Merge Affy IDs with Entrez IDs

In [None]:
# Load the GSE11923 series matrix (tab-separated). Row 63 (0-based) is used as the header row, the first
# column becomes the index and the last line of the file is skipped (skipfooter requires the python engine).
temporal_node_data = pd.read_csv('../data/temporal_data/circadian/GSE11923_series_matrix.txt', sep='\t', header=63, index_col=0, skipfooter=1, engine='python')
# Lookup table, indexed by its first column, that provides an 'entrez' column.
affy_to_entrez = pd.read_csv('../data/genes/DAVID_affy_to_entrez.txt', sep='\t', index_col=0)

# Attach the 'entrez' column by matching the two indexes; rows without a match keep NaN (left merge).
temporal_node_data = temporal_node_data.merge(affy_to_entrez['entrez'], how='left', left_index=True, right_index=True)
# Keep the index as an ordinary 'affy' column, because the merge below does not preserve the index.
temporal_node_data['affy'] = temporal_node_data.index

# Inner merge: keep only rows whose 'entrez' value equals the 'entrez_mouse' of an interacting protein.
# Resulting column order: the interacting_proteins columns, then the series-matrix columns, then 'entrez', 'affy'.
temporal_node_data = interacting_proteins.merge(temporal_node_data, how='inner', left_on='entrez_mouse', right_on='entrez')

# Reorganise the DataFrame columns
columns = list(temporal_node_data.columns.values)
# New order: 'affy', then columns 1, 0, 3, 2 (each entrez/symbol pair swapped), then the series columns.
# The slice 4:-2 drops the now-redundant 'entrez' column and the trailing 'affy' (already moved to the front).
columns = [columns[-1]] + [columns[1], columns[0]] + [columns[3], columns[2]] + columns[4:-2]
temporal_node_data = temporal_node_data[columns]
# Rename the series columns to consecutive integers starting at 18
# (presumably the circadian time of the first sample - TODO confirm).
columns = columns[0:5] + [i+18 for i, _ in enumerate(columns[5:])]
temporal_node_data.columns = columns
temporal_node_data.sort_values('entrez_mouse', inplace=True)

### Restrict to genes of a certain cycle length

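Comment out the code below to skip this step. If you do, keep `cycle = ''` active, because later cells use `cycle` when building output file names.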

In [None]:
# Spreadsheet listing the probe sets to keep, identified by its 'Probe Set ID' column.
cyclic_proteins_filename = '../data/genes/24_hour_genes.XLS'
cyclic_proteins = pd.read_excel(cyclic_proteins_filename, sheet_name='Sheet1')

# Inner merge: only rows whose 'affy' value appears in 'Probe Set ID' survive.
temporal_node_data = pd.merge(
    temporal_node_data,
    cyclic_proteins,
    how='inner',
    left_on='affy',
    right_on='Probe Set ID',
)

# Tag for output file names: the spreadsheet's file name without extension, plus a trailing underscore.
cycle = pathlib.Path(cyclic_proteins_filename).stem + '_'
# cycle = ''

### Plot subgraph induced by cyclic genes

Comment out the code below to skip this step

In [None]:
# Subgraph of the full symbolic network induced by the symbols still present in temporal_node_data.
cyclic_genes_subgraph = symbolic_G_full.subgraph(temporal_node_data['symbol_human'].values)

# Same picture as the partition plot, with the cyclic-gene subgraph added as a fourth entry in yellow.
fig, ax = plt.subplots(figsize=(20, 15))
highlight_subgraphs(
    [*symbolic_graphs, cyclic_genes_subgraph],
    [*symbolic_graph_colors, 'yellow'],
    ax=ax,
)
# NOTE: `cycle` already ends with '_' (or is empty), so the saved file name ends in '_.png'.
plt.savefig(f'../data/output/graphs/circadian_full_partition_{cycle}.png', dpi=300)

# Print size info for each of the graphs and its restriction to cyclic genes
for graph in symbolic_graphs:
    # Reuses the name cyclic_genes_subgraph, so after the loop it refers to the last graph's restriction.
    cyclic_genes_subgraph = graph.subgraph(temporal_node_data['symbol_human'].values)
    print(graph_size_info(graph))
    print(graph_size_info(cyclic_genes_subgraph))


### Create a table of temporal node data using the mean

IMPORTANT: Affy IDs do not map one-to-one onto Entrez IDs, so in many cases a single Entrez ID corresponds to
several sets of temporal data. The table built below takes the mean over those sets at each time point.

In [None]:
max_time = 48

# The first five columns of temporal_node_data are identifiers (affy, Entrez IDs, symbols); the next
# `max_time` columns hold the time series. The selection is the same for every gene, so compute it once.
time_columns = temporal_node_data.columns[5:5 + max_time]

# Collect one Series per gene symbol and build each DataFrame in a single step afterwards. Inserting
# columns one at a time into a DataFrame fragments it and triggers pandas' PerformanceWarning.
# The loop runs over G.nodes (rather than a groupby) so that the column order follows the graph's node
# order and nodes without any temporal data are still reported.
mean_by_symbol = {}
normalised_by_symbol = {}

for entrez_human in G.nodes:
    affy_proteins = temporal_node_data.loc[temporal_node_data['entrez_human'] == entrez_human]
    if affy_proteins.empty:
        print(f'Human entrez ID {entrez_human} either has no temporal data or no affy ID')
        continue

    symbol = affy_proteins['symbol_human'].values[0]
    # Average over all probe sets (rows) of this gene at each time point.
    mean = affy_proteins[time_columns].mean(axis=0)
    mean_by_symbol[symbol] = mean

    # Min-max normalise the mean series to [0, 1]. A perfectly flat series has difference == 0 and
    # therefore yields NaN values.
    difference = mean.max() - mean.min()
    normalised_by_symbol[symbol] = (mean - mean.min()) / difference

temporal_node_data_mean = pd.DataFrame(mean_by_symbol)
temporal_node_data_mean_normalised = pd.DataFrame(normalised_by_symbol)

temporal_node_data_mean.to_csv(f'../data/temporal_data/circadian/circadian_temporal_node_data_{cycle}mean_{graph_name}.csv', sep='\t')
temporal_node_data_mean_normalised.to_csv(f'../data/temporal_data/circadian/circadian_temporal_node_data_{cycle}mean_normalised_{graph_name}.csv', sep='\t')

### Graph all of the temporal data for each Entrez ID.

In [None]:
# Disabled cell: uncomment everything below to produce one PNG per gene with three stacked panels
# (every Affymetrix series, their mean, and each series min-max normalised).
# Note that the plot titles and file names here use 'symbol_mouse', whereas the mean table above is
# keyed by 'symbol_human'.
#
# max_time = 48
# time_ticks = [18 + 6*i for i in range(8 + 1)]
#
# # Plot temporal data for each Affymetrix ID for each protein, as well as mean and normalised versions of this plot.
# for entrez_human in G.nodes:
#     affy_proteins = temporal_node_data.loc[temporal_node_data['entrez_human'] == entrez_human]
#     if affy_proteins.empty:
#         print(f'No data for {entrez_human}')
#     else:
#         series = affy_proteins[affy_proteins.columns[5:5 + max_time]].T
#         mean = series.apply(lambda row: row.mean(), axis=1)
#         normalised = series.copy()
#         for column in list(normalised.columns):
#             minimum = normalised[column].min()
#             difference = normalised[column].max() - minimum
#             normalised[column] = (normalised[column] - minimum) / difference
#
#         symbol = affy_proteins["symbol_mouse"].values[0]
#         affy_ids = list(affy_proteins['affy'].values)
#         fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(8, 3*3), sharex='col')
#
#         series.plot(ax=ax1)
#         ax1.legend(affy_ids)
#         ax1.set_title(f'{symbol} Actual')
#
#         mean.plot(ax=ax2, legend=False)
#         ax2.set_title(f'{symbol} Mean')
#         ax2.set_ylabel('Gene expression')
#
#         normalised.plot(ax=ax3)
#         ax3.legend(affy_ids)
#         ax3.set_title(f'{symbol} Normalised')
#         ax3.set_xlabel('Circadian time')
#         ax3.set_xticks(time_ticks)
#
#
#         fig.tight_layout()
#         fig.savefig(f'../data/output/individual_gene_series/all_genes/{symbol}.png')