In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from dokdonia import differentialexpression as DE
from dokdonia import visualization as VI
from dokdonia import clusteranalysis as CA
from dokdonia.pathway import KEGGPathwayParser, GenomeGBK, add_pathways_to_deseq_df, show_pathways_in_ranked_genes
from dokdonia.utils import take_average_values, saveToPickleFile, readFromPickleFile

%matplotlib inline


# Project layout: every path below is resolved against the directory the
# notebook kernel was started in.
root_dir = Path.cwd()
data_dir = root_dir.joinpath("data")
results_dir = root_dir.joinpath("results")

## Load counts

In [None]:
# A gene is kept only if its count is strictly greater than this value in
# every retained column.
min_count = 10

counts = pd.read_csv(data_dir / 'counts' / 'DokdoniaCounts.csv', index_col=0)
# Keep only the columns whose name does not contain the character "T".
counts = counts.filter(regex='^[^T]+$')
# Column labels become whatever precedes ".sam" in the original header.
conditions = [column.split('.sam')[0] for column in counts.columns]
counts.columns = conditions
# Drop low-count genes, then move the gene index back into a regular column.
counts = counts[(counts > min_count).all(axis=1)].reset_index(level=0)

## Load KEGG pathways and genome annotations

In [None]:
# Genome annotation object built from the GenBank file of the genome.
gbk = GenomeGBK(data_dir / 'genome' / 'DokdoniaMED134.gbk')

# KEGG parser for the organism identifier 'dok', restricted to curated pathways.
KEGGparser = KEGGPathwayParser.fromKEGGidentifier('dok', only_curated_pathways=True)
# Two mappings keyed by gene (presumably gene -> pathways and gene -> systems;
# TODO confirm against dokdonia.pathway).
gene_pathways, gene_systems = KEGGparser.getGenePathways()
system_pathways = KEGGparser.getSystemPathways()
# gene_info = KEGGparser.getGeneInfoFromKEGGorthology()
# Genes that have an entry in `gene_pathways`.
gene_list = list(gene_pathways.keys())
print(f'There are a total of {len(gene_list)} genes')

## Compute Transcript / cell values

In [None]:
# Sample metadata with an extra "Sample" identifier column of the form
# <Light/Dark>_<Temperature>_<Replicate>.
sample_meta = pd.read_excel(
    data_dir / "normalization" / "Datos_Dokdonia_9Jun23.xlsx"
).assign(
    Sample=lambda meta: (
        meta['Light/Dark'] + '_' + meta['Temperature'].astype(str) + '_' + meta['Replicate']
    )
)
sample_meta.head()

In [None]:
# Transcripts/cell table computed from the filtered counts and the sample
# metadata, indexed by the "index" column of the result.
# NOTE(review): ["D_25_R1"] is presumably the list of samples to leave out of
# the computation - confirm against DE.get_transcript_cell.
TC = DE.get_transcript_cell(counts, sample_meta, ["D_25_R1"]).set_index("index")

# `to_csv` does not create missing directories, so make sure the output
# folder exists before writing (a fresh checkout may not have it).
processed_dir = data_dir / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)
TC.to_csv(processed_dir / "DokdoniaMED134_TC.tsv", sep="\t")
TC.head()

## Remove Light/Dark DE genes from datasets

In [None]:
# Previously computed DESeq results, loaded from pickle files.
deseq_dir = results_dir / "deseq_results"
DE_all_T = readFromPickleFile(deseq_dir / "DE_all_T.pkl")
DE_genes_across_T = readFromPickleFile(deseq_dir / "DE_genes_across_T.pkl")

# Remove light-dark DE genes from TC dataset: keep only the rows whose gene
# id is not listed in DE_all_T.
is_light_dark_DE = TC.index.isin(DE_all_T)
TCnoDE = TC.loc[~is_light_dark_DE]

## Find clusters based on expression patterns across temperatures (transcripts/cell)

In [None]:
# Using Transcripts/cell
clust_tightness = 3
res_id = 'CLUSTER_ONLY_TEMP_DE_GENES_NEW_TRANSCRIPT_CELL_ZSCORES'
# data_dir / results_dir are already absolute (they derive from the current
# working directory), so their string form is the full path.
workdir = str(data_dir / 'clust_input')
outdir = str(results_dir / 'clust' / res_id)

# Options forwarded to the clustering helper.
clust_options = dict(
    path_to_wd=workdir,
    out_dir=outdir,
    cluster_tightness=clust_tightness,
    normalization_file='clust_normalization_only_zscores.txt',
    replicates_file='clust_replicates_merged_L_D_volume.txt',
    scaling_factor=1e5,
)
clusters_TCZ = CA.getGeneClusters(TCnoDE, **clust_options)

# Plot clusters: load the processed data table written to the output
# directory, indexed by its "Genes" column.
processed_data_file = str(
    results_dir / "clust" / f'{res_id}/Processed_Data/clust_input.tsv_processed.tsv'
)
plot_cluster_data_TCZ = pd.read_csv(processed_data_file, sep='\t', index_col='Genes')

# VI.plotClusters(plot_cluster_data_TCZ, clusters_TCZ)

## Merge clusters

Merge the Clust clusters into 3 groups (C0 + C1, C2 + C3, and C4 on its own). Also add a new cluster formed by the genes that correlate negatively with temperature.

In [None]:
# Collapse the Clust output into three groups.
# NOTE: this overwrites `clusters_TCZ`, so the cell is not idempotent: after
# a successful run the keys "C3" and "C4" no longer exist and re-running it
# raises KeyError. Re-run the clustering cell first to start over.
clusters_TCZ = {
    "C0": clusters_TCZ["C0"] + clusters_TCZ["C1"],
    "C1": clusters_TCZ["C2"] + clusters_TCZ["C3"],
    "C2": clusters_TCZ["C4"]
}

# Merge clusters in Clust output file.
# One column per merged cluster; shorter clusters are padded with NaN by the
# Series alignment, and the padding is blanked out so rows can be joined as
# text. `fillna` replaces the deprecated element-wise `DataFrame.applymap`.
df = pd.DataFrame({k: pd.Series(v) for k, v in clusters_TCZ.items()})
df = df.fillna("")

# The file is only written, never read back, so plain "w" is sufficient.
with open(results_dir / "clust" / f"{res_id}/Clusters_Objects_merged.tsv", "w") as file:
    # Two header rows: cluster names with their sizes, then one "Genes"
    # label per cluster column.
    file.write("\t".join([f"{k} ({len(v)} genes)" for k, v in clusters_TCZ.items()]) + "\n")
    file.write("\t".join(["Genes" for _ in clusters_TCZ]) + "\n")

    for row in df.values:
        file.write("\t".join(row) + "\n")

n_genes_in_clusters = sum(len(c) for c in clusters_TCZ.values())
print(f"There are a total of {n_genes_in_clusters} clustered genes")

# Save figure data
saveToPickleFile(clusters_TCZ, results_dir / "figures" / "figure_data" / "clusters_TCZ.pkl")
saveToPickleFile(plot_cluster_data_TCZ, results_dir / "figures" / "figure_data" / "plot_cluster_data_TCZ.pkl")

VI.plotClusters(plot_cluster_data_TCZ, clusters_TCZ)

## Annotate and rank genes within clusters

In [None]:
res_id = "CLUSTER_ONLY_TEMP_DE_GENES_NEW_TRANSCRIPT_CELL_ZSCORES"

# Clust input table rescaled by 1/1e5.
# NOTE(review): this presumably undoes the scaling_factor=1e5 passed to
# CA.getGeneClusters in the clustering cell - keep the two values in sync.
cluster_data = (1 / 1e5) * pd.read_csv(
    results_dir / "clust" / f'{res_id}/Input_files_and_params/Data/clust_input.tsv',
    sep='\t', index_col='index')
ranked_clusters_avg_expr = CA.rankGenesWithinClusters(clusters_TCZ, cluster_data, method="median")

# Create the output folder if needed; a single idempotent call replaces the
# exists()-then-mkdir() check.
pathways_dir = results_dir / "pathways"
pathways_dir.mkdir(parents=True, exist_ok=True)

no_kegg_pathway = []  # per cluster: % of genes whose subsystem is "Unspecified"
ranked_clusters = []
for cluster_id in ranked_clusters_avg_expr:
    ranked_df = show_pathways_in_ranked_genes(
        ranked_clusters_avg_expr[cluster_id],
        gbk, gene_pathways,
        gene_systems, n=None
        )
    # Missing subsystems count as "not Unspecified" (na=False), which is what
    # the former `contains(...) & ~isna()` mask expressed.
    is_unspecified = ranked_df.subsystem.str.contains("Unspecified", na=False)
    no_kegg_pathway.append(100 * int(is_unspecified.sum()) / ranked_df.shape[0])
    ranked_df.insert(0, "cluster", cluster_id)
    ranked_df.to_csv(pathways_dir / f"ranked_{cluster_id}_TCZ.csv")
    ranked_clusters.append(ranked_df)

# Single table with all clusters, sorted by the "value" column, highest first.
merged_ranked_clusters = pd.concat(ranked_clusters).sort_values(by="value", ascending=False)
merged_ranked_clusters.to_csv(pathways_dir / "ranked_clusters_TCZ.csv")
print(no_kegg_pathway)
print(np.mean(no_kegg_pathway))