In [3]:
## PICRUSt2 data visualisations, based on PICRUSt2 run from 17 September


# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load data
# All tables come from one PICRUSt2 output directory; change it here (once) to
# point the notebook at a different run.
PICRUST2_OUT_DIR = '~/Thesis/data/picrust2_testruns/picrust2_stratified/picrust2_out_pipeline_stratified_lasse'


def _read_picrust2_table(relative_path):
    """Read one tab-separated table located below PICRUST2_OUT_DIR.

    Parameters
    ----------
    relative_path : str
        Path of the file relative to PICRUST2_OUT_DIR.

    Returns
    -------
    pandas.DataFrame
        The table, with the file's first column used as the index.
    """
    return pd.read_csv(f'{PICRUST2_OUT_DIR}/{relative_path}', sep='\t', index_col=0)


# Tables under EC_metagenome_out/
data_ec_predicted = _read_picrust2_table('EC_metagenome_out/pred_metagenome_contrib.tsv')
data_weighted_nsti = _read_picrust2_table('EC_metagenome_out/weighted_nsti.tsv')
data_pred_metagenome_unstrat = _read_picrust2_table('EC_metagenome_out/pred_metagenome_unstrat.tsv')

# Tables under pathways_out/
data_path_abun_unstrat = _read_picrust2_table('pathways_out/path_abun_unstrat.tsv')
data_path_abun_contrib = _read_picrust2_table('pathways_out/path_abun_contrib.tsv')


# TODO: NSTI filter (keep only relatively closely related reference genomes) -- not implemented in this cell yet


In [None]:
# Heatmap of pathway abundances across samples, built from the stratified
# (per-taxon) pathway contribution table `data_path_abun_contrib`.


# --- Reshape to a pathways x samples matrix ---
# NOTE(review): PICRUSt2 "*_contrib.tsv" outputs are documented as long-format
# tables (one row per sample / function / taxon). The column names below follow
# that documentation -- confirm them against the actual file.
FUNCTION_COL = "function"
VALUE_COL = "taxon_function_abun"

if {FUNCTION_COL, VALUE_COL}.issubset(data_path_abun_contrib.columns):
    # Long format: the table was read with index_col=0, so the first file column
    # (presumably the sample ID -- verify) is the index. Move it back to a column
    # and sum the per-taxon contributions for every pathway/sample pair.
    sample_col = data_path_abun_contrib.index.name or "sample"
    path_by_sample = (
        data_path_abun_contrib
        .rename_axis(index=sample_col)
        .reset_index()
        .pivot_table(
            index=FUNCTION_COL,
            columns=sample_col,
            values=VALUE_COL,
            aggfunc="sum",
            fill_value=0,  # pathway absent from a sample -> abundance 0
        )
    )
else:
    # Table is already wide (rows = pathways, columns = samples): use it as is,
    # dropping any non-numeric columns that cannot be summed or log-transformed.
    path_by_sample = data_path_abun_contrib.select_dtypes(include="number")

# --- Choose what to show (pick ONE of the following blocks) ---

# (A) Show a custom list of pathways (uncomment + edit)
# keep = [
#     "METHANOGENESIS-PWY", "1CMET2-PWY", "CO2FIXATION-PWY",  # example IDs
# ]
# df_plot = path_by_sample.loc[[p for p in keep if p in path_by_sample.index]]

# (B) Auto-select top pathways (by total abundance) to keep the figure readable
N = 30  # show top 30 pathways
pathway_totals = path_by_sample.sum(axis=1)
df_plot = path_by_sample.loc[pathway_totals.sort_values(ascending=False).head(N).index]

# Optional: instead of totals, use variability to pick interesting rows
# N = 30
# df_plot = path_by_sample.loc[path_by_sample.var(axis=1).sort_values(ascending=False).head(N).index]

# --- Transform (optional) ---
# Log-transform to compress large dynamic ranges (the +1 keeps zeros finite)
df_val = np.log10(df_plot + 1)

# Per-row z-score (optional alternative to log): center each pathway across samples
# df_val = df_plot.apply(lambda r: (r - r.mean()) / (r.std(ddof=0) + 1e-9), axis=1)

# --- Plot ---
# Figure size grows with the number of samples (width) and pathways (height),
# with a minimum of 8 x 6 inches.
n_pathways, n_samples = df_val.shape
fig, ax = plt.subplots(figsize=(max(8, 0.4 * n_samples), max(6, 0.35 * n_pathways)))
im = ax.imshow(df_val.values, aspect='auto', interpolation='nearest')

# Axis ticks/labels: one tick per sample (x) and per pathway (y)
ax.set_xticks(range(n_samples))
ax.set_xticklabels(df_val.columns, rotation=90, fontsize=8)
ax.set_yticks(range(n_pathways))
ax.set_yticklabels(df_val.index, fontsize=8)

# Titles/labels
ax.set_title("Pathway abundance heatmap (log10-scaled)", pad=12)
ax.set_xlabel("Samples")
ax.set_ylabel("Pathways")

# Colorbar
cbar = plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
cbar.set_label("log10(abundance + 1)")

plt.tight_layout()

plt.show()
