# Heatmap with Hierarchical Clustering

This notebook loads your CSV, computes row-wise z-scores (clipped to ±1.5), performs hierarchical clustering with **1 − Pearson** distances and **average linkage** (optimal leaf ordering), and renders a heatmap with:

- Row dendrogram on the **left** (leaves touch the heatmap)
- **Row labels on the right**
- Column dendrogram on the **top**
- Column labels at the **bottom**
- **Red–black–green** colormap (red = negative z-scores, black = 0, green = positive z-scores)


In [1]:
from pathlib import Path
# Input table and output figure locations. Raw strings keep the Windows
# backslashes literal; no f-string interpolation is needed here.
in_path = Path(r"C:\Users\AllDEGs_BioProcesses.csv")
out_pdf = Path(r"C:\Users\heatmap_dendrograms.pdf")
for label, location in (("Input:", in_path), ("Output:", out_pdf)):
    print(label, location)


Input: C:\Users\AllDEGs_BioProcesses.csv
Output: C:\Users\heatmap_dendrograms.pdf


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, leaves_list, dendrogram
from scipy.spatial.distance import squareform
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import gridspec

# ---- Load data ----
df = pd.read_csv(in_path)

# Auto-detect gene column: the first candidate name that exactly matches a
# column header wins; otherwise fall back to the first column of the table.
candidates = ["gene","genes","symbol","symbols","geneid","gene_id","genesymbol",
              "Gene","Genes","Symbol","Symbols","GeneID","Gene_Id","GeneSymbol"]
gene_col = next((c for c in candidates if c in df.columns), df.columns[0])

# Numeric columns.
# The gene column is always excluded: if it is numeric (e.g. integer gene IDs)
# it would otherwise be selected twice downstream (once as the grouping key,
# once as a value column) and break the groupby.
num_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != gene_col]
if not num_cols:
    # Nothing besides the gene column parsed as numeric on load: coerce every
    # other column in place (unparseable entries become NaN) and re-select.
    for c in df.columns:
        if c != gene_col:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    num_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != gene_col]

# Build matrix and clean
# Rows sharing the same gene label are collapsed to their per-column mean.
grouped = df[[gene_col, *num_cols]].groupby(gene_col, as_index=False).mean(numeric_only=True)
mat = grouped.set_index(gene_col)[num_cols]

# Treat +/-inf as missing, then drop rows that have no values at all.
mat = mat.replace([np.inf, -np.inf], np.nan).dropna(how="all")

# Keep only rows with strictly positive standard deviation (a NaN std also
# fails the comparison and is dropped).
has_variance = mat.std(axis=1, skipna=True) > 0
mat = mat.loc[has_variance]

# Clustering below needs at least two rows and two columns.
n_rows, n_cols = mat.shape
assert n_rows >= 2 and n_cols >= 2

# Row-wise z-score and clip
cap = 1.5  # z-scores are limited to [-cap, +cap]

row_means = mat.mean(axis=1)
row_stds = mat.std(axis=1, ddof=0)

# Zero standard deviations are swapped for NaN so the division never divides
# by zero; every NaN left in the result (including originally missing values)
# is then set to 0.0.
centered = mat.sub(row_means, axis=0)
safe_stds = row_stds.replace(0, np.nan)
mat_z = centered.div(safe_stds, axis=0).fillna(0.0)
mat_z = mat_z.clip(lower=-cap, upper=cap)

# Distance = 1 - Pearson correlation
def corr_distance_matrix(X, axis=0):
    """Return condensed ``1 - Pearson correlation`` distances for a 2-D array.

    Parameters
    ----------
    X : array-like, 2-D
        Data matrix.
    axis : int, default 0
        ``0`` correlates the columns of ``X`` with each other; any other value
        correlates the rows.

    Returns
    -------
    numpy.ndarray
        Condensed (upper-triangle) distance vector as produced by
        ``scipy.spatial.distance.squareform``.

    Notes
    -----
    Undefined correlations (NaN, e.g. for a constant vector) are treated as a
    correlation of 0, and negative distances are clamped to 0.
    """
    corr = np.corrcoef(X, rowvar=(axis != 0))
    corr = np.nan_to_num(corr, nan=0.0)
    dist = np.clip(1 - corr, 0.0, None)
    # checks=False: skip symmetry / zero-diagonal validation and just take the
    # upper triangle.
    return squareform(dist, checks=False)

# Clustering with optimal leaf ordering
def _cluster_axis(values, axis):
    """Cluster one axis of ``values``; return (distances, linkage, leaf order).

    Uses 1 - Pearson distances, average linkage and optimal leaf ordering.
    ``axis=1`` clusters rows, ``axis=0`` clusters columns.
    """
    dist = corr_distance_matrix(values, axis=axis)
    link = linkage(dist, method="average", optimal_ordering=True)
    return dist, link, leaves_list(link)


z_values = mat_z.values
row_dist, row_link, row_order = _cluster_axis(z_values, axis=1)
col_dist, col_link, col_order = _cluster_axis(z_values, axis=0)

# Reorder matrix so rows and columns follow the dendrogram leaf order
mat_ord = mat_z.iloc[row_order, col_order]

# Custom red-black-green colormap (black at 0)
palette = ["red", "black", "green"]
cmap_rbk = LinearSegmentedColormap.from_list("RedBlackGreen", palette)

# ---- Plot layout with dendrograms touching the heatmap ----
fig = plt.figure(figsize=(12, 16))

# 2x2 grid: narrow left column for the row dendrogram, short top row for the
# column dendrogram; zero spacing so the panels sit flush against each other.
grid_options = dict(
    width_ratios=[1.2, 10],
    height_ratios=[2, 10],
    wspace=0.0,
    hspace=0.0,
)
gs = gridspec.GridSpec(2, 2, **grid_options)

# Column dendrogram (top)
ax_col = fig.add_subplot(gs[0, 1])
col_dendro_style = dict(color_threshold=0, above_threshold_color='black', no_labels=True)
dendrogram(col_link, ax=ax_col, **col_dendro_style)

# Strip ticks and the frame so only the tree itself is drawn.
ax_col.set_xticks([])
ax_col.set_yticks([])
plt.setp(list(ax_col.spines.values()), visible=False)

# Row dendrogram (left) — flip x-axis so leaves are adjacent to the heatmap
ax_row = fig.add_subplot(gs[1, 0])
row_dendro_style = dict(color_threshold=0, above_threshold_color='black', no_labels=True)
dendrogram(row_link, ax=ax_row, orientation='right', **row_dendro_style)

ax_row.invert_xaxis()           # leaf side touches heatmap
ax_row.invert_yaxis()           # align top with heatmap top

# Strip ticks and the frame so only the tree itself is drawn.
ax_row.set_xticks([])
ax_row.set_yticks([])
plt.setp(list(ax_row.spines.values()), visible=False)

# Heatmap (bottom-right)
ax_mat = fig.add_subplot(gs[1, 1])
row_count, col_count = mat_ord.shape
im = ax_mat.imshow(
    mat_ord.values,
    cmap=cmap_rbk,
    vmin=-cap,
    vmax=cap,
    aspect='auto',
    interpolation='nearest',
)

# Row labels on the RIGHT (one tick per heatmap row)
ax_mat.yaxis.tick_right()
ax_mat.tick_params(labelright=True, labelleft=False)
ax_mat.set_yticks(np.arange(row_count))
ax_mat.set_yticklabels(mat_ord.index, fontsize=6)

# Column labels at the BOTTOM (one tick per heatmap column, rotated vertical)
ax_mat.set_xticks(np.arange(col_count))
ax_mat.set_xticklabels(mat_ord.columns, rotation=90, fontsize=8)

# Colorbar
# Drawn in its own axes inside the otherwise-empty top-left grid cell.
# Passing ax=ax_mat instead makes matplotlib carve the colorbar out of the
# heatmap axes: the heatmap becomes narrower than the column dendrogram above
# it (columns no longer line up with their leaves) and the colorbar lands on
# top of the right-hand row labels.
ax_corner = fig.add_subplot(gs[0, 0])
ax_corner.axis("off")
# Bounds are [x0, y0, width, height] in ax_corner's axes coordinates.
cax = ax_corner.inset_axes([0.05, 0.15, 0.20, 0.70])
cbar = fig.colorbar(im, cax=cax)
cbar.set_label("Row Z-score (clipped)", fontsize=10)

plt.subplots_adjust(wspace=0, hspace=0)
plt.show()

# Save the heatmap figure to out_pdf
# Save through the existing figure object. Creating a new plt.figure() here
# and calling plt.savefig() would write that new, empty figure, i.e. a blank PDF.
fig.savefig(out_pdf, format="pdf", bbox_inches="tight")
plt.close(fig)
print("Wrote heatmap to:", out_pdf)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\YourName\\Desktop\\AllDEGs_BioProcessesofInterest_CorrectedMay2019 annotated.csv'