In [None]:
# Notebook behavior:

# Enable multiple outputs from one cell:
# (with "all", every top-level expression in a cell is displayed, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Markdown is used by show_two_maps to render the side-by-side image table:
from IPython.display import Markdown, Image

# autoreload extension: automatically refresh imports when code is changed:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from collections import defaultdict
from typing import Tuple, Union

import matplotlib as mpl
from matplotlib.colors import ListedColormap, BoundaryNorm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import seaborn as sns

# Module needed to read the microstate file:
import cms_analysis_wc as msa

# Purpose of this notebook: Show heatmaps of clustered correlation matrices for different numbers of clusters
---
---

---
## `generate_df`: Function to test clustered heatmap with larger df (synthetic data)
__Seed for reproducibility: pass `seed=None` to get different data on every call.__

In [None]:
def generate_df(seed: Union[int, None] = 42, n_cols: int = 18, n_rows: int = 20) -> pd.DataFrame:
    """Generate a df using synthetic data.

    Column headers have the form <chain><residue code><sequence number>, where
    the chain is always "A", the residue code is one of the values of
    `msa.res3_to_res1` or "HEM"/"PAA", and the sequence number is in [1, 349].
    All randomness (headers and values) comes from a single generator seeded
    with `seed`, so a given seed always yields the same headers and data.

    Args:
      - seed (int or None, 42): seed for reproducibility; set to None to obtain
        different data on every call;
      - n_cols (int, 18): number of columns; headers are drawn without
        replacement, so exactly `n_cols` distinct headers are produced;
      - n_rows (int, 20): number of rows.

    Returns:
      pd.DataFrame of shape (n_rows, n_cols) filled with uniform random floats
      in [0, 1). This is the raw data, not its correlation matrix.

    Raises:
      ValueError: if `n_cols` exceeds the number of distinct
        (residue code, sequence number) combinations (raised by `rng.choice`).
    """
    rng = np.random.default_rng(seed=seed)

    # Header components
    chain = "A"
    seq_low, seq_high = 1, 350  # sequence numbers: seq_low <= n < seq_high
    # De-duplicate the residue codes (several 3-letter codes may share one
    # 1-letter code) while keeping their order, so that every
    # (code, number) pair is a distinct combination:
    rescodes1 = list(dict.fromkeys(list(msa.res3_to_res1.values()) + ["HEM", "PAA"]))

    # Draw n_cols distinct (code, number) pairs by sampling, without
    # replacement, from the flattened index space of all combinations.
    # NOTE: header strings are unique as long as no residue code ends
    # with a digit.
    n_seq = seq_high - seq_low
    picks = rng.choice(len(rescodes1) * n_seq, size=n_cols, replace=False)
    headers = [
        f"{chain}{rescodes1[int(p) // n_seq]}{seq_low + int(p) % n_seq}"
        for p in picks
    ]

    # One column of uniform random values per header
    data = {name: rng.random(n_rows) for name in headers}

    return pd.DataFrame(data, columns=headers)


def show_two_maps(titles: list=["Unsorted", "Sorted"], fnames: list = ["corr_unsorted.png", "corr_sorted.png"]):
    """Build a two-column Markdown table showing two images side by side.

    Args:
      - titles (list): exactly two column titles, also used as image alt text;
      - fnames (list): image sources; the first two entries are used.

    Returns:
      IPython.display.Markdown wrapping the table markup.
    """
    left_title, right_title = titles

    header_row = f"| {left_title} | {right_title} |"
    divider_row = " | --- | --- |"
    image_row = (
        f'| <img src="{fnames[0]}" alt="{left_title}" width="600"/> '
        f'| <img src="{fnames[1]}" alt="{right_title}" width="600"/>|'
    )

    table = "\n".join([header_row, divider_row, image_row]) + "\n"
    return Markdown(table)


def cluster_corr_matrix(df: pd.DataFrame, n_clusters:int=5):
    """ For testing outcome of clustering.

    Computes the correlation matrix of `df`, groups its columns by
    complete-linkage hierarchical clustering on the dissimilarity
    1 - |correlation|, reorders the matrix so that columns of the same
    cluster are adjacent, and passes it to `msa.corr_heatmap` with
    save_name "clust_corr_C<n_clusters>.png".

    Args:
      - df (pd.DataFrame): input dataframe; at least two columns are needed
        for the clustering step;
      - n_clusters (int, 5): Number of candidate clusters, minimum 3 (smaller
        values are raised to 3). It is used as the upper bound of the
        "maxclust" criterion, so fewer clusters may be returned.

    Returns:
      None.

    Raises:
      ValueError: if the correlation matrix contains non-finite values
        (e.g. a constant column yields NaN correlations).
    """
    # Local import so that this function does not depend on an edit of the
    # notebook's import cell:
    from scipy.spatial.distance import squareform

    n_clusters = max(n_clusters, 3)
    fname = f"clust_corr_C{n_clusters}.png"

    corr_matrix = df.corr()

    # Convert correlation matrix to a (square) dissimilarity matrix
    dist = 1.0 - np.abs(corr_matrix.to_numpy())
    if not np.isfinite(dist).all():
        raise ValueError(
            "Correlation matrix contains non-finite values; "
            "remove constant or empty columns before clustering."
        )
    # Guard against round-off: no negative distances, exact zero diagonal
    dist = np.clip(dist, 0.0, None)
    np.fill_diagonal(dist, 0.0)

    # `linkage` expects the condensed form of a distance matrix. `pdist` must
    # not be used here: it would treat each row of `dist` as an observation
    # vector and return Euclidean distances between those rows instead of the
    # correlation-based dissimilarities themselves.
    condensed = squareform(dist, checks=False)

    # Perform hierarchical clustering
    linkage_matrix = linkage(condensed, method="complete")
    clusters = fcluster(linkage_matrix, n_clusters, criterion="maxclust")

    # Order columns by cluster label; the stable sort keeps the original
    # column order within each cluster.
    order = np.argsort(clusters, kind="stable")
    # Positional selection also works when column labels are not unique
    clustered_corr = corr_matrix.iloc[order, order]

    # Plot the clustered correlation matrix as a heatmap (save_name is used
    # for saving the figure)
    msa.corr_heatmap(clustered_corr, save_name=fname)
    return

---
---
# Create synthetic data

In [None]:
# Number of columns shared by both synthetic frames:
cols = 16
# Frame built with a fixed seed (intended as the reproducible data set);
# this is the one used by the heatmap and clustering cells below:
df = generate_df(n_cols=cols, seed=12)
# Variable frame: seed=None requests different data on every call:
df0 = generate_df(n_cols=cols, seed=None)

## 1. Create original heatmap 

In [None]:
# Pairwise correlation between the columns of the synthetic frame:
corr_matrix = df.corr()
print("Original Correlation Matrix:".upper())
# Unclustered heatmap; "corr.png" is the image name that show_two_maps
# references later (presumably written by corr_heatmap via save_name -- confirm in msa):
msa.corr_heatmap(corr_matrix, save_name="corr.png")

# lower triangular matrix variant (disabled):
#msa.corr_heatmap(corr_matrix, show=True, lower_tri=True)

## 2. CLUSTERING (with different parameters)

__cluster_corr_matrix?__  

```
Signature: cluster_corr_matrix(df: pandas.core.frame.DataFrame, n_clusters: int = 5)
Docstring:
For testing outcome of clustering.
Args:
  - df (pd.DataFrame): input dataframe;
  - n_clusters (int, 5): Number of candidate clusters, minimum 3;
```

In [None]:
# Create clustermap with defaults (n_clusters=5, hence save name "clust_corr_C5.png"):
cluster_corr_matrix(df)

# Side-by-side view of the unclustered and the clustered (C5) heatmaps:
show_two_maps(titles=["Unclustered", "Clustered C5"], fnames=["corr.png", "clust_corr_C5.png"])

### Different number of clusters

In [None]:
# Generate heatmaps for other cluster counts (save names clust_corr_C3/C6/C9.png).
# The reference map, C=5, is not produced here: it comes from the default call
# in the previous code cell.
for C in [3, 6, 9]:
    cluster_corr_matrix(df, n_clusters=C)

In [None]:
# Reference clustering (C5) next to the 3-cluster result; both images are
# referenced by file name, so the cells that create them must have run first.
show_two_maps(titles=["Clustered C5", "Clustered C3"],
              fnames=["clust_corr_C5.png", "clust_corr_C3.png"])

In [None]:
# Reference clustering (C5) next to the 6-cluster result; both images are
# referenced by file name, so the cells that create them must have run first.
show_two_maps(titles=["Clustered C5", "Clustered C6"],
              fnames=["clust_corr_C5.png", "clust_corr_C6.png"])

In [None]:
# Reference clustering (C5) next to the 9-cluster result; both images are
# referenced by file name, so the cells that create them must have run first.
show_two_maps(titles=["Clustered C5", "Clustered C9"],
              fnames=["clust_corr_C5.png", "clust_corr_C9.png"])