In [None]:
# Notebook behavior:

# Enable multiple outputs from one cell:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# autoreload extension: automatically refresh imports when code is changed:
%load_ext autoreload
%autoreload 2

# run_ms_analysis.ipynb
---

# Tutorial notebook for processing MCCE microstate file in ms_out folder.

# Required mcce run output files:
### 1. head3.lst
### 2. file in ms_out/, a.k.a. the 'msout file'
 * Notes:
   - The ms_out folder is retained when the `--ms` flag is used at the command line for Step 4: `step4.py --ms`;
   - Format of the msout file name: pHXeHYms.txt at pH point X and Eh point Y;
   - The most typical filename encountered is "pH7eH0ms.txt", meaning the MCCE simulation was a pH titration run at pH7.
---

## Load the needed libraries and modules:

In [None]:
from collections import defaultdict
import math
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, ListedColormap, BoundaryNorm
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import sys

%matplotlib inline

# Module needed to read the microstate file:
import ms_analysis_wc as msa

---
# Inputs
## Required user input: the path to the msout file:
 * e.g. 4lzt/ms_out/pH7eH0ms.txt
## Optional user input: list of residues of interest
---

# Define the path to msout file:

In [None]:
# MCCE microstates file:
msout_file = Path("4lzt/ms_out/pH7eH0ms.txt").resolve()

# The related head3.lst is looked up two levels above the msout file,
# i.e. in the parent folder of ms_out/:
h3_path = msout_file.parent.parent.joinpath("head3.lst")
h3_path.exists()

msout_file
h3_path

# More specific name for output folder: KEEP?
outname = "crgms_wc"  # "output"

# Ensure the output directory exists. `exist_ok=True` makes the cell safe to
# re-run, and `parents=True` allows `outname` to be a nested path:
output_dir = Path(outname)
output_dir.mkdir(parents=True, exist_ok=True)

output_dir

In [None]:
! ls -l {output_dir}

# Residues of interest

### this could be done interactively, e.g.:
```
res_of_interest = input("Enter which kind of residues are of interest,\ne.g. ASP, HIS:  ")
```
Note: Make sure to add the names of your residues of interest, such as quinone, here:

In [None]:
# Which kind of residues to consider (3-letter codes):
res_of_interest = ["ASP", "PL9", "GLU", "ARG", "HIS", "LYS", "TYR", "NTR", "CTR"]

# Specific residues of interest. Here the list supplied must be conformer ids:
choose_resid = ["GLUA0035_", "HISA0015_", "ASPA0119_"]

### Note about the `msa` module:
The loading of the conformers list (into module variable `conformers`) is __performed on import using "./head3.lst" by default__;  
The list will be empty whenever a different location is needed.

### Check whether the conformers list needs reloading:

In [None]:
# Use the list loaded by the module on import when it is populated;
# otherwise read it from the head3.lst found next to the ms_out folder:
conformers = msa.conformers if msa.conformers else msa.read_conformers(h3_path)

n_conformers = len(conformers)
print(f"Conformers count: {n_conformers:,}")

# Instantiate `msa.MSout` class with the given msout file:

In [None]:
mc = msa.MSout(msout_file)
print(mc)

#### [ optional ]
#### Learn which methods come with the instantiated class (mc)
 * What's in the MSout class?

### In MCCE, residues are divided into fixed or free residue groups.
 * Only free residues appear in the microstate state list.
 * Fixed residues can be ionizable residues so we need to include their contribution to the total charge, which is referred to here as __background charge__. 

#### Free residues information

In [None]:
# Uncomment to display:

#print("mc.free_residues list holds each free residues' conformers.\nSample data:\n", mc.free_residues[:2])

In [None]:
# Save the free_residues in a pandas.DataFrame, it will be one of the inputs to the function msa.ConcaCrgMsPandas:
free_residues_df = msa.free_residues_df(mc.free_residues, conformers, colname="Residue")
print(f"Number of free residues: {free_residues_df.shape[0]:,}")

In [None]:
# Uncomment to display:

#free_residues_df.head(3)
#free_residues_df.tail(3)

In [None]:
# get their net charges into a df for combining with fixed_res

free_res_crg_df = msa.free_res2sumcrg_df(mc.microstates.values(), mc.free_residues, conformers)

In [None]:
# Uncomment to display:

#free_res_crg_df.head()

#### Fixed residues charge.
(Make sure to account for fixed residues charge for MD simulation protonation input.)

__For all fixed residues:__

In [None]:
# Note: pass an empty list to get all residues (default is ionizable residues)

background_crg, all_fixed_res_crg_df, all_fixed_res_crg_dict = msa.fixed_residues_info(mc.fixed_iconfs, conformers, [])

print("Background charge:", background_crg)
print("Number of fixed residues:", len(all_fixed_res_crg_dict))

In [None]:
# Uncomment to display:

#all_fixed_res_crg_df.head()

### Combine free & fixed res with crg and save to csv:

In [None]:
all_res_crg_df = msa.combine_all_free_fixed_residues(free_res_crg_df, all_fixed_res_crg_df)

# Save to csv:
all_res_crg_df.to_csv(output_dir.joinpath("all_res_crg.csv"), index_label="Residue" )

In [None]:
# Uncomment to display:

#all_res_crg_df

__For fixed residues among residues of interest:__  
#### Save the fixed residues in your residues of interest to a separate list, DataFrame & csv:

In [None]:
# Same call as for all fixed residues, this time filtered by `res_of_interest`.
# NOTE(review): this rebinds `background_crg`, which later cells pass to
# msa.ConcaCrgMsPandas and msa.jointplot; confirm the filtered call returns the
# same background charge as the unfiltered one.
background_crg, fixed_resoi_crg_df, fixed_resoi_crg_dict = msa.fixed_residues_info(
    mc.fixed_iconfs, conformers, res_of_interest
)
print("Background charge:", background_crg)

n_fixed_resoi = len(fixed_resoi_crg_dict)
if not n_fixed_resoi:
    # Nothing to save; downstream cells must handle the None:
    fixed_resoi_crg_df = None
    print("No fixed residues of interest.")
else:
    print("Fixed res in residues of interest:", n_fixed_resoi)
    # save to csv:
    fixed_resoi_crg_df.to_csv(output_dir.joinpath("fixed_crg_resoi.csv"), index=False)

In [None]:
if n_fixed_resoi:
    fixed_resoi_crg_df.head(3)

#### The following saves sorted microstates attributes (energy, count and microstate state) into a list;
The default sort in .sort_microstates() is by energy.

In [None]:
# mc.sort_microstates() returns a list of sorted ms objects:

ms_orig_lst = [[ms.E, ms.count, ms.state] for ms in mc.sort_microstates()]

### `ms_orig_lst` holds each microstate's attributes, i.e.: [ms.E, ms.count, ms.state], sorted by energy.
#### The first attribute is the microstate energy; the second is the multiplicity of that state (`mc.microstates` holds unique ms info); the last one is the list of conformers in that state.

#### The conformer numbers align with head3.lst. Head3.lst starts at 1; ms lists start at 0.

In [None]:
# Sample data (uncomment to display):

#print(ms_orig_lst[:2])

### This will map the microstate conformer id to its corresponding charge, and convert the conformer microstates from `mc` to charge microstates:

In [None]:
# Map conf index to conf charge:
id_vs_charge = msa.iconf2crg(conformers)

# This list is identical to ms_orig_lst but now simply gives the charge of the conformers selected for each residue.
crg_orig_lst = msa.ms2crgms(ms_orig_lst, id_vs_charge)

In [None]:
# Sample data (uncomment to display):

#print(crg_orig_lst[:2])

---
# Process the charge microstates into various outputs

# [ Cat ]
#### Only the unfiltered data is plotted and the corresponding figure saved with a weird name: output/all_en_cr_ph7_vs_log(count).pdf.
#### We could have the processing bounds saved into a dict & wrap the following processing into one function, but the names of the figures are needed.

## Unfiltered charge microstates

In [None]:
crgms_files = msa.findUniqueCrgmsCountOrder(crg_orig_lst)
print(f"Size of 'all_crg_ms_unique' (crgms_files[0]): {len(crgms_files[0]):,}")

# crgms_files:
#  0: all_crg_ms_unique,
#  1: all_count,
#  2: unique_crg_state_order,
#  3: energy_diff_all (used in plot)

all_crg_count_res = msa.ConcaCrgMsPandas(crgms_files[0],
                                         crgms_files[1],
                                         crgms_files[2], 
                                         free_residues_df,
                                         background_crg,
                                         res_of_interest
                                         )

In [None]:
# Sample data (uncomment to display):

#all_crg_count_res.head(3)

### This will plot the tautomer charge distribution of unique charge microstates based on the occupancy. 
Each dot is a unique charge microstate, color-coded by the energy difference of that microstate as it appears in the Monte Carlo run. The total charge is the sum of the background and free charges (protein charge).

In [None]:
title = "All Microstates Energy pH7" 
msa.jointplot(crgms_files,
              background_crg,
              title,
              out_dir=output_dir,
              save_name="crgms_logcount_vs_all_E_ph7.pdf",
              show=True)

# [ Raihan ]
#### A very important point: in this dot plot we need to be clear about what the numbers on these dots mean.
#### In fact we have a different type of output figure for this, which I will share with you. I believe these values are the energy range of the microstates, but the numbers sometimes do not make sense.
---

#### This is to get the unique charge microstates within 1.36 kcal/mol of the lowest energy. 
Make sure the supplied list is sorted by energy in ascending order. 
The order in the dataframe is within the selected energy range.

In [None]:
# Energy window: from the energy of the first entry of crg_orig_lst (expected
# to be the lowest, since the list is built from energy-sorted microstates)
# up to 1.36 kcal/mol above it.
begin_E = crg_orig_lst[0][0]
end_E = begin_E + 1.36
print(f"Using energy bounds ({begin_E}, {end_E})")

lowest_crgms_files = msa.findUniqueCrgmsCountOrder(
    crg_orig_lst,
    begin_energy=begin_E,
    end_energy=end_E,
)
# Items 0, 1, 2: unique charge microstates, their counts, their state order.
low_crg_count_res = msa.ConcaCrgMsPandas(
    lowest_crgms_files[0],
    lowest_crgms_files[1],
    lowest_crgms_files[2],
    free_residues_df,
    background_crg,
    res_of_interest,
)

In [None]:
# Sample data (uncomment to display):

#low_crg_count_res.head(3) 

In [None]:
title = "Microstates Energy within 1.36 kcal/mol of Lowest, pH7" 
msa.jointplot(lowest_crgms_files,
              background_crg,
              title,
              out_dir=output_dir,
              save_name="crgms_logcount_vs_lowest_E.pdf",
              show=True)

#### This is to get the unique charge microstates within +/- 0.5 pH unit (+/- 0.68 kcal/mol) of average microstate energy

In [None]:
# Energy window centered on the average microstate energy: +/- 0.68 kcal/mol.
begin_mc_aver = mc.average_E - 0.68
end_mc_aver = mc.average_E + 0.68
print(f"Using the average microstate energy +/- 0.5 pH unit (+/- 0.68 kcal/mol): ({begin_mc_aver}, {end_mc_aver})")

average_crgms_files = msa.findUniqueCrgmsCountOrder(crg_orig_lst,
                                                    begin_energy = begin_mc_aver,
                                                    end_energy = end_mc_aver
                                                    )
# Items 0, 1, 2: unique charge microstates, their counts, their state order.
# Fix: the background charge variable is `background_crg`; the previous name
# `background_charge` is not defined anywhere in this notebook.
av_crg_count_res = msa.ConcaCrgMsPandas(average_crgms_files[0],
                                        average_crgms_files[1],
                                        average_crgms_files[2], 
                                        free_residues_df,
                                        background_crg,
                                        res_of_interest
                                       )

In [None]:
# Sample data (uncomment to display):

#av_crg_count_res.head(3) 

In [None]:
title = "Microstates Energy within 0.5 pH unit (0.68 kcal/mol) of Average, pH7" 
msa.jointplot(average_crgms_files,
              background_crg,
              title,
              out_dir=output_dir,
              save_name="crgms_logcount_vs_aver_E.pdf",
              show=True)

#### This is to get the unique charge microstates within 1 pH unit (1.36 kcal/mol) of the highest microstate energy

In [None]:
# Energy window: the top 1.36 kcal/mol, ending at the highest microstate energy.
highest_crgms_files = msa.findUniqueCrgmsCountOrder(
    crg_orig_lst,
    begin_energy=(mc.highest_E - 1.36),
    end_energy=mc.highest_E,
)
# Items 0, 1, 2: unique charge microstates, their counts, their state order.
high_crg_count_res = msa.ConcaCrgMsPandas(
    highest_crgms_files[0],
    highest_crgms_files[1],
    highest_crgms_files[2],
    free_residues_df,
    background_crg,
    res_of_interest,
)

In [None]:
# Sample data (uncomment to display):

#high_crg_count_res.head(3) 

In [None]:
title = "Microstates Energy within 1.36 kcal/mol of highest, pH7"
msa.jointplot(highest_crgms_files,
              background_crg,
              title,
              out_dir=output_dir,
              save_name="crgms_logcount_vs_highest_E.pdf",
              show=True)

# TODO:
 * try these other plotting fns:
   - 'plot_hist_by_ms_energy'
   - 'plots_unique_crg_histogram'

## Save all these 3 DataFrames and non titrable residues charge information:
 * low_crg_count_res
 * av_crg_count_res
 * high_crg_count_res
 * fixed_resoi_crg_df

In [None]:
# Writing to Excel will fail if xlsxwriter is not installed:
# see https://anaconda.org/conda-forge/xlsxwriter
try:
    # this will be a user specified name:
    xl_filename = "charge_ms_dry2_semi_pH7.xlsx"
    outfile_xl = output_dir.joinpath(xl_filename)

    # Sheet name -> DataFrame. `fixed_resoi_crg_df` is None when there are
    # no fixed residues of interest, in which case its sheet is skipped.
    sheets = {
        "low_crg_count_res": low_crg_count_res,
        "av_crg_count_res": av_crg_count_res,
        "high_crg_count_res": high_crg_count_res,
        "fixed_residues_crg": fixed_resoi_crg_df,
    }

    # The context manager saves and closes the workbook on exit, so no
    # explicit writer.save() is needed (that method was removed in pandas 2.0).
    with pd.ExcelWriter(outfile_xl,
                        engine="xlsxwriter"
                       ) as writer:
        # Write each dataframe to its own sheet
        for sheet_name, sheet_df in sheets.items():
            if sheet_df is None:
                print(f"Skipping sheet '{sheet_name}': no data.")
                continue
            sheet_df.to_excel(writer, sheet_name=sheet_name)

    print(f"Saved Excel file {outfile_xl!s}.")

except ModuleNotFoundError as e:
    print("The excel writer engine must be installed;", e)
except Exception as e:
    # Best effort: report the problem without stopping the notebook.
    print(f"Error occurred while saving Excel file: {str(e)}")

In [None]:
if n_fixed_resoi:
    fixed_resoi_crg_df.head(3)

In [None]:
# Save all_crg_count_res DataFrame to a CSV file:
# Due to the column size limit in Excel for large systems, all_crg_count_res is saved in csv format.

crg_count_csv = output_dir.joinpath("all_crg_count_res.csv")

try:
    all_crg_count_res.to_csv(crg_count_csv, header=True)
    print(f"DataFrame all_crg_count_res saved to {crg_count_csv!s}.")

except Exception as e:
    print(f"Error occurred while saving all_crg_count_res to csv file: {str(e)}")

---
# Post-processing of already saved main crgms file to get the correlation matrix for residues. 

In [None]:
# This is for correlation.

all_crg_count_read = pd.read_csv(crg_count_csv, index_col=0)
all_crg_count_read 

In [None]:
# drop if the protonation is not changing in free residues

all_crg_change_df = msa.changing_residues_df(all_crg_count_read)

# TODO:
 * Output file ~ "crg_count_res_of_interest"
 * After heatmap: Add fixed res of interest in same file for saving
 * Sort matrix to prevent "scattered hot spots"

In [None]:
# this is for demo: choose your favorite resids:
# Fix: the list of chosen residue ids is named `choose_resid` (defined in the
# "Residues of interest" cell); `choose_res` is not defined in this notebook.

df_choose_res_data = msa.choose_res_data(all_crg_change_df, choose_resid)
# Occupancy: each row's share of the total count, rounded to 2 decimals.
df_choose_res_data["Occupancy"] = round(df_choose_res_data["Count"]/sum(df_choose_res_data["Count"]), 2)
df_choose_res_data

# The file name records how many residue ids were chosen:
file_name = f"cluster{len(choose_resid)}_crg_count_res.csv"
df_choose_res_data.to_csv(output_dir.joinpath(file_name), header=True)

In [None]:
df_chosen_res_renamed = msa.renameOrderResidues(df_choose_res_data)
df_chosen_res_renamed

# TODO: Check size of df: if not > 1: no need for correlation processing

# Weighted correlation

In [None]:
if df_chosen_res_renamed.shape[0] > 1:
    df_correlation = msa.WeightedCorr(df=df_chosen_res_renamed, wcol="Count")(method="pearson")
    df_correlation
else:
    print("Too few rows for correlation.")

## FIX: Does not work.
#### what is the expected output?

## Create a heatmap to display the residue correlation:
 * Kept `correl_heatmap` in notebook for tweaking

 * Existing function: `msa.corr_heat_map`
 ```
    def corr_heat_map(df_corr: pd.DataFrame, out_dir: Path, save_name: str = "corr.pdf", show: bool = False):
 ```

In [None]:
# `df_correlation` is only created when there is more than one row to
# correlate, so apply the same condition before plotting it:
if df_chosen_res_renamed.shape[0] > 1:
    msa.corr_heat_map(df_correlation,
                      out_dir=output_dir,
                      save_name="corr.pdf",
                      show=True)
else:
    print("No correlation matrix to plot.")

In [None]:
def correl_heatmap(correl_df: pd.DataFrame, out_dir: str, fig_name: str, show: bool = False,
                   title: str = "Loaded"):
    """Draw an annotated heatmap of a correlation matrix and save it to a file.

    Args:
        correl_df: Correlation matrix to plot. Values are colored using seven
            discrete bands with edges at -1, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 1.
        out_dir: Folder where the figure is saved; a str or a pathlib.Path
            (the value is wrapped in Path()).
        fig_name: File name of the saved figure, e.g. "corr.pdf".
        show: If True, display the figure after saving; otherwise the figure
            is closed once it has been saved.
        title: Title placed above the heatmap; defaults to "Loaded", the value
            that was previously hard-coded.

    Returns:
        None
    """
    fig, ax = plt.subplots(figsize=(25, 8))

    # Discrete colormap: one color per interval between consecutive bounds.
    cmap = ListedColormap(["darkred", "red", "pink", "lightgray","skyblue", "blue", "darkblue"])
    bounds = [-1.0, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 1.0]
    norm = BoundaryNorm(bounds, cmap.N)

    heatmap = sns.heatmap(correl_df,
                          ax=ax,
                          linecolor="gray",
                          cmap=cmap,
                          norm=norm,
                          square=True,
                          fmt=".2f",
                          linewidths=.01,
                          annot=True,
                          annot_kws={"fontsize":12}
                         )
    heatmap.set_title(title, fontdict={"fontsize":18}, pad=16)
    cbar = heatmap.collections[0].colorbar
    cbar.ax.tick_params(labelsize=20)

    # No axis labels; the tick labels carry the row/column names.
    ax.set_ylabel("")
    ax.set_xlabel("")
    ax.tick_params(axis="y", labelsize=15, labelrotation=0)
    ax.tick_params(axis="x", labelsize=15, labelrotation=90)

    corr_pdf = Path(out_dir).joinpath(fig_name)
    fig.savefig(corr_pdf, dpi=600, bbox_inches="tight")
    print(f"Figure saved: {corr_pdf!s}")
    if show:
        plt.show()
    else:
        # Release the figure so it does not stay registered with pyplot.
        plt.close(fig)

    return

In [None]:
print("Using correl_heatmap:")

# `df_correlation` is only created when there is more than one row to correlate.
# The figure gets its own file name so it does not overwrite the "corr.pdf"
# written by msa.corr_heat_map into the same folder.
if df_chosen_res_renamed.shape[0] > 1:
    correl_heatmap(df_correlation,
                   out_dir=output_dir, fig_name="corr_heatmap.pdf",
                   show=True)
else:
    print("No correlation matrix to plot.")

In [None]:
msa.plots_unique_crg_histogram?

In [None]:
# Show in notebook:

msa.plot_hist_by_ms_energy?