In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import scipy.stats as stats
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import DBSCAN
import umap
import matplotlib as mpl
import matplotlib.pyplot as plt

#One-hot encode every row's mutations by mutation type ("<original>-<mutated>")
file_path = './cluster_resultsv3.csv'
data = pd.read_csv(file_path)

amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
               'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
# All 20 x 20 ordered (original, mutated) pairs, in a fixed, reproducible order.
mutation_types = [f'{aa1}-{aa2}' for aa1 in amino_acids for aa2 in amino_acids]
one_hot_df = pd.DataFrame(0, index=data.index, columns=mutation_types)
appearing_mutation_types = set()

for idx, row in data.iterrows():
    # Columns 'I' and 'J' hold comma-separated residue letters (e.g. 'I,V' and
    # 'F,G') that are paired positionally. Whitespace around a letter is
    # stripped so that 'I, V' is not silently discarded as the unknown ' V'.
    original_aa = [aa.strip() for aa in row['I'].split(',')]
    mutated_aa = [aa.strip() for aa in row['J'].split(',')]

    for orig, mut in zip(original_aa, mutated_aa):
        mutation = f'{orig}-{mut}'
        # Pairs that are not built from the 20 standard letters are ignored.
        if mutation in one_hot_df.columns:
            one_hot_df.at[idx, mutation] = 1
            appearing_mutation_types.add(mutation)

# Keep only the mutation types that were observed. The columns follow the
# order of `mutation_types` instead of set iteration order, which changes
# between kernel sessions (string hash randomisation) and would make the
# downstream PCA / t-SNE input differ from run to run.
one_hot_df_filtered = one_hot_df[
    [mutation for mutation in mutation_types if mutation in appearing_mutation_types]
]

In [None]:
#Global matplotlib style used by every figure in this notebook
mpl.rcParams.update({
    # Fonts (fonttype 42 embeds TrueType fonts in PS/PDF output)
    'ps.fonttype': 42,
    'pdf.fonttype': 42,
    'font.family': 'Arial',
    'font.size': 7,
    # Lines and axes
    'lines.linewidth': 0.5,
    'axes.titlesize': 7,
    'axes.labelsize': 7,
    'axes.titlepad': 2,
    'axes.labelpad': 2,
    'axes.linewidth': 0.5,
    # Ticks
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'xtick.major.size': 2,
    'ytick.major.size': 2,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'xtick.major.pad': 1,
    'ytick.major.pad': 1,
    # Legend
    'legend.frameon': False,
    'legend.fontsize': 6,
    'legend.labelspacing': 0.1,
    'legend.handlelength': 1,
    'legend.handletextpad': 0.2,
    # Figure
    'figure.dpi': 180,
})

In [None]:
#Helper that creates a figure and its axes grid, sized from a per-axes width and height
def set_figure(axsize=(4, 2), dim=(1, 1), ws_ratio=0.4, hs_ratio=0.4, **kwargs):
    """Create a figure whose size is derived from a per-axes size.

    Parameters
    ----------
    axsize : tuple
        ``(width, height)`` of a single axes; it is scaled into the
        ``figsize`` passed to ``plt.subplots``.
    dim : tuple
        ``(n_rows, n_cols)`` of the subplot grid.
    ws_ratio, hs_ratio : float
        Horizontal / vertical spacing between axes, as a fraction of the axes
        width / height. The two values must be equal.
    **kwargs
        Forwarded to ``plt.subplots``.

    Returns
    -------
    (fig, axes)
        Exactly what ``plt.subplots`` returned.

    Raises
    ------
    AssertionError
        If ``dim`` is not a 2-tuple or ``ws_ratio != hs_ratio``.
    """
    # The original asserts ended in a dangling comma (a SyntaxError); they now
    # carry the message that the comma was waiting for.
    assert isinstance(dim, tuple), 'dim must be a tuple'
    assert len(dim) == 2, 'dim must have exactly two entries: (n_rows, n_cols)'
    assert ws_ratio == hs_ratio, 'ws_ratio and hs_ratio must be equal'

    n_rows, n_cols = dim
    axw, axh = axsize[0], axsize[1]

    if dim == (1, 1):
        # Single axes: leave a margin around it for tick labels.
        figsize = (axw * 1.25, axh * 1.25)
        layout = dict(left=0.15, bottom=0.15, right=0.95, top=0.95)
    else:
        # Grid: the axes fill the whole figure, separated by the spacing ratios.
        figsize = (axw * n_cols + axw * ws_ratio * (n_cols - 1),
                   axh * n_rows + axh * hs_ratio * (n_rows - 1))
        layout = dict(left=0, bottom=0, right=1, top=1)
        # Only set the spacing along directions that have more than one axes.
        if n_cols > 1:
            layout['wspace'] = ws_ratio
        if n_rows > 1:
            layout['hspace'] = hs_ratio

    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, **kwargs)
    fig.subplots_adjust(**layout)

    # fig.axes lists every axes regardless of whether `axes` is a single Axes
    # or an array (e.g. when squeeze=False is passed for a 1x1 grid).
    for ax in fig.axes:
        ax.tick_params(length=2, width=0.5, pad=1)
        ax.xaxis.set_tick_params(labelbottom=True)
        ax.yaxis.set_tick_params(labelleft=True)

    return fig, axes

In [None]:
#Apply PCA followed by t-SNE to reduce the feature dimension to 2
def tsne_dr(exp_df, filter=True, filter_quantile=0.5, n_components=10, perplexity=50, n_iter=4000, metric='euclidean', random_state=0, **kwargs):
    """Embed the rows of ``exp_df`` in 2-D with PCA followed by t-SNE.

    Steps: optionally keep only the high-variance columns, z-score each
    column (columns that become NaN are dropped), reduce with PCA, then run
    t-SNE on the principal components.

    Parameters
    ----------
    exp_df : pandas.DataFrame
        Numeric feature matrix, one row per sample.
    filter : bool
        If true, keep only the columns whose variance is strictly above the
        ``filter_quantile`` quantile of all column variances.
    filter_quantile : float
        Quantile used by the variance filter.
    n_components : int
        Number of principal components fed to t-SNE. An integer value is
        capped at ``min(n_samples, n_features)``, the most PCA accepts.
    perplexity, metric
        Passed to ``TSNE``.
    n_iter : int
        Number of t-SNE optimisation iterations.
    random_state : int
        Seed used for both PCA and t-SNE.
    **kwargs
        Additional keyword arguments forwarded to ``TSNE``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'t-SNE 1'`` and ``'t-SNE 2'``, indexed like ``exp_df``.
    """
    import inspect  # local import: only needed for the scikit-learn version check

    if filter:
        # Sorted by decreasing variance, so the kept columns are in that order.
        variances = exp_df.var().sort_values(ascending=False)
        kept_columns = list(variances[variances > variances.quantile(filter_quantile)].index)
        plot_df_zscore = exp_df[kept_columns]
    else:
        plot_df_zscore = exp_df.copy()

    # Constant columns z-score to NaN and are removed here.
    plot_df_zscore = plot_df_zscore.apply(stats.zscore).dropna(axis=1)

    # PCA raises if asked for more components than min(n_samples, n_features).
    if isinstance(n_components, int):
        n_components = min(n_components, plot_df_zscore.shape[0], plot_df_zscore.shape[1])

    # The caller's seed is honoured (it used to be hard-coded to 0).
    pca = PCA(n_components=n_components, random_state=random_state)
    principal_components = pd.DataFrame(
        pca.fit_transform(plot_df_zscore), index=plot_df_zscore.index)

    # Newer scikit-learn renamed TSNE's `n_iter` to `max_iter`; use whichever
    # name the installed version accepts.
    tsne_parameters = inspect.signature(TSNE.__init__).parameters
    iteration_keyword = 'max_iter' if 'max_iter' in tsne_parameters else 'n_iter'

    exp_embedded = TSNE(
        n_components=2, perplexity=perplexity, random_state=random_state,
        metric=metric, **{iteration_keyword: n_iter}, **kwargs,
    ).fit_transform(principal_components)

    return pd.DataFrame(data=exp_embedded, columns=['t-SNE 1', 't-SNE 2'],
                        index=plot_df_zscore.index)

In [None]:
#Plot the t-SNE embedding as a scatter plot
def draw_tsne(plot_df, hue=None, style=None, ax=None, **kwargs):
    """Scatter-plot a 2-D t-SNE embedding.

    Parameters
    ----------
    plot_df : pandas.DataFrame
        Must contain the columns ``'t-SNE 1'`` and ``'t-SNE 2'``, plus any
        column named by ``hue`` / ``style``.
    hue, style : str, optional
        Column names passed to ``sns.scatterplot`` for colour / marker style.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, a new figure is created with
        ``set_figure(axsize=(2, 2), dim=(1, 1))``.
    **kwargs
        Forwarded to ``sns.scatterplot``. ``palette`` defaults to ``'tab20'``
        when ``hue`` is given.

    Returns
    -------
    matplotlib Axes
        The axes that was drawn on. When the figure was created here it is
        reachable through ``ax.figure``.
    """
    if ax is None:
        _, ax = set_figure(axsize=(2, 2), dim=(1, 1))

    if hue is not None:
        kwargs.setdefault('palette', 'tab20')

    sns.scatterplot(x='t-SNE 1', y='t-SNE 2', data=plot_df, hue=hue, style=style,
                    s=5, linewidth=0, ax=ax, **kwargs)

    ax.set_title('t-SNE', fontsize=7)

    # Only build a legend when something is labelled; without hue/style there
    # are no handles and ax.legend() would just emit a warning.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(fontsize=6, loc='center left', bbox_to_anchor=(1, 0.5), ncol=1, markerscale=2)

    ax.set_xlabel('')
    ax.set_ylabel('')

    # The old trailing `if ax is None: return fig, ax` could never run because
    # `ax` is always bound by now; the function has always returned `ax`, and
    # that is kept so existing callers are unaffected.
    return ax

# Embed the filtered one-hot mutation matrix in 2-D, then plot the embedding.
tsne_result = tsne_dr(
    one_hot_df_filtered,
    perplexity=70,
    n_iter=3000,
    random_state=42,
    metric='euclidean',
)
draw_tsne(tsne_result)

In [None]:
#Utilize DBSCAN to cluster the mutation-type based on reduced features
def tsne_dbscan(tsne_df, tsne_hue=None, eps=1, min_samples=50, tsne_palette='tab20', dbscan_palette='tab20', dbscan_metric='euclidean', output_file='cluster_multiple.jpeg'):
    """Cluster a t-SNE embedding with DBSCAN, plot it and save the figure.

    Parameters
    ----------
    tsne_df : pandas.DataFrame
        Must contain the columns ``'t-SNE 1'`` and ``'t-SNE 2'``.
        NOTE: a ``'cluster'`` column holding the DBSCAN labels is added to
        this frame in place.
    tsne_hue, tsne_palette
        Currently unused; kept so existing calls keep working.
    eps, min_samples : float, int
        DBSCAN neighbourhood radius and minimum neighbourhood size.
    dbscan_palette
        Palette used to colour the clusters in the plot.
    dbscan_metric : str
        Distance metric passed to DBSCAN.
    output_file : str or None
        Path the figure is written to (300 dpi, tight bounding box); the
        image format follows the file extension. Pass ``None`` to skip saving.

    Returns
    -------
    (fig, ax, labels)
        The figure, its axes and the ``'cluster'`` column of ``tsne_df``.
    """
    fig, ax = set_figure(axsize=(4, 4), dim=(1, 1), ws_ratio=0.4, hs_ratio=0.4)

    # Cluster on the two embedding coordinates only, so a 'cluster' column
    # left over from an earlier call does not leak into the distances.
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric=dbscan_metric).fit(
        tsne_df[['t-SNE 1', 't-SNE 2']])

    tsne_df['cluster'] = clustering.labels_
    draw_tsne(tsne_df, hue='cluster', palette=dbscan_palette, alpha=1, ax=ax)

    if output_file is not None:
        fig.savefig(output_file, dpi=300, bbox_inches='tight')

    return fig, ax, tsne_df['cluster']

# Cluster the embedding, attach the labels to the source table and export it.
_, _, cluster_labels = tsne_dbscan(
    tsne_result,
    eps=5,
    min_samples=28,
    dbscan_metric='euclidean',
)
output_file_path = 'cluster_results.csv'  # Adjust the file path as needed
data['Cluster'] = cluster_labels
data.to_csv(output_file_path, index=False)