In [4]:
import os
import ast

import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
import matplotlib.patches as mpatches

import pickle

import umap

import metaspace
from metaspace import SMInstance

from viz import get_ds_list, get_ion_imgs, ion_cluster, plot_ion_imgs, label_point, imshow_ions, cluster_viz
from metadata import get_meta_df, post_processing

# Notebook-wide matplotlib defaults: every figure is 13x9 at 300 dpi.
plt.rcParams.update({
    'figure.figsize': (13, 9),
    'figure.dpi': 300,
})

In [10]:
# Relative path of the model/vector file that is handed to `post_processing`
# in the next cell.
vec_file = "slurm_job/Vanilla_output/ThNo1No2gensim_q10p50_10iters.model.txt"

In [13]:
# Build the post-processing helper for `vec_file` and fetch its info table.
# `u_embed=True` presumably asks for the UMAP coordinates (umap_x / umap_y)
# that the plots below read -- TODO confirm in metadata.post_processing.
p = post_processing(vec_file, u_embed=True)
meta_df = p.get_info_df()
# Disabled alternative loader:
#meta_df = get_meta_df(vec_file, txt=True, embed=False)
# NOTE(review): the cells below read `all_df`, `single_ds_df` and
# `multiple_ds_df`, none of which is assigned anywhere in the visible
# notebook; they look like they should be derived from `meta_df` -- confirm
# and add the missing cell so Restart & Run All works.

In [None]:
# Colour lookups shared by the scatter plots in this notebook.
pal = sns.color_palette()

# (dataset name, index into `pal`) pairs for ions found in exactly one dataset.
# Indices 7 and 8 are deliberately unused; the DHB dataset takes index 9.
_single_ds_palette_idx = [
    ('whole body xenograft (1) [RMS norm]', 0),
    ('wb xenograft trp pathway dosed- rms_corrected', 1),
    ('whole body xenograft (2) [RMS norm]', 2),
    ('Servier_Ctrl_mouse_wb_lateral_plane_9aa', 3),
    ('Servier_Ctrl_mouse_wb_median_plane_9aa', 4),
    ('Servier_Ctrl_mouse_wb_median_plane_chca', 5),
    ('Servier_Ctrl_mouse_wb_lateral_plane_chca', 6),
    ('Servier_Ctrl_mouse_wb_lateral_plane_DHB', 9),
]
color_dict_single = {ds_name: pal[idx] for ds_name, idx in _single_ds_palette_idx}

# Ions shared by several datasets are drawn in one neutral colour.
color_dict_multiple = {'Multiple Datasets': 'gray'}

# Palette for 0/1 indicator columns: absent -> light gray, present -> pal[9].
binary_dict = {0: 'lightgray', 1: pal[9]}

In [None]:
# Split each frame into rows tied to one dataset and rows labelled
# 'Multiple Datasets'.
# NOTE(review): `sh_df` and `nosh_df` are not assigned anywhere in the visible
# notebook -- this cell fails on a fresh kernel unless they are created first.
single_sh = sh_df[sh_df['single_dataset_name'].ne('Multiple Datasets')]
multiple_sh = sh_df[sh_df['single_dataset_name'].eq('Multiple Datasets')]

single_nosh = nosh_df[nosh_df['single_dataset_name'].ne('Multiple Datasets')]
multiple_nosh = nosh_df[nosh_df['single_dataset_name'].eq('Multiple Datasets')]

In [None]:
# UMAP scatter coloured by the 0/1 indicator column for the lateral-plane DHB
# dataset.
# NOTE(review): `binary_df` is only assigned further down (in the No2_both
# comparison section), so this cell depends on out-of-order execution.
ax = sns.scatterplot(
    data=binary_df,
    x='umap_x',
    y='umap_y',
    hue='Servier_Ctrl_mouse_wb_lateral_plane_DHB',
    palette=binary_dict,
)
ax.set(
    title='GenVa 100: UMAP embeddings colored by occurence in Servier_Ctrl_mouse_wb_lateral_plane_DHB',
    xlabel='UMAP dimension 1',
    ylabel='UMAP dimension 2',
)

# Disabled alternatives kept for reference:
#ax = sns.scatterplot(data = all_df, x = 'umap_x', y = 'umap_y', hue = all_df[one_hot.classes_].apply(tuple, axis=1),
#                     palette=palette,  s = 40, )
#sns.scatterplot(data = multiple_ds_df, x = 'umap_x', y='umap_y', hue = 'single_dataset_name', palette = color_dict_multiple, alpha = 0.3)
#label_point(all_df.umap_x, all_df.umap_y, all_df.mol_name, plt.gca())
#left, bottom, width, height = (10.5, 10.6, 2.4, 2.6)
#rect=mpatches.Rectangle((left,bottom),width,height, fill=False,color="purple",linewidth=2)
#plt.gca().add_patch(rect)
#sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()
#plt.savefig('plots/GenVa100_AllData_datasets_lateral_DHB.png', format='png')

In [None]:
# UMAP scatter of all ions, one colour per `super_class` value (default palette).
# NOTE(review): `all_df` is not assigned anywhere in the visible notebook.
ax = sns.scatterplot(data=all_df, x='umap_x', y='umap_y', hue='super_class')
ax.set(
    title='GenVa 100: UMAP embeddings colored by superclass',
    xlabel='UMAP dimension 1',
    ylabel='UMAP dimension 2',
)

# Disabled options:
#label_point(all_df.umap_x, all_df.umap_y, all_df.mol_name, plt.gca())
#sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()
#plt.savefig('plots/GenVa100_AllData_superclass.png', format='png')

Closer look at cluster 1 (the region outlined by the dark-blue rectangle in the next plot).

In [None]:
# Bounding box of cluster 1 in UMAP coordinates. These four names are read
# again by the following `ion_cluster` cell, so they stay module-level.
left, bottom, width, height = (5.5, 5.85, 3.5, 2.5)

# Single-dataset ions in per-dataset colours, multi-dataset ions as a faint
# gray layer on the same axes.
# NOTE(review): `single_ds_df` / `multiple_ds_df` are not assigned anywhere in
# the visible notebook.
ax = sns.scatterplot(
    data=single_ds_df, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_single,
)
sns.scatterplot(
    data=multiple_ds_df, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_multiple,
    alpha=0.3, ax=ax,
)
ax.set(
    title='GenVa100: Cluster 1',
    xlabel='UMAP dimension 1',
    ylabel='UMAP dimension 2',
)
#label_point(all_df.umap_x, all_df.umap_y, all_df.mol_name, plt.gca())

# Outline the cluster region.
rect = mpatches.Rectangle(
    xy=(left, bottom), width=width, height=height,
    fill=False, color="darkblue", linewidth=2,
)
ax.add_patch(rect)
#sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()
#plt.savefig('plots/GenVa100_AllData_Cluster1.png', format='png')

In [None]:
# Call `ion_cluster` on the cluster-1 box twice: first coloured by superclass,
# then by dataset. `c1` ends up holding the first return value of the second
# (dataset) call; the second return value is discarded.
# NOTE: the bounds come from `left/bottom/width/height` set in the cluster-1
# plotting cell; the cluster-2 plotting cell reassigns those names, so re-run
# the cluster-1 plotting cell first when executing out of order.
for hue_col, plot_title in [
    ('super_class', 'GenVa100: Cluster 1, superclass and molecule names'),
    ('single_dataset_name', 'GenVa100: Cluster 1, dataset and molecule names'),
]:
    c1, _ = ion_cluster(
        all_df,
        left, left + width,
        bottom, bottom + height,
        hue_col, 'mol_name',
        title=plot_title, output=None,
        text_size=6, marker_size=60,
    )

In [None]:
# Show the ions selected for cluster 1; `out` presumably names the output
# file/folder -- TODO confirm in viz.imshow_ions.
imshow_ions(c1, out='GenVa100_cluster1_ionimgs')

Closer look at cluster 2.

In [None]:
# Bounding box of cluster 2: [left, bottom, width, height] in UMAP coordinates.
c2_bb = [9.8, 1.8, 2.5, 2.6]
# Note: this overwrites the module-level bounds set by the cluster-1 cell.
left, bottom, width, height = c2_bb

# Single-dataset ions in colour, multi-dataset ions as a faint gray layer.
ax = sns.scatterplot(
    data=single_ds_df, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_single,
)
sns.scatterplot(
    data=multiple_ds_df, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_multiple,
    alpha=0.3, ax=ax,
)
ax.set(
    title='GenVa100: Cluster 2',
    xlabel='UMAP dimension 1',
    ylabel='UMAP dimension 2',
)

# Outline the cluster region.
rect = mpatches.Rectangle(
    xy=(left, bottom), width=width, height=height,
    fill=False, color="orange", linewidth=2,
)
ax.add_patch(rect)
plt.tight_layout()
#plt.savefig('plots/GenVa100_AllData_Cluster2.png', format='png')

In [None]:
# Call `ion_cluster` on the cluster-2 box (from `c2_bb`) twice: coloured by
# superclass, then by dataset. `c2` keeps the first return value of the
# second call; the second return value is discarded.
for hue_col, plot_title in [
    ('super_class', 'GenVa100: Cluster 2, superclass and molecule names'),
    ('single_dataset_name', 'GenVa100: Cluster 2, dataset and molecule names'),
]:
    c2, _ = ion_cluster(
        all_df,
        c2_bb[0], c2_bb[0] + c2_bb[2],
        c2_bb[1], c2_bb[1] + c2_bb[3],
        hue_col, 'mol_name',
        title=plot_title, output=None,
        text_size=6, marker_size=60,
    )

In [None]:
# Show the ions selected for cluster 2; `out` presumably names the output
# file/folder -- TODO confirm in viz.imshow_ions.
imshow_ions(c2, out='GenVa100_cluster2_ionimgs')

In [None]:
# Bounding box of cluster 3: [left, bottom, width, height] in UMAP coordinates.
c3_bb = [-7, 1.9, 3.7, 2.3]

# Single-dataset ions in colour, multi-dataset ions as a faint gray layer.
ax = sns.scatterplot(
    data=single_ds_df, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_single,
)
sns.scatterplot(
    data=multiple_ds_df, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_multiple,
    alpha=0.3, ax=ax,
)
ax.set(
    title='GenVa100: Cluster 3',
    xlabel='UMAP dimension 1',
    ylabel='UMAP dimension 2',
)

# Outline the cluster region.
rect = mpatches.Rectangle(
    xy=(c3_bb[0], c3_bb[1]), width=c3_bb[2], height=c3_bb[3],
    fill=False, color="grey", linewidth=2,
)
ax.add_patch(rect)
plt.tight_layout()
#plt.savefig('plots/GenVa100_AllData_Cluster3.png', format='png')

In [None]:
# Call `ion_cluster` on the cluster-3 box (from `c3_bb`) twice: coloured by
# superclass, then by dataset. `c3` keeps the first return value of the
# second call; the second return value is discarded.
for hue_col, plot_title in [
    ('super_class', 'GenVa100: Cluster 3, superclass and molecule names'),
    ('single_dataset_name', 'GenVa100: Cluster 3, dataset and molecule names'),
]:
    c3, _ = ion_cluster(
        all_df,
        c3_bb[0], c3_bb[0] + c3_bb[2],
        c3_bb[1], c3_bb[1] + c3_bb[3],
        hue_col, 'mol_name',
        title=plot_title, output=None,
        text_size=5, marker_size=60,
    )

In [None]:
# Show the ions selected for cluster 3; `out` presumably names the output
# file/folder -- TODO confirm in viz.imshow_ions.
imshow_ions(c3, out='GenVa100_cluster3_ionimgs')

In [None]:
# Bounding box of cluster 4: [left, bottom, width, height] in UMAP coordinates.
c4_bb = [0.3, 4.45, 1.6, 1.3]

# Single-dataset ions in colour, multi-dataset ions as a faint gray layer.
ax = sns.scatterplot(
    data=single_ds_df, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_single,
)
sns.scatterplot(
    data=multiple_ds_df, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_multiple,
    alpha=0.3, ax=ax,
)
ax.set(
    title='GenVa100: Cluster 4',
    xlabel='UMAP dimension 1',
    ylabel='UMAP dimension 2',
)

# Outline the cluster region.
rect = mpatches.Rectangle(
    xy=(c4_bb[0], c4_bb[1]), width=c4_bb[2], height=c4_bb[3],
    fill=False, color="darkgreen", linewidth=2,
)
ax.add_patch(rect)
plt.tight_layout()
#plt.savefig('plots/GenVa100_AllData_Cluster4.png', format='png')

In [None]:
# Call `ion_cluster` on the cluster-4 box (from `c4_bb`) twice: coloured by
# superclass, then by dataset. `c4` keeps the first return value of the
# second call; the second return value is discarded.
for hue_col, plot_title in [
    ('super_class', 'GenVa100: Cluster 4, superclass and molecule names'),
    ('single_dataset_name', 'GenVa100: Cluster 4, dataset and molecule names'),
]:
    c4, _ = ion_cluster(
        all_df,
        c4_bb[0], c4_bb[0] + c4_bb[2],
        c4_bb[1], c4_bb[1] + c4_bb[3],
        hue_col, 'mol_name',
        title=plot_title, output=None,
        text_size=8, marker_size=80,
    )

In [None]:
# Show the ions selected for cluster 4; `out` presumably names the output
# file/folder -- TODO confirm in viz.imshow_ions.
imshow_ions(c4, out='GenVa100_cluster4_ionimgs')

In [None]:
# Compare to No2_both (positive and negative mode).
# Relative path of the second model/vector file; it is loaded with
# `get_meta_df` in the next cell.
No2both_vecs = "slurm_job/ThNo2_both_gensim_q00p50_10iters_sg.model.txt"

In [None]:
# Load the metadata table for the No2_both model. `txt=True` / `embed=True`
# presumably mean "text-format input" and "add the UMAP embedding columns"
# -- TODO confirm in metadata.get_meta_df.
two_df = get_meta_df(No2both_vecs, txt=True, embed=True)

In [None]:
# Add one indicator column per dataset name to `two_df`.
# Steps: expand each row's `ds_names` entry into long form (one dataset name
# per row, original index kept as level 0), one-hot encode the names, then
# collapse back to one row per original index by summing the indicators.
# `ds_names` presumably holds a list of dataset names per ion -- confirm.
#
# `.groupby(level=0).sum()` replaces `.sum(level=0)`: the `level` argument of
# `sum` was deprecated in pandas 1.3 and removed in pandas 2.0, where the old
# spelling raises TypeError. The result is the same.
ds_indicator = (
    pd.get_dummies(two_df['ds_names'].apply(pd.Series).stack())
    .groupby(level=0)
    .sum()
)
binary_df = pd.concat([two_df, ds_indicator], axis=1)

In [None]:
# Display the frame with the added per-dataset indicator columns as a sanity
# check (notebook rich display of the cell's last expression).
binary_df

In [None]:
# Split the No2_both table into ions tied to one dataset and ions labelled
# 'Multiple Datasets'; the two parts are plotted as separate layers below.
single_ds_df2 = two_df[two_df['single_dataset_name'].ne('Multiple Datasets')]
multiple_ds_df2 = two_df[two_df['single_dataset_name'].eq('Multiple Datasets')]

In [None]:
# UMAP scatter for the No2_both model: single-dataset ions in per-dataset
# colours, multi-dataset ions as a faint gray layer on the same axes.
ax = sns.scatterplot(
    data=single_ds_df2, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_single,
)
sns.scatterplot(
    data=multiple_ds_df2, x='umap_x', y='umap_y',
    hue='single_dataset_name', palette=color_dict_multiple,
    alpha=0.3, ax=ax,
)
# Axis labels added so this figure matches the other UMAP plots in the
# notebook instead of showing the raw column names.
ax.set(
    title='GenVa: UMAP embeddings colored by occurence in datasets',
    xlabel='UMAP dimension 1',
    ylabel='UMAP dimension 2',
)

# Disabled alternatives kept for reference:
#ax = sns.scatterplot(data = all_df, x = 'umap_x', y = 'umap_y', hue = all_df[one_hot.classes_].apply(tuple, axis=1),
#                     palette=palette,  s = 40, )
#label_point(all_df.umap_x, all_df.umap_y, all_df.mol_name, plt.gca())
#left, bottom, width, height = (10.5, 10.6, 2.4, 2.6)
#rect=mpatches.Rectangle((left,bottom),width,height, fill=False,color="purple",linewidth=2)
#plt.gca().add_patch(rect)
#sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()

# savefig does not create missing directories; make sure 'plots/' exists so a
# fresh checkout does not fail with FileNotFoundError.
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/GenVa_No2both_datasets.png', format='png')