# Do nonnegative matrix factorization to demix fluorescence into the component fluorophores

In [18]:
%matplotlib widget
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib as mpl
import time
import matplotlib.pyplot as plt
from matplotlib.path import Path as mplPath
import matplotlib.cm as cm
import matplotlib.patches as patches
from scipy import stats
import math
import numpy as np
from scipy import interpolate
from pathlib import Path
import os 
from gating_util import ScatterSelectorGating
from bsccm import BSCCM
from demixing_util import *

# Set True to demix the coherent-illumination dataset instead of the standard one
COHERENT = False

# Folder holding the local copies of every BSCCM dataset variant
bsccm_local_root = str(Path.home()) + '/BSCCM_local/'

# Dataset the reference spectra are computed from (always the standard BSCCM)
bsccm_with_spectra = BSCCM(bsccm_local_root + 'BSCCM/')

# Dataset containing the data to demix
if COHERENT:
    data_root = bsccm_local_root + 'BSCCM-coherent/'
else:
    data_root = bsccm_local_root + 'BSCCM/'
bsccm_with_data = BSCCM(data_root)

# Where the updated surface marker table is written. Derived from the dataset
# actually being demixed so that COHERENT=True does not point at the
# non-coherent dataset's CSV.
dataframe_saving_fullpath = data_root + 'BSCCM_surface_markers.csv'
# For exporting figures
export_dir = '/home/henry/leukosight_data/figures/demixing'


single_markers = ['CD123', 'CD3', 'CD19', 'CD56', 'HLA-DR', 'CD45', 'CD14', 'CD16', 'autofluor']

# The names of the fluorescent measurements put into a database
channel_names = ['Fluor_426-446_shading_corrected', 
       'Fluor_500-550_shading_corrected', 
       'Fluor_550-570_shading_corrected', 
       'Fluor_585-625_shading_corrected', 
       'Fluor_627-673_shading_corrected', 
       'Fluor_690-_shading_corrected']

# Saved-selection names, one per single marker, keyed by batch number (0 and 1)
selections = {batch: ['selection_example_{}_positive_cells_batch_{}'.format(m, batch) 
               for m in single_markers] for batch in range(2)}

# Names of the unmixed outputs; markers separated by '/' are reported as one channel
unmixed_channel_names = ['CD123/HLA-DR/CD14', 'CD3/CD19/CD56', 'CD45', 'CD16', 'autofluor']

    
# Prepare the raw data: one row per cell, one column per fluorescence channel
mixed_data = bsccm_with_data.surface_marker_dataframe[channel_names].to_numpy()
# Make everything positive: shift each channel so that its minimum becomes 1e-2.
# A new array is built instead of subtracting in place because to_numpy() is
# not guaranteed to return a copy, and the raw columns in the dataframe (which
# is written back to disk later) must stay untouched.
mixed_data = mixed_data - (np.min(mixed_data, axis=0) - 1e-2)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Opening BSCCM (this may take a few seconds)...
BSCCM Opened
Opening BSCCM (this may take a few seconds)...
BSCCM Opened


# Unmix single-stain data with a 2-spectrum (antibody + autofluor) or 1-spectrum (autofluor only) model

In [19]:
# Settings shared by every single-stain fit
single_model_suffix = '_single_antibody_model_unmixed'
l1_reg = 7e-1

for batch in bsccm_with_data.index_dataframe.batch.unique():
    
    #demix with measurements specific to each batch
    single_marker_unmix_channel_spectra, single_marker_unmix_channel_brightness, unmix_channel_spectra, unmix_channel_brightness = \
        compute_spectra(bsccm_with_spectra, channel_names, unmixed_channel_names, single_markers, batch=batch)
    
    # Every output column written for this batch, in first-seen order.
    # The clean-up step below must cover all of them, not only the columns of
    # whichever antibody happened to be processed last.
    batch_columns = []
    
    for antibodies in bsccm_with_data.index_dataframe.antibodies.unique():
        print(antibodies, batch, '\t\t\t\t\t\t\t')
        mask = np.logical_and(bsccm_with_data.index_dataframe.antibodies == antibodies,
              bsccm_with_data.index_dataframe.batch == batch)
        if antibodies == 'unstained':
            spectra_names = ['autofluor'] # single spectrum
            spectra = np.stack([single_marker_unmix_channel_spectra[c] for c in spectra_names], axis=0)
            reweighting = [1]
        elif antibodies == 'all':
            continue # 2 spectra model is misspecified for this one
        else:
            spectra_names = [antibodies, 'autofluor']
            spectra = np.stack([single_marker_unmix_channel_spectra[c] for c in spectra_names], axis=0)
            marker_mag = single_marker_unmix_channel_brightness[antibodies]
            autofluor_vec = single_marker_unmix_channel_spectra['autofluor']
            marker_vec = single_marker_unmix_channel_spectra[antibodies]
            # Regularization weight for the marker: overlap of its spectrum with
            # the autofluorescence spectrum, divided by the marker brightness
            weighted_proj = (autofluor_vec @ marker_vec) / marker_mag

            reweighting = [weighted_proj, 1]
            
        unmixed, background_spectrum = do_factorization(mixed_data[mask], spectra,
                l1_reg = l1_reg,
                momentum=0.9,
                learning_rate = 1e3,
                background_learning_rate=1e-1,
                reweighting=reweighting)
        
        # put results into the dataframe
        # NOTE(review): positions from flatnonzero are used as .loc labels, which
        # assumes surface_marker_dataframe has a 0..N-1 index aligned row-for-row
        # with index_dataframe -- confirm
        mask_indices = np.flatnonzero(mask)
        columns = [name + single_model_suffix for name in spectra_names]
        bsccm_with_data.surface_marker_dataframe.loc[mask_indices, columns] = unmixed
        for column in columns:
            if column not in batch_columns:
                batch_columns.append(column)
    
    if not batch_columns:
        continue # nothing was unmixed for this batch
    
    # Raise every value below the 0.5th percentile of the nonzero values (this
    # includes the exact zeros) up to that percentile.
    # This shouldn't appreciably change the data but makes it easier to deal with in log space
    mask = bsccm_with_data.index_dataframe.batch == batch
    mask_indices = np.flatnonzero(mask)
    data = bsccm_with_data.surface_marker_dataframe.loc[mask_indices, batch_columns].to_numpy(copy=True)
    for col_index in range(len(batch_columns)):
        t = data[:, col_index] # view: edits below land in `data`
        nonzero = t[t != 0]
        # Rows never unmixed with this column hold NaN; they don't set the floor
        nonzero = nonzero[~np.isnan(nonzero)]
        if nonzero.size == 0:
            continue # column is empty / all zero in this batch: no floor to apply
        bottom = np.nanpercentile(nonzero, 0.5)
        t[t<bottom] = bottom # NaN compares False, so missing entries stay NaN

    bsccm_with_data.surface_marker_dataframe.loc[mask_indices, batch_columns] = data

CD45 1 							
CD123 1 							.061	rel_error: 0.0005		5.6  14.1  1.7  3.4  -0.0  2.9  				
unstained 1 							rel_error: 0.0005		-0.0  7.3  1.8  2.3  -0.0  9.9  					
CD19 1 							4.956	rel_error: 0.0005		24.5  10.2  1.6  1.0  -0.0  9.6  					
CD56 1 							.677	rel_error: 0.0005		27.9  3.4  1.5  -0.0  -0.0  11.3  				
all 1 							86.479	rel_error: 0.0005		25.2  8.5  1.4  -0.0  -0.0  9.4  				
CD14 1 							
CD16 1 							.277	rel_error: 0.0005		-0.0  10.7  2.1  1.5  -0.0  7.8  				
HLA-DR 1 							02	rel_error: 0.0004		-0.0  7.4  1.8  2.0  -0.0  8.8  					
CD3 1 							1.369	rel_error: 0.0005		-0.0  12.9  1.2  0.7  -0.0  8.1  				
CD45 0 							6.606	rel_error: 0.0005		24.4  8.0  1.2  -0.0  -0.0  9.3  				
CD123 0 							618	rel_error: 0.0005		2.7  6.5  2.5  3.9  -0.0  3.7  					
unstained 0 							rel_error: 0.0005		-0.0  1.1  1.8  4.1  1.8  11.3  				
CD19 0 							9.580	rel_error: 0.0005		26.4  2.6  1.5  3.0  1.3  11.0  				
CD56 0 							8.323	rel_error: 0.0005		29.9  -0.0  1.6 

# Unmix with full set of unmixing channels

In [20]:
unmix_channels_to_use = unmixed_channel_names[:-1] #exclude autofluorescence
# unmix_channels_to_use = unmixed_channel_names #include autofluorescence
print(unmix_channels_to_use)
l1_reg = 7e-1

# Dataframe columns that receive the full-model results
full_model_columns = [name + '_full_model_unmixed' for name in unmix_channels_to_use]

for batch in bsccm_with_data.index_dataframe.batch.unique():

    #demix with measurements specific to each batch
    single_marker_unmix_channel_spectra, single_marker_unmix_channel_brightness, unmix_channel_spectra, unmix_channel_brightness = \
        compute_spectra(bsccm_with_spectra, channel_names, unmixed_channel_names, single_markers, batch=batch)
    
    # The spectra and regularization weights depend only on the batch, so they
    # are built once here rather than once per antibody group.
    spectra = np.stack([unmix_channel_spectra[c] for c in unmix_channels_to_use], axis=0)
    
    #Weight regularization based on projection onto first singular vector
    unmix_spectrum = np.array([unmix_channel_spectra[name] * unmix_channel_brightness[name] for name in unmix_channels_to_use])
    _, _, vh = np.linalg.svd(unmix_spectrum, full_matrices=False)
    first_vec = np.abs(vh[0])
    reweighting = [first_vec @ unmix_channel_spectra[name] / unmix_channel_brightness[name]
                    for name in unmix_channels_to_use]
    
    for antibodies in bsccm_with_data.index_dataframe.antibodies.unique():
        print(antibodies, batch, '\t\t\t\t\t\t\t')
        mask = np.logical_and(bsccm_with_data.index_dataframe.antibodies == antibodies,
              bsccm_with_data.index_dataframe.batch == batch)

        # Fresh copies are passed so every call starts from the same inputs,
        # exactly as when they were rebuilt on each iteration.
        unmixed, background_spectrum = do_factorization(mixed_data[mask], spectra.copy(),
                l1_reg = l1_reg,
                momentum=0.9,
                learning_rate = 1e3,
                background_learning_rate=1e-1,
                reweighting=list(reweighting))
        
        # put results into the dataframe
        # NOTE(review): positions from flatnonzero are used as .loc labels, which
        # assumes surface_marker_dataframe has a 0..N-1 index aligned row-for-row
        # with index_dataframe -- confirm
        mask_indices = np.flatnonzero(mask)
        bsccm_with_data.surface_marker_dataframe.loc[mask_indices, full_model_columns] = unmixed


    # Raise every value below the 0.5th percentile of the nonzero values (this
    # includes the exact zeros) up to that percentile.
    # This shouldn't appreciably change the data but makes it easier to deal with in log space
    mask = bsccm_with_data.index_dataframe.batch == batch
    mask_indices = np.flatnonzero(mask)
    data = bsccm_with_data.surface_marker_dataframe.loc[mask_indices, full_model_columns].to_numpy(copy=True)
    for col_index in range(len(full_model_columns)):
        t = data[:, col_index] # view: edits below land in `data`
        nonzero = t[t != 0]
        nonzero = nonzero[~np.isnan(nonzero)]
        if nonzero.size == 0:
            continue # column is empty / all zero in this batch: no floor to apply
        bottom = np.nanpercentile(nonzero, 0.5)
        t[t<bottom] = bottom # NaN compares False, so missing entries stay NaN

    bsccm_with_data.surface_marker_dataframe.loc[mask_indices, full_model_columns] = data

['CD123/HLA-DR/CD14', 'CD3/CD19/CD56', 'CD45', 'CD16']
CD45 1 							
CD123 1 							071	rel_error: 0.0005		-0.0  5.6  1.7  0.6  -0.0  8.1  					
unstained 1 								rel_error: 0.0005		-0.0  4.5  2.0  0.2  -0.0  9.1  				
CD19 1 							1.871	rel_error: 0.0005		-0.0  4.3  2.0  0.2  -0.0  9.2  				
CD56 1 							.854	rel_error: 0.0005		-0.0  4.4  2.1  0.1  -0.0  9.0  				
all 1 							11.290	rel_error: 0.0005		-0.0  4.2  2.0  0.1  -0.0  9.4  				
CD14 1 							3.445	rel_error: 0.0005		-0.0  4.5  1.6  -0.0  -0.0  9.7  					
CD16 1 							1.457	rel_error: 0.0005		-0.0  4.2  2.0  0.2  -0.0  9.2  				
HLA-DR 1 							340	rel_error: 0.0005		-0.0  6.1  1.9  -0.0  -0.0  8.8  				
CD3 1 							13.263	rel_error: 0.0005		-0.0  4.5  1.9  0.1  -0.0  9.1  					
CD45 0 							1.163	rel_error: 0.0005		-0.0  3.7  2.0  0.3  -0.0  9.1  				
CD123 0 							.838	rel_error: 0.0005		-0.0  1.0  1.9  0.5  -0.0  9.0  				
unstained 0 							rel_error: 0.0005		-0.0  2.1  2.0  0.9  -0.0  8.2  				
CD19 0 							.1

In [None]:
# bsccm_with_data.surface_marker_dataframe.columns

# Data viewer to verify that it worked

In [21]:
from fluorescence_processing.gating_util import ScatterSelectorGating

# Channels shown in the viewer: the columns written by the full-model unmixing.
# To inspect the single-stain fits instead, list the corresponding
# '<marker>_single_antibody_model_unmixed' columns here
# (markers: CD45, autofluor, CD123, CD19, CD56, CD14, CD16, HLA-DR, CD3).
viewer_channel_names = [
    marker_group + '_full_model_unmixed'
    for marker_group in ('CD123/HLA-DR/CD14', 'CD3/CD19/CD56', 'CD45', 'CD16')
]

# For exporting figures
export_dir = '/home/henry/leukosight_data/figures/demixing/'


# For making image montages
def read_image_fn(index):
    """Read the 'dpc' contrast image stored for the cell at `index`."""
    return bsccm_with_data.read_image(index, contrast_type='dpc')


# Left as the last expression so the widget object is the cell's output
ScatterSelectorGating(
    bsccm=bsccm_with_data,
    channel_names=viewer_channel_names,
    num_cols=1,
    export_dir=export_dir,
    read_image_fn=read_image_fn,
)

HBox(children=(Dropdown(description='antibodies:', options=('CD45', 'CD123', 'unstained', 'CD19', 'CD56', 'all…

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

interactive(children=(ToggleButtons(description='Plot_index', options=(), value=None), ToggleButtons(descripti…

HBox(children=(Button(description='Gate selection', style=ButtonStyle()), Button(description='Clear selection'…

HBox(children=(Text(value='name', placeholder=''), Button(description='Save selection', style=ButtonStyle()), …

HBox(children=(ToggleButton(value=False, description='Manual axes'), FloatRangeSlider(value=(0.0, 1.0), contin…

HBox(children=(FloatLogSlider(value=1.0, continuous_update=False, description='Density scale:', max=0.0, min=-…

HBox(children=(Text(value='Export_name.pdf', placeholder=''), Button(description='Export figure', style=Button…

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

HBox(children=(Text(value='Export_name.pdf', placeholder=''), Button(description='Export figure', style=Button…

<fluorescence_processing.gating_util.ScatterSelectorGating at 0x7f828c4ce2b0>

# Save results

In [22]:
# Overwrite the surface marker table of the dataset that was demixed, now
# including the unmixed columns added above. The row index is not written.
dataframe_saving_fullpath = '{}BSCCM_surface_markers.csv'.format(bsccm_with_data.data_root)
surface_markers_to_save = bsccm_with_data.surface_marker_dataframe
surface_markers_to_save.to_csv(dataframe_saving_fullpath, index=False)

# Resave BSCCM-tiny with updated fluorescence

In [23]:
## Keep BSCCM-tiny in sync: rewrite its surface marker table with the newly
## unmixed fluorescence values from the full dataset

if COHERENT:
    tiny_root = str(Path.home()) + '/BSCCM_local/BSCCM-coherent-tiny/'
else:
    tiny_root = str(Path.home()) + '/BSCCM_local/BSCCM-tiny/'
bsccm_tiny = BSCCM(tiny_root)
    
# Rows of the full dataset that make up the tiny dataset
global_indices = bsccm_tiny.index_dataframe['full_dataset_global_index'].to_numpy()
# Explicit copy: the columns assigned below must only change the tiny table,
# never the full dataset's dataframe (and this avoids pandas copy/view warnings).
tiny_data = bsccm_with_data.surface_marker_dataframe.loc[global_indices].copy()

tiny_saving_path = bsccm_tiny.data_root + 'BSCCM_surface_markers.csv'

# Remember where each row came from, then renumber rows 0..n-1 within the tiny dataset
tiny_data['full_dataset_global_index'] = tiny_data['global_index']
tiny_data['global_index'] = np.arange(len(tiny_data))  

tiny_data.to_csv(tiny_saving_path, index=False)

Opening BSCCM (this may take a few seconds)...
BSCCM Opened
