In [10]:
import numpy as np
import pandas as pd
import skimage.filters as filters
from functools import partial, lru_cache
from typing import Union
from pprint import pprint


In [2]:
# Load the sample single-cell table (one row per cell, one column per marker).
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
sample_data = pd.read_pickle("./data/sample_sc_data.pkl")
# Preview the first 10 rows.
sample_data.head(n=10)

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,-0.27945,-0.707366,0.384317,0.836923,1.096945,0.228808,-0.096544,0.76724,0.820186,0.665852,...,1.435914,-0.992491,2.013223,0.410796,-0.387851,-0.697368,-0.624541,1.979515,-0.26531,0.064496
3,0.659602,0.427319,1.279784,1.255604,1.520442,1.190339,0.864978,0.910992,1.1384,0.634851,...,1.59636,-0.873943,0.947234,0.238758,0.357634,-0.306862,1.06005,-0.236844,-0.26531,0.546826
5,-1.396816,-0.485352,0.625359,0.504764,-0.261098,-0.352752,1.41764,0.616807,0.402,-0.639013,...,0.3296,-0.386987,-0.416999,0.112814,-0.387851,-0.697368,0.031788,-0.732879,-0.26531,0.862344
6,1.799861,1.150422,0.654544,1.012722,0.363497,1.149256,1.256522,0.459767,1.019238,-0.291694,...,1.54304,0.673735,-0.416999,0.917741,0.065323,-0.697368,1.549627,-0.732879,-0.26531,-3.492455
7,1.166063,-0.277376,0.54742,0.54677,-0.232914,-1.405149,-0.732767,0.166671,-0.327599,0.354038,...,0.781131,1.022201,-0.416999,0.666394,-0.387851,-0.697368,1.976239,-0.732879,-0.26531,0.418635
8,0.538385,1.879914,2.596488,1.293312,1.636818,0.586411,1.968399,1.387183,0.742524,2.282121,...,1.374499,-0.369884,-0.416999,0.571801,-0.387851,0.091034,0.750767,1.189648,0.393445,0.740139
9,0.007213,-0.707366,0.508603,-0.242018,0.188294,0.613587,1.63474,1.33473,0.619595,0.483853,...,1.1045,0.155065,-0.416999,0.328468,-0.387851,1.763908,-0.773157,-0.732879,-0.26531,1.037948
10,0.674056,-0.707366,0.673496,0.269582,1.352195,0.318894,-0.732767,1.675753,-1.098947,0.460308,...,0.244038,0.220964,-0.416999,0.996335,-0.387851,1.053502,-0.773157,0.53304,-0.26531,0.547179
11,2.186458,-0.256867,0.9923,-0.456662,-0.931927,1.572883,-0.732767,1.753674,1.312197,1.08353,...,0.721455,-0.992491,-0.416999,1.751948,2.262133,-0.172314,0.336999,0.201111,-0.26531,0.740218
12,0.85437,1.430719,0.534565,-1.49313,1.00186,0.588177,1.335718,1.815472,1.201327,0.885432,...,0.444743,-0.380583,-0.416999,0.612848,-0.387851,-0.697368,-0.57742,1.828483,-0.26531,0.38313


# Binary Encoding
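This strategy uses Otsu's thresholding to label each marker in each cell as positive (+) or negative (-): for every marker, the threshold is chosen to maximize in-class homogeneity (i.e. minimize the intra-class variance) of the two resulting groups. The input is a matrix of shape $num\_obs \times num\_features$.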

In [3]:

def threshold_val(value, threshold):
    """
    Binarize a single value against a threshold.

    :param value: The value to encode.
    :param threshold: The cut-off to compare against.
    :returns: 0 when `value <= threshold`, otherwise 1.
    """
    # Guard clause: anything at or below the cut-off is the negative class.
    if value <= threshold:
        return 0
    return 1
    

def vocab_generator(df: pd.DataFrame, style='discrete'):
    """
    Lazily produce the vocabulary 'words' for every column (gene) of a data frame.

    For each gene three words are produced: the ambiguous gene name (e.g. "CD45"),
    the negative form (e.g. "CD45-") and the positive form (e.g. "CD45+").
    Column names are stripped of surrounding whitespace first.

    The "style" parameter is a hook for adding further vocabulary styles later.

    :param df: The data frame containing the `obs x genes` matrix
    :param style: the style of vocab to generate (case-insensitive). Options:
        'discrete' -- the only style currently implemented; any other value
        yields nothing.
    :yield: a tuple of the ambiguous, negative, and positive versions of each gene.
    """
    # Styles other than 'discrete' are not implemented: end the generator empty.
    if style.upper() != 'DISCRETE':
        return

    for column_name in df.columns:
        gene = column_name.strip()
        yield gene, gene + "-", gene + "+"
                    
def generate_vocabulary(df, style='discrete'):
    """
    Build the flat vocabulary list for a data frame.

    Flattens the tuples produced by `vocab_generator` into a single list, keeping
    the column order of `df` and, within each gene, the order the generator yields.

    :param df: The data frame containing the `obs x genes` matrix
    :param style: the vocabulary style (case-insensitive); only 'discrete' produces
        words, any other value returns an empty list.
    :returns: list of vocabulary words
    """
    vocab = []
    if style.upper() == 'DISCRETE':
        for words in vocab_generator(df=df, style=style):
            # extend() instead of a list comprehension used only for its side effects
            vocab.extend(words)
    return vocab
    
def binary_encode_otsu(df, histogram_bins=1000):
    """
    Binarize every column of a table using a per-column Otsu threshold.

    :param df: Data frame with one row per cell and one column per gene.
    :param histogram_bins: Number of histogram bins handed to
        `filters.threshold_otsu` (its `nbins` argument).
    :returns: A 3-tuple, in this order:
        1. Encoded table -- a new frame with the index and columns of `df`, where each
           entry is 0 (-) if the value is <= the gene's threshold and 1 (+) otherwise.
        2. Gene thresholds -- a dictionary relating {Gene: threshold}
        3. Vocabulary -- the list of vocabulary words for every gene in the table
    """
    gene_thresh_dict = {}
    encoded_table = pd.DataFrame(0, index=df.index, columns=df.columns)

    # TODO: subsample to avoid loading 1 billion cells at once
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for col_name, col_data in df.items():
        thresh = filters.threshold_otsu(col_data.values, nbins=histogram_bins)
        gene_thresh_dict[col_name] = thresh
        # Vectorized equivalent of `0 if value <= thresh else 1` for the whole column;
        # negating `<=` (rather than using `>`) keeps the same result for NaN entries.
        encoded_table.loc[:, col_name] = (~(col_data <= thresh)).astype("int64")

    vocab = generate_vocabulary(df, style='discrete')

    return encoded_table, gene_thresh_dict, vocab
        
        

In [4]:
# Unpack in the order the function returns: 0/1 encoded table, {gene: threshold} dict, vocabulary list.
encoded_table, thresh_dict, vocabulary = binary_encode_otsu(sample_data)

In [5]:
# Show the first rows of the raw (un-encoded) values for comparison with the encoded table.
sample_data.head()

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,-0.27945,-0.707366,0.384317,0.836923,1.096945,0.228808,-0.096544,0.76724,0.820186,0.665852,...,1.435914,-0.992491,2.013223,0.410796,-0.387851,-0.697368,-0.624541,1.979515,-0.26531,0.064496
3,0.659602,0.427319,1.279784,1.255604,1.520442,1.190339,0.864978,0.910992,1.1384,0.634851,...,1.59636,-0.873943,0.947234,0.238758,0.357634,-0.306862,1.06005,-0.236844,-0.26531,0.546826
5,-1.396816,-0.485352,0.625359,0.504764,-0.261098,-0.352752,1.41764,0.616807,0.402,-0.639013,...,0.3296,-0.386987,-0.416999,0.112814,-0.387851,-0.697368,0.031788,-0.732879,-0.26531,0.862344
6,1.799861,1.150422,0.654544,1.012722,0.363497,1.149256,1.256522,0.459767,1.019238,-0.291694,...,1.54304,0.673735,-0.416999,0.917741,0.065323,-0.697368,1.549627,-0.732879,-0.26531,-3.492455
7,1.166063,-0.277376,0.54742,0.54677,-0.232914,-1.405149,-0.732767,0.166671,-0.327599,0.354038,...,0.781131,1.022201,-0.416999,0.666394,-0.387851,-0.697368,1.976239,-0.732879,-0.26531,0.418635


In [6]:
# Show the first rows of the 0/1 encoding produced by binary_encode_otsu.
encoded_table.head()

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,0,0,1,1,1,1,0,1,0,1,...,1,0,0,1,0,0,0,1,0,1
3,1,0,1,1,1,1,1,1,0,1,...,1,0,0,1,0,0,1,0,0,1
5,0,0,1,1,0,0,1,1,0,0,...,1,0,0,1,0,0,0,0,0,1
6,1,1,1,1,1,1,1,1,0,0,...,1,1,0,1,0,0,1,0,0,0
7,1,0,1,1,0,0,0,1,0,1,...,1,1,0,1,0,0,1,0,0,1


In [7]:
def write_vocabulary_file(fp, vocab_list, sort=False):
    """
    Write a vocabulary to a text file, one word per line.

    :param fp: Path of the output file; an existing file is overwritten.
    :param vocab_list: Iterable of vocabulary words (strings).
    :param sort: If True, the words are written in sorted order. The caller's
        list is left untouched because `sorted` builds a new list.
    """
    if sort:
        vocab_list = sorted(vocab_list)

    # Explicit UTF-8 so the file content does not depend on the platform's default encoding.
    with open(fp, 'w', encoding='utf-8') as output:
        output.writelines(word + "\n" for word in vocab_list)

In [8]:
# Persist the vocabulary, one word per line, in generation order (sort defaults to False).
write_vocabulary_file("vocab.txt", vocabulary)

In [22]:
@lru_cache(maxsize=1024)
def binary_to_text(value, protein_name):
    """
    Turn one binary flag into its text token.

    Results are memoized per (value, protein_name) combination.

    :param value: The encoded value; exactly 1 means positive, anything else negative.
    :param protein_name: The marker name used as the token prefix.
    :returns: `protein_name` followed by "+" when `value == 1`, otherwise by "-".
    """
    if value == 1:
        suffix = "+"
    else:
        suffix = "-"
    return protein_name + suffix
    
def binary_table_to_text(df):
    """
    Convert a 0/1 encoded table into a table of text tokens.

    Every entry equal to 1 becomes "<column>+"; every other entry becomes "<column>-".
    The input frame is not modified.

    :param df: Binary-encoded data frame (one column per marker).
    :returns: A new data frame with the same index and columns as `df`, holding
        the text token for each entry.
    """
    # Build each string column directly instead of pre-filling an integer frame and
    # overwriting it with strings (pandas deprecates that incompatible-dtype assignment).
    text_columns = {}
    for col in df.columns:
        tokens = df[col].eq(1).map({True: f"{col}+", False: f"{col}-"})
        # Drop the Series index; the frame constructor below re-attaches df.index as-is.
        text_columns[col] = tokens.to_numpy()
    return pd.DataFrame(text_columns, index=df.index, columns=df.columns)
    

In [19]:
# Display the 0/1 encoded table (pandas truncates the rendered rows and columns).
encoded_table

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,0,0,1,1,1,1,0,1,0,1,...,1,0,0,1,0,0,0,1,0,1
3,1,0,1,1,1,1,1,1,0,1,...,1,0,0,1,0,0,1,0,0,1
5,0,0,1,1,0,0,1,1,0,0,...,1,0,0,1,0,0,0,0,0,1
6,1,1,1,1,1,1,1,1,0,0,...,1,1,0,1,0,0,1,0,0,0
7,1,0,1,1,0,0,0,1,0,1,...,1,1,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
129,1,0,1,1,1,1,0,0,0,1,...,1,1,0,0,0,0,0,0,0,1
130,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
131,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
# Convert the 0/1 table into "<marker>+" / "<marker>-" tokens and display the result.
binary_table_to_text(encoded_table)

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,CD45-,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15+,CD31_PECAM1+,CD8a-,CD182_CXCR2+,IgA-,CD66ace+,...,CD24+,CD38-,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1+,CD56-,CD16+
3,CD45+,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15+,CD31_PECAM1+,CD8a+,CD182_CXCR2+,IgA-,CD66ace+,...,CD24+,CD38-,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4+,CD279_PD1-,CD56-,CD16+
5,CD45-,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15-,CD31_PECAM1-,CD8a+,CD182_CXCR2+,IgA-,CD66ace-,...,CD24+,CD38-,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1-,CD56-,CD16+
6,CD45+,CD196_CCR6+,CD181_CXCR1+,HLA_DR+,CD15+,CD31_PECAM1+,CD8a+,CD182_CXCR2+,IgA-,CD66ace-,...,CD24+,CD38+,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4+,CD279_PD1-,CD56-,CD16-
7,CD45+,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15-,CD31_PECAM1-,CD8a-,CD182_CXCR2+,IgA-,CD66ace+,...,CD24+,CD38+,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4+,CD279_PD1-,CD56-,CD16+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,CD45+,CD196_CCR6-,CD181_CXCR1+,HLA_DR-,CD15-,CD31_PECAM1-,CD8a-,CD182_CXCR2+,IgA-,CD66ace-,...,CD24-,CD38-,CD278_ICOS-,CD32-,CD152_CTLA4-,IgM-,CD184_CXCR4+,CD279_PD1-,CD56-,CD16+
129,CD45+,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15+,CD31_PECAM1+,CD8a-,CD182_CXCR2-,IgA-,CD66ace+,...,CD24+,CD38+,CD278_ICOS-,CD32-,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1-,CD56-,CD16+
130,CD45-,CD196_CCR6-,CD181_CXCR1-,HLA_DR-,CD15-,CD31_PECAM1-,CD8a-,CD182_CXCR2-,IgA-,CD66ace-,...,CD24+,CD38-,CD278_ICOS-,CD32-,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1-,CD56-,CD16+
131,CD45-,CD196_CCR6-,CD181_CXCR1-,HLA_DR+,CD15-,CD31_PECAM1-,CD8a-,CD182_CXCR2+,IgA-,CD66ace-,...,CD24-,CD38-,CD278_ICOS-,CD32-,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1-,CD56-,CD16+
