In [1]:
import numpy as np
import pandas as pd
import skimage.filters as filters
from functools import partial, lru_cache
from typing import Union
from pprint import pprint
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

In [2]:
sample_data = pd.read_pickle("./data/sample_sc_data.pkl")
sample_data.head(n=10)

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,-0.27945,-0.707366,0.384317,0.836923,1.096945,0.228808,-0.096544,0.76724,0.820186,0.665852,...,1.435914,-0.992491,2.013223,0.410796,-0.387851,-0.697368,-0.624541,1.979515,-0.26531,0.064496
3,0.659602,0.427319,1.279784,1.255604,1.520442,1.190339,0.864978,0.910992,1.1384,0.634851,...,1.59636,-0.873943,0.947234,0.238758,0.357634,-0.306862,1.06005,-0.236844,-0.26531,0.546826
5,-1.396816,-0.485352,0.625359,0.504764,-0.261098,-0.352752,1.41764,0.616807,0.402,-0.639013,...,0.3296,-0.386987,-0.416999,0.112814,-0.387851,-0.697368,0.031788,-0.732879,-0.26531,0.862344
6,1.799861,1.150422,0.654544,1.012722,0.363497,1.149256,1.256522,0.459767,1.019238,-0.291694,...,1.54304,0.673735,-0.416999,0.917741,0.065323,-0.697368,1.549627,-0.732879,-0.26531,-3.492455
7,1.166063,-0.277376,0.54742,0.54677,-0.232914,-1.405149,-0.732767,0.166671,-0.327599,0.354038,...,0.781131,1.022201,-0.416999,0.666394,-0.387851,-0.697368,1.976239,-0.732879,-0.26531,0.418635
8,0.538385,1.879914,2.596488,1.293312,1.636818,0.586411,1.968399,1.387183,0.742524,2.282121,...,1.374499,-0.369884,-0.416999,0.571801,-0.387851,0.091034,0.750767,1.189648,0.393445,0.740139
9,0.007213,-0.707366,0.508603,-0.242018,0.188294,0.613587,1.63474,1.33473,0.619595,0.483853,...,1.1045,0.155065,-0.416999,0.328468,-0.387851,1.763908,-0.773157,-0.732879,-0.26531,1.037948
10,0.674056,-0.707366,0.673496,0.269582,1.352195,0.318894,-0.732767,1.675753,-1.098947,0.460308,...,0.244038,0.220964,-0.416999,0.996335,-0.387851,1.053502,-0.773157,0.53304,-0.26531,0.547179
11,2.186458,-0.256867,0.9923,-0.456662,-0.931927,1.572883,-0.732767,1.753674,1.312197,1.08353,...,0.721455,-0.992491,-0.416999,1.751948,2.262133,-0.172314,0.336999,0.201111,-0.26531,0.740218
12,0.85437,1.430719,0.534565,-1.49313,1.00186,0.588177,1.335718,1.815472,1.201327,0.885432,...,0.444743,-0.380583,-0.416999,0.612848,-0.387851,-0.697368,-0.57742,1.828483,-0.26531,0.38313


# Binary Encodoing
This strategy uses Otsu's thresholding to define + and - for each cell based on the maximum in-class homogeneity. The input is a matrix of $num\_obs \times num\_features$

In [3]:
def threshold_val(value, threshold):
    return 0 if value <= threshold else 1    
    
def remove_outliers(x, percentiles=[5, 95]):
    a = np.array(x)
    upper_percentile = np.percentile(a, percentiles[1])
    lower_percentile = np.percentile(a, percentiles[0])
    mask = np.logical_and(a < upper_percentile, a > lower_percentile)
    return a[mask]

    
    
def vocab_generator(df: pd.DataFrame, style='discrete'):
    """
    For every gene, generate a 'word' that is the ambiguous gene (e.g. "CD45"), 
    the positive, and the negative genes (e.g. "CD45+" and "CD45-"). 
    
    The "style" parameter is meant to let this be expandable further on.
    
    :param df: The data frame containing the `obs x genes` matrix
    :param style: the style of vocab to generate. Options:
    :yield: the ambiguous, negative, and positive versions of that gene.  
    """
    
    
    if style.upper() == 'DISCRETE':
        for val in iter(df.columns):
            val = val.strip()
            yield val, val + "-", val + "+"
                    
def generate_vocabulary(df, style='discrete'):
    generator = vocab_generator(df=df, style=style)
    vocab = []
    if style.upper() == 'DISCRETE':
        for tup in generator:
            [vocab.append(v) for v in tup]
    return vocab
    
def binary_encode(df, method='otsu', histogram_bins=1000, clip_data=False):
    """
    :param df: Data frame with one row per cell and one column per gene.
    :returns: Encoded table-- a copy of df with each numeric entry replaced with 1 for + and 0 for -
    :returns: Vocabulary-- a comprehensive vocabulary of each gene in the table
    :returns: Gene thresholds-- a dictionary relating {Gene: threshold}
    """
    gene_thresh_dict = {}
    encoded_table = pd.DataFrame(0, index=df.index, columns=df.columns)
    
    # Go through each columns and 
    # TODO: subsample to avoid loading 1 billion cells at once
    for (col_name, col_data) in df.iteritems():
        
        # Remove outliers from data if parameter is passed (makes more of a difference in Otsu's method)
        if clip_data:
            data_values = remove_outliers(col_data.values)
        else:
            data_values = col_data.values
        
        # Determine threshold based on method passed.
        if method.upper() == "OTSU":
            thresh = filters.threshold_otsu(data_values, nbins=histogram_bins)
        elif method.upper() == "MEDIAN":
            thresh = np.median(data_values)
        else:
            raise ValueError("Unexpected value for parameter `method`: %s" % method)
            
        # Set value in threhsold disctionary, encode table.
        gene_thresh_dict[col_name] = thresh
        threshold_partial = partial(threshold_val, threshold=thresh)
        encoded_table.loc[:, col_name] = df.loc[:, col_name].apply(threshold_partial)
    
    vocab = generate_vocabulary(df, style='discrete')
    
    return encoded_table, gene_thresh_dict, vocab
        
        

In [4]:
encoded_table_otsu, thresh_dict_otsu, vocabulary_otsu = binary_encode(sample_data, method='otsu')
encoded_table_median, thresh_dict_median, vocabulary_median = binary_encode(sample_data, method='median')

print()




In [5]:
def write_vocabulary_file(fp, vocab_list, sort=False):
    if sort:
        vocab_list = sorted(vocab_list)
        
    with open(fp, 'w') as output:
        for word in vocab_list:
            output.write(word + "\n")

In [6]:
write_vocabulary_file("vocab.txt", vocabulary_otsu)

In [7]:
@lru_cache(maxsize=1024)
def binary_to_text(value, protein_name):
    return protein_name + "+" if value == 1 else protein_name + "-"
    
def binary_table_to_text(df):
    text_table = pd.DataFrame(0, index=df.index, columns=df.columns)
    for col in df.columns:
        bin2text_partial = partial(binary_to_text, protein_name=col)
        text_table.loc[:, col] = df.loc[:, col].apply(bin2text_partial)
    return text_table
    

In [8]:
encoded_table_otsu

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,0,0,1,1,1,1,0,1,0,1,...,1,0,0,1,0,0,0,1,0,1
3,1,0,1,1,1,1,1,1,0,1,...,1,0,0,1,0,0,1,0,0,1
5,0,0,1,1,0,0,1,1,0,0,...,1,0,0,1,0,0,0,0,0,1
6,1,1,1,1,1,1,1,1,0,0,...,1,1,0,1,0,0,1,0,0,0
7,1,0,1,1,0,0,0,1,0,1,...,1,1,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
129,1,0,1,1,1,1,0,0,0,1,...,1,1,0,0,0,0,0,0,0,1
130,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
131,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
binary_table_to_text(encoded_table_otsu)

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,CD45-,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15+,CD31_PECAM1+,CD8a-,CD182_CXCR2+,IgA-,CD66ace+,...,CD24+,CD38-,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1+,CD56-,CD16+
3,CD45+,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15+,CD31_PECAM1+,CD8a+,CD182_CXCR2+,IgA-,CD66ace+,...,CD24+,CD38-,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4+,CD279_PD1-,CD56-,CD16+
5,CD45-,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15-,CD31_PECAM1-,CD8a+,CD182_CXCR2+,IgA-,CD66ace-,...,CD24+,CD38-,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1-,CD56-,CD16+
6,CD45+,CD196_CCR6+,CD181_CXCR1+,HLA_DR+,CD15+,CD31_PECAM1+,CD8a+,CD182_CXCR2+,IgA-,CD66ace-,...,CD24+,CD38+,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4+,CD279_PD1-,CD56-,CD16-
7,CD45+,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15-,CD31_PECAM1-,CD8a-,CD182_CXCR2+,IgA-,CD66ace+,...,CD24+,CD38+,CD278_ICOS-,CD32+,CD152_CTLA4-,IgM-,CD184_CXCR4+,CD279_PD1-,CD56-,CD16+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,CD45+,CD196_CCR6-,CD181_CXCR1+,HLA_DR-,CD15-,CD31_PECAM1-,CD8a-,CD182_CXCR2+,IgA-,CD66ace-,...,CD24-,CD38-,CD278_ICOS-,CD32-,CD152_CTLA4-,IgM-,CD184_CXCR4+,CD279_PD1-,CD56-,CD16+
129,CD45+,CD196_CCR6-,CD181_CXCR1+,HLA_DR+,CD15+,CD31_PECAM1+,CD8a-,CD182_CXCR2-,IgA-,CD66ace+,...,CD24+,CD38+,CD278_ICOS-,CD32-,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1-,CD56-,CD16+
130,CD45-,CD196_CCR6-,CD181_CXCR1-,HLA_DR-,CD15-,CD31_PECAM1-,CD8a-,CD182_CXCR2-,IgA-,CD66ace-,...,CD24+,CD38-,CD278_ICOS-,CD32-,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1-,CD56-,CD16+
131,CD45-,CD196_CCR6-,CD181_CXCR1-,HLA_DR+,CD15-,CD31_PECAM1-,CD8a-,CD182_CXCR2+,IgA-,CD66ace-,...,CD24-,CD38-,CD278_ICOS-,CD32-,CD152_CTLA4-,IgM-,CD184_CXCR4-,CD279_PD1-,CD56-,CD16+


In [10]:
def create_names(name, n_grades):
    """
    Given a protein number and the number of grades, return a list of [protein::grade]
    Example:
    >>> create_names("CD45", 4)
    ["CD45::0", "CD45::1", "CD45::2", "CD45::3"]
    
    """
    
    grades = [name + "::" + str(grade) for grade in range(n_grades)]
    return grades
        

def graded_encode(df, n_comps=3, n_samples=-1, method='gmm'):
    """
    Gaussian Mixture Model
    Kmeans
    Quantiles = [25, 50, 75]
    """
    
    gene_thresh_dict = {}
    encoded_table = pd.DataFrame(0, index=df.index, columns=df.columns)

    
    for (col_name, col_data) in df.iteritems():
        reshaped_data = col_data.values.reshape(-1, 1)
        gmm = GaussianMixture(n_components=n_comps, covariance_type='full', verbose=0)
        preds = gmm.fit_predict(reshaped_data)
        grades = create_names(col_name, n_comps)
        
        # Find the indices for sorting the models in increasing order (min expr to max expr)
        original_idxs = np.arange(n_comps)
        sorted_idxs = np.argsort(gmm.means_.ravel())
        
      
        # This is a swap dictionary relating the mean to the sorted position (i.e. grade)
        swap_dict = {original: new for original, new in zip(original_idxs.ravel(), sorted_idxs.ravel())}
        
        graded_genes = []
        for i in preds:
            graded_genes.append(swap_dict[i])
            
        encoded_table.loc[:, col_name] = np.asarray(graded_genes)
        
        
    return encoded_table
    

In [11]:
graded_table = graded_encode(sample_data)

In [12]:
@lru_cache(maxsize=1024)
def graded_to_text(value, protein_name):
    return protein_name + "::" + str(value)
    
def graded_table_to_text(df):
    text_table = pd.DataFrame(0, index=df.index, columns=df.columns, dtype=str)
    for col in df.columns:
        graded2text_partial = partial(graded_to_text, protein_name=col)
        text_table.loc[:, col] = df.loc[:, col].apply(graded2text_partial)
    return text_table
    
    

In [13]:
sample_data.head()

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,-0.27945,-0.707366,0.384317,0.836923,1.096945,0.228808,-0.096544,0.76724,0.820186,0.665852,...,1.435914,-0.992491,2.013223,0.410796,-0.387851,-0.697368,-0.624541,1.979515,-0.26531,0.064496
3,0.659602,0.427319,1.279784,1.255604,1.520442,1.190339,0.864978,0.910992,1.1384,0.634851,...,1.59636,-0.873943,0.947234,0.238758,0.357634,-0.306862,1.06005,-0.236844,-0.26531,0.546826
5,-1.396816,-0.485352,0.625359,0.504764,-0.261098,-0.352752,1.41764,0.616807,0.402,-0.639013,...,0.3296,-0.386987,-0.416999,0.112814,-0.387851,-0.697368,0.031788,-0.732879,-0.26531,0.862344
6,1.799861,1.150422,0.654544,1.012722,0.363497,1.149256,1.256522,0.459767,1.019238,-0.291694,...,1.54304,0.673735,-0.416999,0.917741,0.065323,-0.697368,1.549627,-0.732879,-0.26531,-3.492455
7,1.166063,-0.277376,0.54742,0.54677,-0.232914,-1.405149,-0.732767,0.166671,-0.327599,0.354038,...,0.781131,1.022201,-0.416999,0.666394,-0.387851,-0.697368,1.976239,-0.732879,-0.26531,0.418635


In [14]:
graded_table.head()

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,0,2,1,1,1,2,1,2,1,2,...,2,0,1,1,0,2,1,0,0,0
3,0,0,1,1,2,2,1,2,1,2,...,2,1,1,1,1,0,2,2,0,1
5,2,0,1,1,1,0,1,2,1,0,...,1,1,0,1,0,2,1,1,0,1
6,1,0,1,1,1,2,1,2,1,2,...,2,1,0,1,1,2,2,1,0,2
7,1,0,1,1,1,0,0,1,1,2,...,2,2,0,1,0,2,2,1,0,1


In [15]:
graded_table_to_text(graded_table).head()

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,CD45::0,CD196_CCR6::2,CD181_CXCR1::1,HLA_DR::1,CD15::1,CD31_PECAM1::2,CD8a::1,CD182_CXCR2::2,IgA::1,CD66ace::2,...,CD24::2,CD38::0,CD278_ICOS::1,CD32::1,CD152_CTLA4::0,IgM::2,CD184_CXCR4::1,CD279_PD1::0,CD56::0,CD16::0
3,CD45::0,CD196_CCR6::0,CD181_CXCR1::1,HLA_DR::1,CD15::2,CD31_PECAM1::2,CD8a::1,CD182_CXCR2::2,IgA::1,CD66ace::2,...,CD24::2,CD38::1,CD278_ICOS::1,CD32::1,CD152_CTLA4::1,IgM::0,CD184_CXCR4::2,CD279_PD1::2,CD56::0,CD16::1
5,CD45::2,CD196_CCR6::0,CD181_CXCR1::1,HLA_DR::1,CD15::1,CD31_PECAM1::0,CD8a::1,CD182_CXCR2::2,IgA::1,CD66ace::0,...,CD24::1,CD38::1,CD278_ICOS::0,CD32::1,CD152_CTLA4::0,IgM::2,CD184_CXCR4::1,CD279_PD1::1,CD56::0,CD16::1
6,CD45::1,CD196_CCR6::0,CD181_CXCR1::1,HLA_DR::1,CD15::1,CD31_PECAM1::2,CD8a::1,CD182_CXCR2::2,IgA::1,CD66ace::2,...,CD24::2,CD38::1,CD278_ICOS::0,CD32::1,CD152_CTLA4::1,IgM::2,CD184_CXCR4::2,CD279_PD1::1,CD56::0,CD16::2
7,CD45::1,CD196_CCR6::0,CD181_CXCR1::1,HLA_DR::1,CD15::1,CD31_PECAM1::0,CD8a::0,CD182_CXCR2::1,IgA::1,CD66ace::2,...,CD24::2,CD38::2,CD278_ICOS::0,CD32::1,CD152_CTLA4::0,IgM::2,CD184_CXCR4::2,CD279_PD1::1,CD56::0,CD16::1


In [76]:
def create_names(name, n_grades):
    """
    Given a protein number and the number of grades, return a list of [protein::grade]
    Example:
    >>> create_names("CD45", 4)
    ["CD45::0", "CD45::1", "CD45::2", "CD45::3"]
    
    """
    
    grades = [name + "::" + str(grade) for grade in range(n_grades)]
    return grades
        

def cluster_genes_gmm(vals, n_components, col_name):
        gmm = GaussianMixture(n_components=n_components)
        preds = gmm.fit_predict(vals)
        grades = create_names(col_name, n_components)
        
        # Find the indices for sorting the models in increasing order (min expr to max expr)
        original_idxs = np.arange(n_components)
        sorted_idxs = np.argsort(gmm.means_.ravel())
        
      
        # This is a swap dictionary relating the mean to the sorted position (i.e. grade)
        swap_dict = {original: new for original, new in zip(original_idxs.ravel(), sorted_idxs.ravel())}
        
        graded_genes = []
        for i in preds:
            graded_genes.append(swap_dict[i])
        
        return graded_genes
    
def cluster_genes_kmeans(vals, n_components, col_name):
        kmeans = Kmeans(n_components=n_components)
        preds = kmeans.fit_predict(vals)
        grades = create_names(col_name, n_components)
        
        # Find the indices for sorting the models in increasing order (min expr to max expr)
        original_idxs = np.arange(n_components)
        sorted_idxs = np.argsort(kmeans.means_.ravel())
        
      
        # This is a swap dictionary relating the mean to the sorted position (i.e. grade)
        swap_dict = {original: new for original, new in zip(original_idxs.ravel(), sorted_idxs.ravel())}
        
        graded_genes = []
        for i in preds:
            graded_genes.append(swap_dict[i])
        
        return graded_genes
    
def graded_encode(df, n_comps=3, n_samples=-1, method='kmeans'):
    """
    Gaussian Mixture Model
    Kmeans
    Quantiles = [25, 50, 75]
    """
    
    gene_thresh_dict = {}
    encoded_table = pd.DataFrame(0, index=df.index, columns=df.columns)
    
    # Determine clustering method
    cluster_genes = cluster_genes_gmm
    if method.upper == "GMM":
        cluster_genes = cluster_genes_gmm
    if method.upper == "KMEANS":
        cluster_genes = cluster_genes_kmeans
        
    
    for (col_name, col_data) in df.iteritems():
        reshaped_data = col_data.values.reshape(-1, 1)
        
        graded_genes = cluster_genes(reshaped_data, n_comps, col_name)
        
        encoded_table.loc[:, col_name] = np.asarray(graded_genes)
        
        
    return encoded_table

def perentile_encode(df, percentiles):
    """
    Percentiles = [25, 50, 75]
    """
    
    gene_thresh_dict = {}
    encoded_table = pd.DataFrame(0, index=df.index, columns=df.columns)
        
    for (col_name, col_data) in df.iteritems():
        data_percentiles = np.percentile(col_data.values, percentiles)
        binned = np.digitize(col_data.values, data_percentiles)
        print(binned)
        encoded_table.loc[:, col_name] = binned
        
        
    return encoded_table


In [77]:
encoded_gmm = graded_encode(sample_data, method='gmm')
encoded_kmeans = graded_encode(sample_data, method='kmeans')
encoded_percentiles = perentile_encode(sample_data, [25, 50, 75])

[1 2 0 3 3 2 1 2 3 2 1 3 2 3 1 0 3 1 3 0 1 3 0 0 0 1 0 1 2 3 0 1 1 1 2 0 2
 1 0 2 0 2 2 3 0 0 3 0 1 0 1 3 0 0 3 2 2 3 3 1 0 0 1 0 3 1 3 2 3 0 0 3 3 2
 3 1 3 2 1 1 2 1 1 0 2 2 2 3 2 2 1 2 0 3 3 2 1 0 1 2]
[2 2 2 3 2 3 2 2 2 3 2 2 3 3 2 2 3 3 2 2 2 2 2 2 3 3 3 3 3 3 2 2 2 2 2 2 3
 2 2 2 3 2 2 3 2 2 3 3 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2]
[2 3 2 2 2 3 2 2 3 2 0 2 1 2 0 1 3 3 1 3 0 1 0 0 0 1 2 1 3 0 0 1 0 1 0 2 3
 2 2 1 2 2 2 3 3 1 0 2 3 0 3 3 2 3 0 1 1 3 3 0 1 0 3 1 3 2 0 2 3 1 0 2 3 3
 3 1 1 1 0 0 1 0 1 0 1 2 1 1 1 2 3 2 0 0 3 1 3 0 0 2]
[3 3 2 3 2 3 1 2 1 0 3 2 2 2 0 2 0 1 1 2 0 3 0 0 3 3 1 1 3 2 2 0 3 3 1 1 3
 0 2 1 1 3 0 2 2 0 1 3 1 3 3 0 0 3 2 1 2 1 3 0 1 0 3 2 0 2 1 1 3 2 3 2 1 1
 0 2 2 0 0 2 2 1 0 0 2 1 0 1 2 0 3 0 0 1 3 1 3 0 3 1]
[3 3 1 2 1 3 2 3 0 3 0 3 1 0 2 0 0 3 0 3 3 1 1 1 2 1 1 3 1 1 1 0 2 2 0 2 2
 0 3 1 2 3 0 1 1 2 3 1 1 3 3 0 1 1 0 0 3 2 3 0 0 0 3 0 1 2 3 1 1 0 0 2 0 2
 3 0 1 0 2 2 2 2 2 1 3 2 3 2 3 3 3

In [78]:
sample_data.head()

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,-0.27945,-0.707366,0.384317,0.836923,1.096945,0.228808,-0.096544,0.76724,0.820186,0.665852,...,1.435914,-0.992491,2.013223,0.410796,-0.387851,-0.697368,-0.624541,1.979515,-0.26531,0.064496
3,0.659602,0.427319,1.279784,1.255604,1.520442,1.190339,0.864978,0.910992,1.1384,0.634851,...,1.59636,-0.873943,0.947234,0.238758,0.357634,-0.306862,1.06005,-0.236844,-0.26531,0.546826
5,-1.396816,-0.485352,0.625359,0.504764,-0.261098,-0.352752,1.41764,0.616807,0.402,-0.639013,...,0.3296,-0.386987,-0.416999,0.112814,-0.387851,-0.697368,0.031788,-0.732879,-0.26531,0.862344
6,1.799861,1.150422,0.654544,1.012722,0.363497,1.149256,1.256522,0.459767,1.019238,-0.291694,...,1.54304,0.673735,-0.416999,0.917741,0.065323,-0.697368,1.549627,-0.732879,-0.26531,-3.492455
7,1.166063,-0.277376,0.54742,0.54677,-0.232914,-1.405149,-0.732767,0.166671,-0.327599,0.354038,...,0.781131,1.022201,-0.416999,0.666394,-0.387851,-0.697368,1.976239,-0.732879,-0.26531,0.418635


In [41]:
encoded_gmm.head()

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,1,0,1,1,2,1,1,0,1,0,...,1,0,1,0,0,0,1,1,0,1
3,1,1,1,1,0,1,1,0,1,0,...,1,1,1,0,1,1,2,0,0,2
5,0,1,1,1,2,2,1,0,1,2,...,0,1,0,0,0,0,1,2,0,2
6,2,1,1,1,2,1,1,0,1,0,...,1,1,0,0,1,0,2,2,0,0
7,2,1,1,1,2,2,0,2,1,0,...,1,2,0,0,0,0,2,2,0,2


In [42]:
encoded_kmeans.head()

Unnamed: 0,CD45,CD196_CCR6,CD181_CXCR1,HLA_DR,CD15,CD31_PECAM1,CD8a,CD182_CXCR2,IgA,CD66ace,...,CD24,CD38,CD278_ICOS,CD32,CD152_CTLA4,IgM,CD184_CXCR4,CD279_PD1,CD56,CD16
0,1,0,1,1,1,0,1,2,1,1,...,1,0,1,1,0,0,1,2,0,1
3,1,1,1,1,2,0,1,2,1,1,...,1,1,1,1,1,1,2,1,0,2
5,0,1,1,1,1,1,1,2,1,2,...,0,1,0,1,0,0,1,0,0,2
6,2,1,1,1,1,0,1,2,1,1,...,1,1,0,1,1,0,2,0,0,0
7,2,1,1,1,1,1,0,1,1,1,...,1,2,0,1,0,0,2,0,0,2
