In [None]:
#import packages
import numpy as np
import pandas as pd
import pickle, time
import os
from collections import OrderedDict as odict
from functools import reduce
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix,cohen_kappa_score
from bokeh.plotting import figure,output_file,output_notebook,show
import bokeh

In [None]:
# SimHash
class LSH(object):
    """SimHash-style index: binary codes from the sign of random projections.

    Every row of ``data`` is centred by its own mean, projected with a random
    ``(d, hash_length)`` weight matrix and thresholded at zero. Identical codes
    are grouped into bins so that ``query_bins`` can restrict the search to
    bins close to the query code.
    """
    def __init__(self,data,label,hash_length,nnn):
        """
        data: Nxd matrix
        label: N*1 vector
        hash_length: the projection numbers
        nnn: n nearest Neighbors
        """
        self.hash_length=hash_length
        self.data=data-np.mean(data,axis=1,keepdims=True)
        self.label=label
        self.weights=np.random.random((data.shape[1],hash_length))
        self.hashes=(self.data@self.weights)>0
        # A code has hash_length bits, so two codes differ in at most
        # hash_length positions (the previous 2*hash_length was unreachable,
        # which made query(..., not_olap=True) always return 0).
        self.maxl1distance=self.hash_length
        self.nnn=nnn
        self.create_bins()

    def query(self,query_data,nnn,not_olap=False):
        """Return indices of the ``nnn`` stored points closest to ``query_data``.

        query_data: 1-D vector of length d; it is centred by its mean here.
        nnn: number of neighbours, capped at the number of stored points.
        not_olap: when True, return instead the number of stored points whose
            Hamming distance to the query equals ``self.maxl1distance``.
        """
        query_data=query_data-np.mean(query_data)
        query_hash=(query_data@self.weights)>0
        # XOR of boolean codes marks differing bits; the row sum is the
        # Hamming (L1) distance.
        L1_distances=(query_hash^self.hashes).sum(axis=1)
        nnn=min(self.hashes.shape[0],nnn)
        if not_olap:
            return np.sum(L1_distances==self.maxl1distance)

        return L1_distances.argsort()[:nnn]

    def create_bins(self):
        """Group points with identical codes into bins (no-op if already done).

        Sets ``bins`` (unique codes), ``num_bins``, ``binstopoints``
        (bin index -> array of point indices), ``pointstobins``
        (point index -> bin index) and ``timetoindex`` (seconds spent here).
        """
        if hasattr(self,'bins'):
            return
        start=time.time()
        # return_inverse yields each row's bin index directly, replacing the
        # per-bin scan over all hashes.
        self.bins,assignment=np.unique(self.hashes,axis=0,return_inverse=True)
        # Some NumPy releases return the inverse with an extra axis when
        # `axis` is given; flatten so it is always one entry per point.
        assignment=np.asarray(assignment).reshape(-1)
        self.num_bins=self.bins.shape[0]
        self.binstopoints={bin_idx:np.flatnonzero(assignment==bin_idx) for bin_idx in range(self.num_bins)}
        self.pointstobins={point:int(_bin) for point,_bin in enumerate(assignment)}
        self.timetoindex=time.time()-start

    def query_bins(self,query_data,search_radius=1,order=True):
        """Return the points stored in bins within ``search_radius`` of the query.

        query_data: 1-D vector of length d; it is centred by its mean here.
        search_radius: maximum Hamming distance between the query code and a
            bin code for that bin to be searched.
        order: when True, sort the result by Hamming distance to the query;
            otherwise the indices are returned in ascending index order.

        Returns an integer index array, empty when no bin is within range.
        Raises ValueError if the bins have not been created.
        """
        if not hasattr(self,'bins'):
            raise ValueError('Bins for model not created')
        query_data=query_data-np.mean(query_data)
        query_hash = (query_data@self.weights)>0
        valid_bins=np.flatnonzero((query_hash[None,:]^self.bins).sum(axis=1)<=search_radius)
        if valid_bins.size==0:
            # Nothing close enough; reduce() over an empty list used to raise.
            return np.array([],dtype=np.intp)
        # Bins hold different numbers of points, so concatenate the index
        # arrays instead of stacking them into one (ragged) array.
        all_points=np.unique(np.concatenate([self.binstopoints[idx] for idx in valid_bins]))
        if order:
            l1distances=(query_hash^self.hashes[all_points,:]).sum(axis=1)
            all_points=all_points[l1distances.argsort()]
        return all_points

In [None]:
#FlyHash
class flylsh(LSH):
    """FlyHash index: sparse binary projections followed by winner-take-all.

    Each of the ``embedding_size`` projection units sums a random subset of the
    input dimensions; the ``hash_length`` most active units of every point form
    its binary code. A second, ``hash_length``-bit "low-dimensional" code
    (sign of the summed activation of K consecutive units) is used for binning.
    """
    def __init__(self,data,label,hash_length,nnn,sampling_ratio,embedding_size):
        """
        data: Nxd matrix
        label: N*1 vector
        hash_length: scalar
        nnn: n nearest neighbors
        sampling_ratio: fraction of input dims to sample from when producing a hash
        embedding_size: dimensionality of projection space, m
        Note that in Flylsh, the hash length and embedding_size are NOT the same
        whereas in usual LSH they are

        Raises ValueError when embedding_size is not a positive multiple of
        hash_length (the low-dimensional codes group K=m/hash_length units).
        """
        if hash_length<=0 or embedding_size%hash_length!=0:
            # reshape((-1,hash_length,K)) below would otherwise either fail or
            # silently produce a wrong number of rows.
            raise ValueError('embedding_size must be a positive multiple of hash_length')
        self.hash_length=hash_length
        self.embedding_size=embedding_size
        K=embedding_size//hash_length
        self.data=data-np.mean(data,axis=1,keepdims=True)
        self.label=label
        self.nnn=nnn

        # NOTE(review): when int(sampling_ratio*d) is 0 the slice [-0:] below
        # keeps every input dimension instead of none - confirm callers never
        # pass such a small ratio.
        num_projections=int(sampling_ratio*data.shape[1])
        weights=np.random.random((data.shape[1],embedding_size))
        yindices=np.arange(weights.shape[1])[None,:]
        xindices=weights.argsort(axis=0)[-num_projections:,:]
        # np.bool was removed from NumPy; the builtin bool is the same dtype.
        self.weights=np.zeros_like(weights,dtype=bool)
        self.weights[xindices,yindices]= True#sparse projection vectors

        all_activations=(self.data@self.weights)
        xindices=np.arange(data.shape[0])[:,None]
        yindices=all_activations.argsort(axis=1)[:,-hash_length:]
        self.hashes=np.zeros_like(all_activations,dtype=bool)
        self.hashes[xindices,yindices]=True #choose topk activations
        self.dense_activations=all_activations
        self.sparse_activations=self.hashes.astype(np.float32)*all_activations #elementwise product
        # Two codes with hash_length set bits each differ in at most
        # 2*hash_length positions (no shared active unit).
        self.maxl1distance=2*self.hash_length
        self.lowd_hashes=all_activations.reshape((-1,hash_length,K)).sum(axis=-1) > 0

        self.create_bins()
        self.create_lowd_bins()

    def create_lowd_bins(self):
        """Group points with identical low-dimensional codes into bins.

        Sets ``lowd_bins``, ``lowd_binstopoints`` (bin index -> point indices)
        and ``lowd_pointstobins`` (point index -> bin index), and overwrites
        ``timetoindex`` with the seconds spent here.
        """
        start=time.time()
        # return_inverse yields each row's bin index directly.
        self.lowd_bins,assignment=np.unique(self.lowd_hashes,axis=0,return_inverse=True)
        # Flatten: some NumPy releases add an axis to the inverse when `axis` is given.
        assignment=np.asarray(assignment).reshape(-1)
        self.lowd_binstopoints={bin_idx:np.flatnonzero(assignment==bin_idx) for bin_idx in range(self.lowd_bins.shape[0])}
        self.lowd_pointstobins={point:int(_bin) for point,_bin in enumerate(assignment)}
        self.timetoindex=time.time()-start

    def query_lowd_bins(self,query_data,search_radius=1,order=True):
        """Return the points stored in low-dimensional bins near the query.

        query_data: 1-D vector of length d; it is centred by its mean here.
        search_radius: maximum Hamming distance between the query's
            low-dimensional code and a bin code for that bin to be searched.
        order: when True, sort the result by Hamming distance between the
            full winner-take-all codes of the query and of each point.

        Returns an integer index array, empty when no bin is within range.
        Raises ValueError if the low-dimensional bins have not been created.
        """
        if not hasattr(self,'lowd_bins'):
            raise ValueError('low dimensional bins for model not created')
        query_data=query_data-np.mean(query_data)
        query_activation=(query_data@self.weights)
        indices=query_activation.argsort()[-self.hash_length:]
        query_hashes=np.zeros_like(query_activation,dtype=bool)
        query_hashes[indices]=True
        # Same grouping as lowd_hashes in __init__, applied to a single vector.
        K=self.embedding_size//self.hash_length
        query_bin=query_activation.reshape((self.hash_length,K)).sum(axis=-1) > 0
        valid_bins=np.flatnonzero((query_bin[None,:]^self.lowd_bins).sum(axis=1)<=search_radius)
        if valid_bins.size==0:
            return np.array([],dtype=np.intp)
        # Bins hold different numbers of points: concatenate, then deduplicate.
        all_points=np.unique(np.concatenate([self.lowd_binstopoints[idx] for idx in valid_bins]))
        if order:
            l1distances=(query_hashes^self.hashes[all_points,:]).sum(axis=1)
            all_points=all_points[l1distances.argsort()]
        return all_points

    def query(self,query_data,nnn,not_olap=False):
        """Return indices of the ``nnn`` stored points closest to ``query_data``.

        The query is centred, projected and reduced to its ``hash_length`` most
        active units; distance is the Hamming distance between codes.
        With ``not_olap=True`` the number of stored points at distance
        ``self.maxl1distance`` is returned instead.
        """
        query_data=query_data-np.mean(query_data)
        query_activation=(query_data@self.weights)
        indices=query_activation.argsort()[-self.hash_length:]
        query_hash=np.zeros_like(query_activation,dtype=bool)
        query_hash[indices]=True

        L1_distances=(query_hash^self.hashes).sum(axis=1)
        nnn=min(self.hashes.shape[0],nnn)
        if not_olap:
            return np.sum(L1_distances==self.maxl1distance)

        return L1_distances.argsort()[:nnn]

In [None]:
# DenseFly
class denseflylsh(flylsh):
    """DenseFly index: sparse binary projections thresholded into dense codes.

    Unlike ``flylsh`` every projection unit contributes one bit
    (activation >= ``threshold``), so codes are ``embedding_size`` bits wide.
    Only the low-dimensional bins are built; ``query_bins`` therefore raises
    until ``create_bins`` is called explicitly.
    """
    # Activation level at or above which a unit's bit is set; shared by
    # indexing and querying so both sides hash exact-zero activations alike.
    threshold=0

    def __init__(self,data,label,hash_length,nnn,sampling_ratio,embedding_size):
        """
        data: Nxd matrix
        label: N*1 vector
        hash_length: length of the low-dimensional code used for binning
        nnn: n nearest neighbors
        sampling_ratio: probability that an input dim feeds a projection unit
        embedding_size: dimensionality of projection space, m

        Raises ValueError when embedding_size is not a positive multiple of
        hash_length.
        """
        if hash_length<=0 or embedding_size%hash_length!=0:
            # reshape((-1,hash_length,K)) below needs m == hash_length*K.
            raise ValueError('embedding_size must be a positive multiple of hash_length')
        self.hash_length=hash_length
        self.embedding_size=embedding_size
        K=embedding_size//hash_length
        self.data=data-np.mean(data,axis=1,keepdims=True)
        self.label=label
        self.group_counts=np.unique(self.label).shape[0]
        self.nnn=nnn

        weights=np.random.random((data.shape[1],embedding_size))
        self.weights=(weights>1-sampling_ratio) #sparse projection vectors
        all_activations=(self.data@self.weights)
        self.hashes=(all_activations>=self.threshold) #one bit per projection unit
        self.dense_activations=all_activations
        self.sparse_activations=self.hashes.astype(np.float32)*all_activations #elementwise product
        # Codes are embedding_size bits wide, so that is the largest possible
        # Hamming distance (2*hash_length was a leftover from flylsh).
        self.maxl1distance=self.embedding_size
        self.lowd_hashes=all_activations.reshape((-1,hash_length,K)).sum(axis=-1) > 0

        self.create_lowd_bins()


    def query_lowd_bins(self,query_data,search_radius=1,order=True):
        """Return the points stored in low-dimensional bins near the query.

        query_data: 1-D vector of length d; it is centred by its mean here.
        search_radius: maximum Hamming distance between the query's
            low-dimensional code and a bin code for that bin to be searched.
        order: when True, sort the result by Hamming distance between the
            dense codes of the query and of each point.

        Returns an integer index array, empty when no bin is within range.
        Raises ValueError if the low-dimensional bins have not been created.
        """
        if not hasattr(self,'lowd_bins'):
            raise ValueError('low dimensional bins for model not created')
        query_data=query_data-np.mean(query_data)
        query_activation=(query_data@self.weights)
        query_hashes=query_activation>=self.threshold
        # Same grouping as lowd_hashes in __init__, applied to a single vector.
        K=self.embedding_size//self.hash_length
        query_bin=query_activation.reshape((self.hash_length,K)).sum(axis=-1) > 0
        valid_bins=np.flatnonzero((query_bin[None,:]^self.lowd_bins).sum(axis=1)<=search_radius)
        if valid_bins.size==0:
            return np.array([],dtype=np.intp)
        # Bins hold different numbers of points: concatenate, then deduplicate.
        all_points=np.unique(np.concatenate([self.lowd_binstopoints[idx] for idx in valid_bins]))
        if order:
            l1distances=(query_hashes^self.hashes[all_points,:]).sum(axis=1)
            all_points=all_points[l1distances.argsort()]
        return all_points

    def query(self,query_data,nnn,not_olap=False):
        """Return indices of the ``nnn`` stored points closest to ``query_data``.

        The query is centred, projected and thresholded exactly like the
        indexed data; distance is the Hamming distance between dense codes.
        With ``not_olap=True`` the number of stored points at distance
        ``self.maxl1distance`` is returned instead.
        """
        query_data=query_data-np.mean(query_data)
        query_activation=(query_data@self.weights)
        # Use the indexing threshold (>=) so query and stored codes agree.
        query_hash=query_activation>=self.threshold

        L1_distances=(query_hash^self.hashes).sum(axis=1)
        nnn=min(self.hashes.shape[0],nnn)
        if not_olap:
            return np.sum(L1_distances==self.maxl1distance)

        return L1_distances.argsort()[:nnn]

In [None]:
def compute_CKS(matrix):
    """Cohen's kappa score computed from a square confusion matrix.

    matrix: 2-D NumPy array of counts (rows and columns index the same labels).
    Returns (po - pe) / (1 - pe), where po is the observed agreement (share of
    counts on the diagonal) and pe the agreement expected from the row and
    column totals.
    """
    total = np.sum(matrix)
    observed = matrix.trace()/total
    column_totals = np.sum(matrix,axis=0)
    row_totals = np.sum(matrix,axis=1)
    expected = sum(column_totals*row_totals)/total/total
    return (observed-expected)/(1-expected)
    
def test_batcheffect(model_1,model_2):
    """Accumulate a confusion matrix for querying model_1 cells against model_2.

    Five times, a fifth of model_1's points is drawn at random (with
    replacement, the np.random.choice default). Each drawn point is looked up
    in model_2 and the labels of its ``model_2.nnn`` nearest neighbours are
    compared with the point's own label.

    model_1, model_2: index objects exposing ``data``, ``label``, ``nnn`` and
        ``query(vector, nnn)``.
    Returns a square integer matrix (rows: true label from model_1, columns:
    neighbour label from model_2) over the sorted union of both label sets.
    """
    # query cells from batch 1 with batch 2 data as the reference
    # Use the labels of both models: with model_2's labels alone, a query label
    # that does not occur in model_2 makes confusion_matrix raise.
    all_labels=np.union1d(np.unique(model_1.label),np.unique(model_2.label))
    # Start from a zero matrix so the result is an array even if nothing is queried.
    cm = np.zeros((all_labels.shape[0],all_labels.shape[0]),dtype=np.int64)
    n_points = model_1.data.shape[0]
    for i in range(5):
        query_indices = np.random.choice(n_points,n_points//5)
        for index in query_indices:
            NNs = model_2.query(model_1.data[index],model_2.nnn)
            pre_label= model_2.label[NNs]
            true_label = np.array([model_1.label[index]]*pre_label.shape[0])
            # `labels` is keyword-only in current scikit-learn releases.
            cm += confusion_matrix(true_label,pre_label,labels=all_labels)
    return cm

In [None]:
# Load the space-separated table and split it by the "Batch" column into two
# batches, each as a feature matrix plus a label vector.
# NOTE(review): the slicing assumes the last two columns are non-feature
# columns and that the very last one holds the label - confirm against the file.
path = "../data/Batch_Data/Batch_Data.txt"
data=pd.read_table(path,sep=' ')

# Boolean row masks, one per batch.
index_b1=data["Batch"]=="Batch1"
index_b2=data["Batch"]=="Batch2"

batch1_rows=data[index_b1]
batch2_rows=data[index_b2]

# Features: every column except the last two; labels: the last column.
data_1 = np.array(batch1_rows.iloc[:,:-2])
label_1 = np.array(batch1_rows.iloc[:,-1])
data_2 = np.array(batch2_rows.iloc[:,:-2])
label_2 = np.array(batch2_rows.iloc[:,-1])

In [None]:
#setting parameters
hash_length=64 # hash length can be selected from [64,128,256,512,1024]
nnn=10
sampling_ratio=0.1 # fraction of input dims feeding each projection unit
embedding_size=20*hash_length # projection space is 20x wider than the hash
# NOTE(review): no random seed is set, so the projections, the sampled query
# cells and therefore the scores below change from run to run.

# Each pair of kappa scores is printed explicitly: a bare expression in the
# middle of a cell is evaluated and thrown away, only the last one is shown.

#construct DenseFly models
print('DenseFly')
densemodel_1=denseflylsh(data_1,label_1,hash_length,nnn,sampling_ratio,embedding_size)
densemodel_2=denseflylsh(data_2,label_2,hash_length,nnn,sampling_ratio,embedding_size)
#get confusion_matrix and compute cohen kappa score
cm_1to2 = test_batcheffect(densemodel_1,densemodel_2)
cm_2to1 = test_batcheffect(densemodel_2,densemodel_1)
print(compute_CKS(cm_1to2),compute_CKS(cm_2to1))

#construct FlyHash models
print('FlyHash')
flymodel_1=flylsh(data_1,label_1,hash_length,nnn,sampling_ratio,embedding_size)
flymodel_2=flylsh(data_2,label_2,hash_length,nnn,sampling_ratio,embedding_size)
#get confusion_matrix and compute cohen kappa score
cm_1to2 = test_batcheffect(flymodel_1,flymodel_2)
cm_2to1 = test_batcheffect(flymodel_2,flymodel_1)
print(compute_CKS(cm_1to2),compute_CKS(cm_2to1))

#construct SimHash models
print('SimHash')
lshmodel_1=LSH(data_1,label_1,hash_length,nnn)
lshmodel_2=LSH(data_2,label_2,hash_length,nnn)
#get confusion_matrix and compute cohen kappa score
cm_1to2 = test_batcheffect(lshmodel_1,lshmodel_2)
cm_2to1 = test_batcheffect(lshmodel_2,lshmodel_1)
print(compute_CKS(cm_1to2),compute_CKS(cm_2to1))