In [None]:
import pandas as pd
import numpy as np
import os
import warnings
import umap
from scipy import stats

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

import plotly.express as px

warnings.filterwarnings("ignore")

In [None]:
# Load the gene-exclusion run's predicted embeddings (tab-separated table).
# The plotting cells below read the umap_0 / umap_1 and label columns from it.
emb_df = pd.read_csv(
    "/home/ubuntu/large-bascivi/exp_logs/v6/gene_exclusion/pred_embeddings.tsv",
    delimiter="\t",
)



In [None]:
# UMAP scatter of the predicted embeddings, coloured by annotated cell type.
fig = px.scatter(emb_df, x="umap_0", y="umap_1", color="standard_true_celltype", width=1000, height=800)
fig.update_traces(marker=dict(size=2, opacity=0.5,))

# Save a standalone interactive copy next to the embeddings file.
fig.write_html("/home/ubuntu/large-bascivi/exp_logs/v6/gene_exclusion/embeddings_celltype.html")

# Jupyter only renders the value of a cell's *last* expression, so the figure
# has to come after write_html() (which returns None) to be displayed inline.
fig

In [None]:
# UMAP scatter of the predicted embeddings, coloured by source study.
fig = px.scatter(emb_df, x="umap_0", y="umap_1", color="study_name_display", width=1000, height=800)
fig.update_traces(marker=dict(size=2, opacity=0.5,))

# Save a standalone interactive copy next to the embeddings file.
fig.write_html("/home/ubuntu/large-bascivi/exp_logs/v6/gene_exclusion/embeddings_study.html")

# Jupyter only renders the value of a cell's *last* expression, so the figure
# has to come after write_html() (which returns None) to be displayed inline.
fig

In [None]:
# BAScVI Models
#
# Collect every experiment folder under DIR that contains a
# 'pred_embeddings.tsv' file.  Produces:
#   exps  - experiment folder names (numpy array, sorted by name)
#   files - path of each experiment's embedding table, in the same order
#   cols  - names of the embedding columns read from each table
#   dims  - number of embedding dimensions to use, one entry per experiment

DIR = "/home/ubuntu/large-bascivi/exp_logs/v6"

# Number of latent dimensions stored per cell; drives both `cols` and `dims`.
EMBEDDING_DIM = 10

exp_logs = os.listdir(DIR)
exps = []
files = []

for el in exp_logs:
    fname = os.path.join(DIR, el,'pred_embeddings.tsv')

    # Folders without an embedding table (e.g. unfinished runs) are skipped.
    if os.path.exists(fname):
        exps.append(el)
        files.append(fname)

# Sort on name guarantee matching files to study names

exps = np.asarray(exps)
files = np.asarray(files)

inds = np.argsort(exps)

exps = exps[inds]
files = files[inds]
cols = ["embedding_"+ str(i) for i in range(EMBEDDING_DIM)]

# One entry per discovered experiment.  The previous fixed length of 50 made
# dims[ii] raise IndexError as soon as more than 50 experiments were present.
dims = [EMBEDDING_DIM] * len(files)

print(files)

In [None]:
# Set up for KNI
#
# The first experiment's table provides the reference list of studies and the
# shape of the results grid that the loop below fills in.

# Alternative embedding sources used for other methods (kept for reference):
#df_embeddings = pd.read_csv(files[0])
#df_embeddings = df_embeddings.join(df_embeddings_harmony.copy(), on='barcode',rsuffix='_') # scGPT
#df_embeddings[cols] = pca_vals # PCA
#df_embeddings = df_embeddings.join(df_embeddings_harmony.copy(), on='barcode',rsuffix='_') # Seurat
#cols = cols_list[0] # For bbknn

df_embeddings = pd.read_csv(files[0], sep='\t') # scVI / BAscVI / Harmony / Scanorama

# Integer code per cell for its annotated cell type.
cell_types = df_embeddings['standard_true_celltype'].astype('category').cat.codes.to_numpy(dtype=int)

# Study labels as a categorical: `mapping` holds the study names and
# `study_name` the per-cell integer code into `mapping`.
cat = df_embeddings['study_name'].astype('category')
study_name = cat.cat.codes
mapping = cat.cat.categories
studies = list(range(len(mapping)))

# One row per study code, one column per experiment, every count starting at 0.
results = pd.DataFrame(0.0, index=studies, columns=exps)

print("Starting Loop")

# KNI loop
#
# For every experiment file:
#   1. centre each embedding dimension and scale it by its interquartile range;
#   2. find each cell's 50 nearest neighbours;
#   3. for cells with fewer than 40 same-study neighbours, predict the cell
#      type by majority vote over the neighbours from *other* studies and count
#      the correct predictions per study (`kni`) and the eligible cells (`batch`);
#   4. separately, hold out each study in turn, fit a 10-NN classifier on the
#      remaining cells and count correct predictions on the held-out study (`acc`).
# Per-study `kni` counts are stored in `results`, one column per experiment.

for ii,fname in enumerate(files):

    # Alternative embedding sources used for other methods (kept for reference):
    #df_embeddings = pd.read_csv(files[0])
    #df_embeddings = df_embeddings.join(df_embeddings_harmony.copy(), on='barcode',rsuffix='_') # scGPT
    #df_embeddings[cols] = pca_vals # PCA
    #df_embeddings = df_embeddings.join(df_embeddings_harmony.copy(), on=index_names[ii],rsuffix='_') # Seurat
    #cols = cols_list[ii]

    df_embeddings = pd.read_csv(fname,delimiter='\t') # scVI / BAscVI / Harmony / Scanorama

    emb_cols = cols[:dims[ii]]

    # Put every dimension on a comparable scale: zero mean, unit IQR.
    for col in emb_cols:
        centred = df_embeddings[col] - np.mean(df_embeddings[col])
        (q1,q2) = np.quantile(centred,[0.25,0.75])
        df_embeddings[col] = centred/(q2-q1)

    cell_types = np.asarray(df_embeddings['standard_true_celltype'].astype('category').cat.codes,dtype=int)
    cat = df_embeddings['study_name'].astype('category')
    mapping = cat.cat.categories
    study_name = cat.cat.codes
    # Plain array copy of the study codes for positional indexing below.
    study_codes = study_name.to_numpy()

    features = df_embeddings[emb_cols].values

    classifier = KNeighborsClassifier(n_neighbors=50) # 25 used for csv file
    classifier.fit(features, cell_types)

    # Called without a query matrix, kneighbors() returns the neighbours of the
    # training points themselves; vals[1] holds the neighbour row positions.
    vals = classifier.kneighbors(n_neighbors=50)
    neighbor_idx = vals[1]

    knn_ct = cell_types[neighbor_idx]     # cell type of every neighbour
    knn_exp = study_codes[neighbor_idx]   # study of every neighbour

    # True where a neighbour comes from a different study than the cell itself.
    self_mask = knn_exp != study_codes[:, None]
    # Number of same-study neighbours per cell.
    cutoff = np.sum(np.logical_not(self_mask),axis=1)

    acc = {study:0 for study in studies}
    batch = {study:0 for study in studies}
    kni = {study:0 for study in studies}

    # Only cells with fewer than 40 same-study neighbours are scored.
    mask_1 = cutoff < 40

    for i in np.flatnonzero(mask_1):
        # Majority cell type among the neighbours from other studies.
        pred = np.argmax(np.bincount(knn_ct[i][self_mask[i]]))
        batch[study_codes[i]] +=1
        if pred == cell_types[i]:
            kni[study_codes[i]] +=1

    print(fname)
    total = 0
    for study in studies:
        print(mapping[study], '\t', kni[study])

        # Single .loc call on the frame itself: the chained form
        # results[col].loc[row] = ... assigns into a temporary under pandas
        # Copy-on-Write and would leave `results` untouched.
        results.loc[study, exps[ii]] = kni[study]
        total += kni[study]
    print("Total:  ", total, " Cell N:  ", df_embeddings.shape[0]," % Acc: " , total/df_embeddings.shape[0])
    print()

    # Break down into accuracy vs. batch / kbet

    print("Batch breadkdown:")
    print()

    for study in studies:
        print(mapping[study], '\t', batch[study])

    print()
    print("Accuracy breadkdown:")
    print()

    for study in studies:

        classifier = KNeighborsClassifier(n_neighbors=10) # 25 used for csv file

        study_mask = mapping[study] == df_embeddings['study_name']
        in_study = study_mask.to_numpy()
        out_of_study = np.logical_not(in_study)

        # Train on every other study, evaluate on the held-out one.
        classifier.fit(features[out_of_study,:], cell_types[out_of_study])

        pred = classifier.predict(features[in_study,:])
        acc[study] = np.sum(pred == cell_types[in_study])

        print(mapping[study],'\t',acc[study])

    print()

#results.to_csv("KNI_results/baScVI_ScVI_lp.csv")


In [None]:
# Disabled scratch copy of the per-study "Accuracy breakdown" step from the KNI
# loop.  It is wrapped in a string literal, so none of it executes; the cell
# merely evaluates to that string.  The quoted code refers to `study`, `ii`,
# `mapping`, `cols`, `dims`, `cell_types` and `acc`, none of which it defines.
'''
classifier = KNeighborsClassifier(n_neighbors=10) # 25 used for csv file

study_mask = mapping[study] == df_embeddings['study_name']
print(study)
classifier.fit(df_embeddings[cols[:dims[ii]]].values[np.logical_not(study_mask),:], 
               cell_types[np.logical_not(study_mask)])

pred = classifier.predict(df_embeddings[cols[:dims[ii]]].values[study_mask,:])
acc[study] = np.sum(pred == cell_types[study_mask])

print(acc[study])
'''

In [None]:
# Set up for KNI with FOXP3+ labels
#
# Builds the reference label vocabulary (`mapping_cell`), the study vocabulary
# (`mapping`) and an all-zero results grid with one row per FOXP3 cell-type
# label and one column per experiment.

df_embeddings = pd.read_csv(files[0], sep='\t')

# Column sets used for other methods (kept for reference):
#cols = ["embedding_"+ str(i) for i in range(10)]
#cols = ["X_umap10_bbknn_harm_study"+ str(i) for i in range(10)] # bbKNN
#cols = ["harmony_embedding"+ str(i) for i in range(50)]
#cols = ["harmony_sample_embedding"+ str(i) for i in range(10)]
#df_embeddings = df_embeddings.join(df_embeddings_harmony.copy(), on='barcode',rsuffix='_') # Seurat

cols = ["PC_%d" % n for n in range(1, 51)] # Harmony
# NOTE(review): `pca_vals` is not defined in the visible part of this notebook;
# it must already exist in the kernel for this cell to run.
df_embeddings[cols] = pca_vals # PCA

dims = [10]

# Attach the FOXP3 annotation by barcode; columns that clash with the
# embedding table receive the '_FOXP3' suffix.
df_names = pd.read_csv('foxp3_labels.csv').set_index('barcode')
df_embeddings = df_embeddings.join(df_names, on='barcode', rsuffix='_FOXP3')

# FOXP3-aware cell-type labels: integer code per cell plus the label names.
cat = df_embeddings['standard_true_celltype_FOXP3'].astype('category')
cell_types = cat.cat.codes.to_numpy(dtype=int)
mapping_cell = cat.cat.categories

# Study labels: names in `mapping`, per-cell codes in `study_name`.
cat = df_embeddings['study_name'].astype('category')
study_name = cat.cat.codes
mapping = cat.cat.categories

studies = list(range(len(mapping)))

results = pd.DataFrame(0.0, index=mapping_cell, columns=exps)

print("Starting Loop")

# KNI loop
#
# Same neighbour-vote scoring as the study-level KNI loop, but correct
# predictions are tallied per FOXP3 cell-type label instead of per study, and
# one study is excluded before scoring.  Counts are written to `results` and
# saved as CSV.

#for ii,fname in enumerate(['./baScVI_ScVI_hp_lrx0.5/BaScVI_4L_Both/train_embeddings.tsv',
#                           './baScVI_ScVI_lp/ScVI_4L_Both/train_embeddings.tsv',
#                           './baScVI_ScVI_hp/ScVI_2L_Sample/train_embeddings.tsv',
#                           'vae_scvi/VAE_MSE/train_embeddings.tsv']):

# The label table does not depend on the embedding file, so read it once.
df_names = pd.read_csv('foxp3_labels.csv')
df_names = df_names.set_index('barcode')

for ii,fname in enumerate([files[0]]):

    df_embeddings = pd.read_csv(fname,delimiter='\t')
    #df_embeddings = df_embeddings.join(df_embeddings_harmony.copy(), on=index_names[ii],rsuffix='_') # Seurat 

    cols = ["PC_"+ str(i+1) for i in range(50)] # Harmony
    # NOTE(review): `pca_vals` is not defined in the visible part of this
    # notebook; it must already exist in the kernel for this cell to run.
    df_embeddings[cols] = pca_vals # PCA

    # Drop the excluded study; copy so later column writes hit a real frame.
    mask = df_embeddings['study_name'] != 'external_wu_natgenet_2021_34493872'
    df_embeddings = df_embeddings[mask].copy()

    df_embeddings = df_embeddings.join(df_names, on='barcode',rsuffix='_FOXP3')
    print(df_embeddings.shape)

    # Encode against the fixed `mapping_cell` vocabulary.  Re-deriving the
    # categories from the filtered frame would shift the codes whenever the
    # excluded study removes a label, so acc[mapping_cell[code]] below would
    # credit the wrong cell type.
    cell_types = pd.Categorical(df_embeddings['standard_true_celltype_FOXP3'],
                                categories=mapping_cell).codes.astype(int)
    cell_types[cell_types==-1] = 0 #sets 30 cells to 0 for Harmony evaluation

    cat = df_embeddings['study_name'].astype('category')
    mapping = cat.cat.categories
    study_name = cat.cat.codes
    # Plain array copy of the study codes for positional indexing below.
    study_codes = study_name.to_numpy()

    # Normalization to standard local distance spread for RbNI

    # NOTE(review): only the first dims[ii] columns are scaled, yet the
    # classifier below is fitted on every column in `cols` - confirm intended.
    for col in cols[:dims[ii]]:
        centred = df_embeddings[col] - np.mean(df_embeddings[col])
        (q1,q2) = np.quantile(centred,[0.25,0.75])
        df_embeddings[col] = centred/(q2-q1)

    classifier = KNeighborsClassifier(n_neighbors=50)
    classifier.fit(df_embeddings[cols], cell_types)

    # Called without a query matrix, kneighbors() returns the neighbours of the
    # training points themselves; vals[1] holds the neighbour row positions.
    vals = classifier.kneighbors(n_neighbors=50)
    neighbor_idx = vals[1]

    knn_ct = cell_types[neighbor_idx]     # cell type of every neighbour
    knn_exp = study_codes[neighbor_idx]   # study of every neighbour

    # True where a neighbour comes from a different study than the cell itself.
    self_mask = knn_exp != study_codes[:, None]
    # Number of same-study neighbours per cell.
    cutoff = np.sum(np.logical_not(self_mask),axis=1)

    acc = {cell_type:0 for cell_type in mapping_cell}

    # Only cells with fewer than 40 same-study neighbours are scored.
    mask_1 = cutoff < 40

    for i in np.flatnonzero(mask_1):
        # Majority cell type among the neighbours from other studies.
        pred = np.argmax(np.bincount(knn_ct[i][self_mask[i]]))
        if pred == cell_types[i]:
            acc[mapping_cell[cell_types[i]]] +=1

    print(fname)
    total = 0
    for cell_type in mapping_cell:

        print(cell_type, '\t', acc[cell_type])

        # Single .loc call on the frame itself: the chained form
        # results[col].loc[row] = ... assigns into a temporary under pandas
        # Copy-on-Write and would leave `results` (and the CSV) all zeros.
        results.loc[cell_type, exps[ii]] = acc[cell_type]
        total += acc[cell_type]
    print("Total:  ", total)
    print()

results.to_csv("KNI_scanorama_foxp3.csv")

In [None]:
# Evaluate on Wu holdout study
#
# Points `files` at the holdout run and rebuilds the study vocabulary and the
# all-zero results grid from its embedding table.

files = ['holdout/BA_scVI_1k_holdout/train_embeddings.tsv']

# Set up for KNI

df_embeddings = pd.read_csv(files[0], sep='\t')

cols = ["embedding_%d" % n for n in range(10)]
dims = [10]

# Leave the Schuster study out of the evaluation.
mask_tissue = df_embeddings['study_name'] != 'external_macfib_schuster_2020_000000'
df_embeddings = df_embeddings.loc[mask_tissue]

# Integer code per cell for its annotated cell type.
cell_types = df_embeddings['standard_true_celltype'].astype('category').cat.codes.to_numpy(dtype=int)

# Study labels: names in `mapping`, per-cell codes in `study_name`.
cat = df_embeddings['study_name'].astype('category')
study_name = cat.cat.codes
mapping = cat.cat.categories
studies = list(range(len(mapping)))

# One row per study code, one column per entry of `exps`, every count at 0.
results = pd.DataFrame(0.0, index=studies, columns=exps)

print("Starting Loop")

# KNI loop
#
# Holdout variant of the study-level KNI evaluation: after dropping the
# Schuster study, each cell with fewer than 40 same-study neighbours (out of
# 50) is scored by majority vote over its neighbours from other studies, and a
# leave-one-study-out 10-NN accuracy is printed per study.

for ii,fname in enumerate(files):

    df_embeddings = pd.read_csv(fname,delimiter='\t') # scVI / BAscVI / Harmony / Scanorama

    # Drop the excluded study; copy so later column writes hit a real frame.
    mask_tissue = df_embeddings['study_name'] != 'external_macfib_schuster_2020_000000'
    df_embeddings = df_embeddings[mask_tissue].copy()

    emb_cols = cols[:dims[ii]]

    # Put every dimension on a comparable scale: zero mean, unit IQR.
    for col in emb_cols:
        centred = df_embeddings[col] - np.mean(df_embeddings[col])
        (q1,q2) = np.quantile(centred,[0.25,0.75])
        df_embeddings[col] = centred/(q2-q1)

    cell_types = np.asarray(df_embeddings['standard_true_celltype'].astype('category').cat.codes,dtype=int)
    cat = df_embeddings['study_name'].astype('category')
    mapping = cat.cat.categories
    study_name = cat.cat.codes
    # Plain array copy of the study codes: the filtered frame no longer has a
    # contiguous index, so all per-cell lookups below are positional.
    study_codes = study_name.to_numpy()

    features = df_embeddings[emb_cols].values

    classifier = KNeighborsClassifier(n_neighbors=50) # 25 used for csv file
    classifier.fit(features, cell_types)

    # Called without a query matrix, kneighbors() returns the neighbours of the
    # training points themselves; vals[1] holds the neighbour row positions.
    vals = classifier.kneighbors(n_neighbors=50)
    neighbor_idx = vals[1]

    knn_ct = cell_types[neighbor_idx]     # cell type of every neighbour
    knn_exp = study_codes[neighbor_idx]   # study of every neighbour

    # True where a neighbour comes from a different study than the cell itself.
    self_mask = knn_exp != study_codes[:, None]
    # Number of same-study neighbours per cell.
    cutoff = np.sum(np.logical_not(self_mask),axis=1)

    acc = {study:0 for study in studies}
    batch = {study:0 for study in studies}
    kni = {study:0 for study in studies}

    # Only cells with fewer than 40 same-study neighbours are scored.
    mask_1 = cutoff < 40

    for i in np.flatnonzero(mask_1):
        # Majority cell type among the neighbours from other studies.
        pred = np.argmax(np.bincount(knn_ct[i][self_mask[i]]))
        batch[study_codes[i]] +=1
        if pred == cell_types[i]:
            kni[study_codes[i]] +=1

    print(fname)
    total = 0
    for study in studies:
        print(mapping[study], '\t', kni[study])

        # Single .loc call on the frame itself: the chained form
        # results[col].loc[row] = ... assigns into a temporary under pandas
        # Copy-on-Write and would leave `results` untouched.
        # NOTE(review): the column label comes from `exps`, which is not
        # rebuilt in this cell for the holdout `files` - confirm the labelling.
        results.loc[study, exps[ii]] = kni[study]
        total += kni[study]
    print("Total:  ", total, " Cell N:  ", df_embeddings.shape[0]," % Acc: " , total/df_embeddings.shape[0])
    print()

    # Break down into accuracy vs. batch / kbet

    print("Batch breadkdown:")
    print()

    for study in studies:
        print(mapping[study], '\t', batch[study])

    print()
    print("Accuracy breadkdown:")
    print()

    for study in studies:

        classifier = KNeighborsClassifier(n_neighbors=10) # 25 used for csv file

        study_mask = mapping[study] == df_embeddings['study_name']
        in_study = study_mask.to_numpy()
        out_of_study = np.logical_not(in_study)

        # Train on every other study, evaluate on the held-out one.
        classifier.fit(features[out_of_study,:], cell_types[out_of_study])

        pred = classifier.predict(features[in_study,:])
        acc[study] = np.sum(pred == cell_types[in_study])

        print(mapping[study],'\t',acc[study])

    print()

#results.to_csv("KNI_results/baScVI_ScVI_lp.csv")



In [None]:
# Reload the first embedding table and switch `cols` to the 50
# 'harmony_sample_embedding<i>' column names.
df_embeddings = pd.read_csv(files[0], sep='\t')
cols = ["harmony_sample_embedding%d" % n for n in range(50)]


In [None]:
# Evaluate on Wu holdout studies

# Point `files` at the holdout run's embedding table.
files = ['holdout/BA_scVI_1k_holdout/train_embeddings.tsv']

# Set up for KNI

df_embeddings = pd.read_csv(files[0], sep='\t')


In [None]:
# Keep every cell except those from the Schuster study.
mask_internal = df_embeddings['study_name'].ne('external_macfib_schuster_2020_000000')
df_embeddings = df_embeddings.loc[mask_internal]

In [None]:
# Print the number of cells per study for every file in `files`, with the
# studies listed in sorted name order.
for ii, fname in enumerate(files):

    # Alternative embedding source used for other methods (kept for reference):
    #df_embeddings = pd.read_csv(files[0])   
    #df_embeddings = df_embeddings.join(df_embeddings_harmony.copy(), on='barcode',rsuffix='_') # scGPT 

    df_embeddings = pd.read_csv(fname, sep='\t') # scVI / BAscVI / Harmony / Scanorama

    snlist = sorted(df_embeddings['study_name'].unique())
    for sn in snlist:
        n_cells = (df_embeddings['study_name'] == sn).sum()
        print(sn, n_cells)