Notebook for processing and doublet removal of the TEC Lineage 


In [None]:
# fine annotate lineage 
import scanpy as sc
import numpy as np
import pandas as pd
import scvi
import seaborn as sns
import os,sys
#import mudata
import anndata
## Add this line so the text on pdf is correctly recognised!!!
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42
import scvi
import scvi_wrapper as sv

#import multi_view_atlas as mva
# Print the scanpy header (package versions) so the run environment is recorded with the notebook.
sc.logging.print_header()
# Global figure defaults: 150 dpi on screen, 300 dpi when saved, 'RdPu' colour map,
# vector-friendly rendering and PDF as the save format.
sc.settings.set_figure_params(dpi = 150, color_map = 'RdPu', dpi_save = 300, vector_friendly = True, format = 'pdf')

# Load large object

In [None]:
# Read the full AnnData object from disk (the directory part of the path is elided as '...').
adata = sc.read_h5ad('...adata_full_rev_2.h5ad')
# Show the object summary as the cell output.
adata

In [None]:
# Overview UMAP of the full object, coloured by the 'leiden_2' clustering and the
# 'cell_type_level_0' annotation, with the group labels drawn on the embedding.
sc.set_figure_params(dpi=100, figsize=[10,10], fontsize=10)
sc.pl.umap(
    adata,
    color=['leiden_2', 'cell_type_level_0'],
    s=1,
    frameon=False,
    legend_loc='on data',
    legend_fontoutline=1,
    outline_color='white',
)

In [None]:
# Keep only the cells of 'leiden_2' clusters 27, 25, 16 and 20 from the full object.
# NOTE(review): the previous comment called these "sp clusters"; given the notebook title
# they are presumably the TEC-lineage clusters plus associated ones — confirm.
adata_tec = adata[
    adata.obs['leiden_2'].isin(['27', '25', '16', '20'])
]

# UMAP of the subset, coloured by the original 'leiden_2' labels.
sc.set_figure_params(dpi=100, figsize=[10,10], fontsize=10)
sc.pl.umap(
    adata_tec,
    color='leiden_2',
    s=1,
    frameon=False,
    legend_loc='on data',
    legend_fontoutline=1,
    outline_color='white',
)

In [None]:
# Keep only samples that contribute more than 5 cells to the subset.
# The counts index is filtered with a boolean mask instead of integer positions:
# indexing a label-indexed Series with integers relies on a deprecated positional
# fallback and would silently become a label lookup if sample ids were integers.
sample_counts = adata_tec.obs['sample'].value_counts(dropna=False)
samples_to_keep = sample_counts.index[sample_counts > 5]
adata_tec = adata_tec[adata_tec.obs['sample'].isin(samples_to_keep)]
# Show the filtered object summary as the cell output.
adata_tec

In [None]:
# Re-integrate the subset with the project's scVI wrapper, using 'sample' as the batch key.
# NOTE(review): sv.scvi_wrapper_basic is defined outside this notebook; it presumably also
# builds the neighbour graph and UMAP that the following cells rely on — confirm.
adata_tec = sv.scvi_wrapper_basic(
    adata_tec,
    max_epochs=350,
    remove_vdjgenes=True,
    hvg=3500,
    batch_key='sample',
    categorical_covariate_keys=['chemistry_simple','age_group','study','sex','donor'],
    continuous_covariate_keys = None,
    cluster_leiden=False,
    layer_use=None,
    remove_cite=False,
)

# Leiden clustering at resolutions 1 to 4, stored as 'leiden_1' ... 'leiden_4'.
for res in (1, 2, 3, 4):
    sc.tl.leiden(adata_tec, resolution=res, key_added=f"leiden_{res}")

In [None]:
# Two-column UMAP panel of the re-integrated subset: clusterings, donor, study,
# existing fine annotation and doublet score.
sc.set_figure_params(dpi=100, figsize=[10,10], fontsize=10)
sc.pl.umap(
    adata_tec,
    color=[
        'leiden_2',
        'donor',
        'cell_type_level_4',
        'doublet_score',
        'study',
        'leiden_3',
        'leiden_4',
    ],
    ncols=2,
    s=5,
    frameon=False,
    legend_loc='on data',
    legend_fontoutline=1,
    outline_color='white',
)

In [None]:
# Highlight only 'leiden_2' cluster 26 on the UMAP (restricted via `groups`).
sc.pl.umap(
    adata_tec,
    color=['leiden_2'],
    groups='26',
    ncols=2,
    s=5,
    frameon=False,
    legend_loc='on data',
    legend_fontoutline=1,
    outline_color='white',
)

In [None]:
# Tally cells per 'cell_type_level_0' label in the subset; dropna=False also counts missing labels.
adata_tec.obs['cell_type_level_0'].value_counts(dropna=False)

In [None]:
# Dot plot of 'doublet_score' grouped by 'leiden_2' cluster, with the axes swapped.
# NOTE(review): 'doublet_score' is also used as a UMAP colour key, so it is presumably an
# obs column rather than a gene — confirm.
sc.pl.dotplot(adata_tec,var_names=['doublet_score'],groupby='leiden_2',swap_axes=True)

In [None]:
# Change into the output directory (path elided as '...') and save the re-integrated subset.
# Note that os.chdir changes the working directory for every later cell as well.
os.chdir('...')
adata_tec.write_h5ad('adata_tec_rev_1.h5ad')

In [None]:
# Reload the saved checkpoint so the notebook can be resumed from here
# without re-running the integration above.
os.chdir('...')
adata_tec = sc.read_h5ad('adata_tec_rev_1.h5ad')

In [None]:
# Flag cells with no 'cell_type_level_3' label; the flag is stored as the strings
# 'True' / 'False' so it can be plotted as a categorical.
adata_tec.obs['unannotated'] = (
    adata_tec.obs['cell_type_level_3'].isna().astype('str')
)
adata_tec.obs['unannotated'].value_counts()

# UMAP showing only the unannotated ('True') cells.
sc.set_figure_params(figsize=[15,15], fontsize=10)
sc.pl.umap(
    adata_tec,
    color=['unannotated'],
    groups='True',
    ncols=3,
    wspace=0.5,
    s=5,
    frameon=False,
    cmap='gist_rainbow',
)

In [None]:
# Remove 'leiden_2' clusters 26 and 29 from the subset.
adata_tec = adata_tec[~adata_tec.obs['leiden_2'].isin(['26','29'])]

# Re-apply the sample filter after the removal: keep samples with more than 5 cells left.
# The counts index is filtered with a boolean mask instead of integer positions:
# indexing a label-indexed Series with integers relies on a deprecated positional
# fallback and would silently become a label lookup if sample ids were integers.
sample_counts = adata_tec.obs['sample'].value_counts(dropna=False)
samples_to_keep = sample_counts.index[sample_counts > 5]
adata_tec = adata_tec[adata_tec.obs['sample'].isin(samples_to_keep)]
# Show the filtered object summary as the cell output.
adata_tec

In [None]:
# Second integration pass on the cleaned subset, this time with 5000 highly variable genes
# (the first pass used 3500); all other arguments are the same as in the first pass.
# NOTE(review): sv.scvi_wrapper_basic is defined outside this notebook; it presumably stores
# the 'X_scVI' embedding and the UMAP used by later cells — confirm.
adata_tec = sv.scvi_wrapper_basic(
    adata_tec,
    max_epochs=350,
    remove_vdjgenes=True,
    hvg=5000,
    batch_key='sample',
    categorical_covariate_keys=['chemistry_simple','age_group','study','sex','donor'],
    continuous_covariate_keys = None,
    cluster_leiden=False,
    layer_use=None,
    remove_cite=False,
) 

In [None]:
# Tally 'cell_type_level_0' labels again after the cleaning step; dropna=False also counts missing labels.
adata_tec.obs['cell_type_level_0'].value_counts(dropna=False)

In [None]:
# UMAP of the cleaned subset coloured by fine annotation, age group and the 'PCNA' key.
sc.set_figure_params(dpi=100, figsize=[10,10], fontsize=10)
sc.pl.umap(
    adata_tec,
    color=['cell_type_level_4', 'age_group', 'PCNA'],
    ncols=2,
    s=5,
    frameon=False,
    legend_loc='on data',
    legend_fontoutline=1,
    outline_color='white',
)

In [None]:
# Recompute the 'unannotated' flag on the cleaned subset: 'True' where 'cell_type_level_3'
# is missing, stored as strings so it can be plotted as a categorical.
adata_tec.obs['unannotated'] = (
    adata_tec.obs['cell_type_level_3'].isna().astype('str')
)
adata_tec.obs['unannotated'].value_counts()

# UMAP showing only the unannotated ('True') cells.
sc.set_figure_params(figsize=[15,15], fontsize=10)
sc.pl.umap(
    adata_tec,
    color=['unannotated'],
    groups='True',
    ncols=3,
    wspace=0.5,
    s=5,
    frameon=False,
    cmap='gist_rainbow',
)

## KNN-prediction

In [None]:
# Transfer annotations to unannotated cells with a weighted KNN classifier. Adapted from: https://github.com/LungCellAtlas/mapping_data_to_the_HLCA/blob/main/scripts/scarches_label_transfer.py

from collections import Counter

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsTransformer

def weighted_knn_trainer(train_adata, train_adata_emb, n_neighbors=50):
    """Fit a nearest-neighbour index on the reference cells.

    Parameters
    ----------
    train_adata: :class:`~anndata.AnnData`
        Reference dataset whose embedding the neighbour index is fitted on.
    train_adata_emb: str
        Key in ``train_adata.obsm`` holding the embedding to use, or ``"X"`` to
        use ``train_adata.X`` directly.
    n_neighbors: int
        Number of nearest neighbours the transformer will return per query.

    Returns
    -------
    The fitted ``KNeighborsTransformer`` (brute-force search, euclidean
    distances, ``mode="distance"``, all cores).

    Raises
    ------
    ValueError
        If ``train_adata_emb`` is neither ``"X"`` nor a key of ``train_adata.obsm``.
    """
    # Progress message deliberately has no trailing newline.
    print(f"Weighted KNN with n_neighbors = {n_neighbors} ... ", end="")

    knn = KNeighborsTransformer(
        n_neighbors=n_neighbors,
        mode="distance",
        algorithm="brute",
        metric="euclidean",
        n_jobs=-1,
    )

    # Guard clause: reject anything that is not "X" or a known obsm key.
    if train_adata_emb != "X" and train_adata_emb not in train_adata.obsm.keys():
        raise ValueError(
            "train_adata_emb should be set to either 'X' or the name of the obsm layer to be used!"
        )

    embedding = (
        train_adata.X if train_adata_emb == "X" else train_adata.obsm[train_adata_emb]
    )
    knn.fit(embedding)
    return knn


def weighted_knn_transfer(
    query_adata,
    query_adata_emb,
    ref_adata_obs,
    label_keys,
    knn_model,
    threshold=1,
    pred_unknown=False,
    mode="package",
):
    """Annotates ``query_adata`` cells with an input trained weighted KNN classifier.

    Parameters
    ----------
    query_adata: :class:`~anndata.AnnData`
        Dataset whose cells are to be annotated.
    query_adata_emb: str
        Name of the obsm layer to be used for label transfer. If set to "X",
        query_adata.X will be used.
    ref_adata_obs: :class:`pd.DataFrame`
        obs of the reference AnnData, in the same row order as the data
        ``knn_model`` was fitted on.
    label_keys: str
        Column-name prefix. Every column of ``ref_adata_obs`` whose name starts
        with this string is transferred (so ``"cell_type"`` also selects
        ``"cell_type_fine"``).
    knn_model: :class:`~sklearn.neighbors._graph.KNeighborsTransformer`
        knn model trained on reference adata with weighted_knn_trainer function.
    threshold: float
        Minimum weighted vote share required to keep a prediction when
        ``pred_unknown`` is True; cells whose best label scores below it are
        annotated as "Unknown".
    pred_unknown: bool
        ``False`` by default. If `False`, ``threshold`` is not used and each cell
        is annotated with the label carrying the largest weighted vote among its
        nearest reference cells.
    mode: str
        Only "package" is implemented: uncertainties are 1 - P(pred_label).
        Any other value raises as soon as the first cell is processed.

    Returns
    -------
    pred_labels, uncertainties: tuple of :class:`pd.DataFrame`
        Both indexed by ``query_adata.obs_names`` with one column per
        transferred label column (object dtype).

    Raises
    ------
    ValueError
        If ``knn_model`` is not a ``KNeighborsTransformer`` or
        ``query_adata_emb`` is neither "X" nor an obsm key.
    Exception
        If ``mode`` is not "package".
    """
    if not isinstance(knn_model, KNeighborsTransformer):
        raise ValueError(
            "knn_model should be of type sklearn.neighbors._graph.KNeighborsTransformer!"
        )

    if query_adata_emb == "X":
        query_emb = query_adata.X
    elif query_adata_emb in query_adata.obsm.keys():
        query_emb = query_adata.obsm[query_adata_emb]
    else:
        raise ValueError(
            "query_adata_emb should be set to either 'X' or the name of the obsm layer to be used!"
        )
    top_k_distances, top_k_indices = knn_model.kneighbors(X=query_emb)

    # Per-cell kernel width derived from the spread of that cell's neighbour distances.
    stds = np.std(top_k_distances, axis=1)
    stds = (2.0 / stds) ** 2
    stds = stds.reshape(-1, 1)

    top_k_distances_tilda = np.exp(-np.true_divide(top_k_distances, stds))

    # Normalise so each cell's neighbour weights sum to 1.
    weights = top_k_distances_tilda / np.sum(
        top_k_distances_tilda, axis=1, keepdims=True
    )
    cols = ref_adata_obs.columns[ref_adata_obs.columns.str.startswith(label_keys)]

    # Results are collected in plain object arrays and turned into DataFrames once
    # at the end. The previous ``df.iloc[i][j] = value`` was chained assignment,
    # which writes nothing under pandas Copy-on-Write and is slow otherwise.
    n_query = len(weights)
    pred_values = np.empty((n_query, len(cols)), dtype=object)
    uncertainty_values = np.empty((n_query, len(cols)), dtype=object)

    for col_pos, col in enumerate(cols):
        # Looked up once per label column rather than once per cell.
        y_train_labels = ref_adata_obs[col].values
        for i in range(n_query):
            neighbor_labels = y_train_labels[top_k_indices[i]]
            best_label, best_prob = None, 0.0
            for candidate_label in np.unique(neighbor_labels):
                candidate_prob = weights[i, neighbor_labels == candidate_label].sum()
                if best_prob < candidate_prob:
                    best_prob = candidate_prob
                    best_label = candidate_label

            if pred_unknown:
                if best_prob >= threshold:
                    pred_label = best_label
                else:
                    pred_label = "Unknown"
            else:
                pred_label = best_label

            if mode == "package":
                uncertainty_values[i, col_pos] = max(1 - best_prob, 0)
            else:
                raise Exception("Inquery Mode!")

            pred_values[i, col_pos] = pred_label

    uncertainties = pd.DataFrame(
        uncertainty_values, columns=cols, index=query_adata.obs_names
    )
    pred_labels = pd.DataFrame(pred_values, columns=cols, index=query_adata.obs_names)

    print("finished!")

    return pred_labels, uncertainties

In [None]:
# Reference set: cells whose 'unannotated' flag is 'False', i.e. 'cell_type_level_3' is present.
train_adata = adata_tec[adata_tec.obs['unannotated']=='False']
# Fit the neighbour index on the 'X_scVI' embedding, returning 20 neighbours per query cell.
k_neighbors_transformer = weighted_knn_trainer(train_adata, 'X_scVI', n_neighbors=20)
# Show the reference object summary as the cell output.
train_adata

In [None]:
# Query set: cells whose 'unannotated' flag is 'True'; these receive transferred labels.
target_adata = adata_tec[adata_tec.obs['unannotated']=='True']
# Show the query object summary as the cell output.
target_adata

In [None]:
# cell_type_level_4

In [None]:
# Transfer labels from the reference cells to the query cells in 'X_scVI' space.
# label_keys is matched as a column-name prefix by weighted_knn_transfer, so every obs
# column starting with 'cell_type_level_4' is transferred.
# With pred_unknown=False the threshold is not applied: every query cell gets its best label.
pred_labels, uncertainties = weighted_knn_transfer(
    query_adata=target_adata,
    query_adata_emb='X_scVI',
    ref_adata_obs=train_adata.obs,
    label_keys='cell_type_level_4',
    knn_model=k_neighbors_transformer,
    threshold=1,
    pred_unknown=False,
    mode="package",
)

In [None]:
# Histogram of the transfer uncertainties (1 - weighted vote share of the predicted label)
# for the 'cell_type_level_4' column.
uncertainties.cell_type_level_4.hist()

In [None]:
# Start from the existing annotation; cells that already have a label keep it
# and get an uncertainty of 0.
adata_tec.obs['pred_cell_type_level_4'] = adata_tec.obs['cell_type_level_4']
# Initialise as float (0.0, not 0) and cast the transferred values to float before
# assigning them. weighted_knn_transfer returns object-dtype columns, which would
# otherwise turn this column into a non-numeric one before it is plotted below.
adata_tec.obs['pred_cell_type_level_4_uncertainties'] = 0.0
adata_tec.obs.loc[pred_labels.index,'pred_cell_type_level_4'] = pred_labels['cell_type_level_4']
adata_tec.obs.loc[pred_labels.index,'pred_cell_type_level_4_uncertainties'] = (
    uncertainties['cell_type_level_4'].astype('float64')
)
adata_tec.obs['pred_cell_type_level_4'].value_counts(dropna=False)

# UMAP of the cells that received a transferred label ('unannotated' is not 'False'),
# coloured by predicted label and by uncertainty.
sc.set_figure_params(dpi=200)
sc.pl.umap(adata_tec[adata_tec.obs['unannotated']!='False'], color=['pred_cell_type_level_4','pred_cell_type_level_4_uncertainties'],s=10)

In [None]:
# Log-scale violin plots of three QC keys, split by the 'unannotated' flag, to compare
# annotated and unannotated cells.
sc.pl.violin(adata_tec, keys=['pct_counts_ribo','pct_counts_mt','n_genes_by_counts'],groupby='unannotated',s=1,log=True)

In [None]:
# Store the uncertainty column as float32.
# NOTE(review): presumably done so the column is a plain numeric dtype before the object is
# written to h5ad in the next cell — confirm.
adata_tec.obs['pred_cell_type_level_4_uncertainties'] = adata_tec.obs['pred_cell_type_level_4_uncertainties'].astype('float32')

In [None]:
# Change into the output directory (path elided as '...') and save the cleaned subset
# together with the transferred labels and uncertainties.
os.chdir('...')
adata_tec.write_h5ad('adata_tec_rev_1_clean.h5ad')