# Sciplex3
In this notebook, we load the sciplex3 dataset (4 cell lines, 188 drugs, 4 dosages).
The count matrix contains about 1 007 419 688 (1 billion) nonzero elements and about 700 000 cells.

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata
import os
import importlib
from sklearn.decomposition import PCA
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import sys


In [24]:
# Directory that contains the datasets.
# Set the SCIPLEX_DATA_DIR environment variable to point somewhere else without
# editing the notebook; the fallback below is the original hard-coded location.
data_dir = os.environ.get(
    'SCIPLEX_DATA_DIR',
    'C:/Users/nbrouwer1/Documents/VS_projects/prepare_data/input_files/sciplex_selections/',
)

fn = os.path.join(data_dir, 'Srivatsan_2019_raw.h5ad')
if not os.path.isfile(fn):
    # Fail early with an actionable message instead of an opaque reader error.
    raise FileNotFoundError(
        f'Dataset not found: {fn}. Set SCIPLEX_DATA_DIR or edit data_dir.'
    )

adata = sc.read(fn)
adata

AnnData object with n_obs × n_vars = 354640 × 2000
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'batch', 'n_counts', 'dose_val', 'condition', 'drug_dose_name', 'cov_drug_dose_name', 'cov_drug', 'control', 'split_ho_pathway', 'split_tyrosine_ood', 'split_epigenetic_ood', 'split_cellcycle_ood', 'SMILES', 'split_ood_finetuning', 'split_ho_epigenetic', 'split_ho_epigenetic_all', 'split_random', 'split_ood'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'gene_id', 'in_lincs', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'all_DEGs', 'hvg', 'lincs_DEGs', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'rdkit2d', 'rdkit2d_dose'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [None]:
# metalabels
# `doi` and `var_genes` are defined here so that the cell runs on a fresh kernel
# (they were previously referenced without being defined anywhere in the notebook).
doi = '10.1126/science.aax6234'  # NOTE(review): DOI of Srivatsan et al. (sci-Plex) — confirm
var_genes = None  # name of the adata.var column holding gene symbols; None keeps current var_names

adata.uns['preprocessing_nb_link'] = 'https://nbviewer.org/github/theislab/sc-pert/blob/main/datasets/Srivatsan_2019_curation.ipynb'
adata.uns['doi'] = doi
print(adata)
display(adata.obs.describe(include='all').T)

# filtering and processing
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=20)

# use gene symbols as gene names
# Done before the mitochondrial annotation so that the 'MT-' prefix match below
# operates on the final var_names.
if var_genes:
    adata.var[var_genes] = adata.var[var_genes].astype(str)
    adata.var = adata.var.reset_index().set_index(var_genes)
    print(adata.var_names)

adata.var_names_make_unique()

# annotate the group of mitochondrial genes as 'mt'
# NOTE(review): this only finds genes if var_names are gene symbols using the 'MT-' prefix.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# QC overview plots; the trailing semicolon on the last call suppresses the cell's repr output
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True)
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts');

In [None]:
# QC thresholds (exclusive upper bounds) — edit as needed
max_genes_by_counts = 8000
max_pct_counts_mt = 50

# Build one combined mask so the data is sliced once.
keep_cells = (
    (adata.obs['n_genes_by_counts'] < max_genes_by_counts)
    & (adata.obs['pct_counts_mt'] < max_pct_counts_mt)
)

# .copy() turns the sliced view into a real AnnData object: later cells add columns
# to adata.obs, which on a view triggers an implicit copy and a warning.
adata = adata[keep_cells.to_numpy(), :].copy()

In [None]:
# %load block3_standardize.py
# the following fields are meant to serve as a template
control = 'Vehicle'
replace_dict = {
    control: 'DMSO',  # most common control for small molecules, with its own strong effect
}


def _clean_product_name(product_name):
    """Return the bare compound name of a product label.

    Drops everything from the first ' (' onwards, then everything from the first
    'HCl' onwards, and finally strips surrounding whitespace so that a label such
    as 'X HCl' yields 'X' rather than 'X '.
    """
    return product_name.split(' (')[0].split('HCl')[0].strip()


adata.obs['perturbation_name'] = [_clean_product_name(s) for s in adata.obs.product_name]
# Map the control label onto its standardized name.
adata.obs['perturbation_name'] = adata.obs['perturbation_name'].replace(replace_dict)
adata.obs['perturbation_name'].unique()

In [None]:
# Standardized perturbation annotations.
adata.obs['perturbation_type'] = 'small molecule'
adata.obs['perturbation_value'] = adata.obs['dose']
# sci-Plex doses are molar concentrations, not masses, so the unit is nM rather than 'ug'.
# NOTE(review): confirm that the `dose` column is expressed in nM.
adata.obs['perturbation_unit'] = 'nM'

In [None]:
# Colour the UMAP by every obs column that has fewer than 30 distinct values
# (missing values count as one value, as with len(unique())).
low_cardinality_cols = [
    col
    for col in adata.obs.columns
    if adata.obs[col].nunique(dropna=False) < 30
]
sc.pl.umap(adata, color=low_cardinality_cols, wspace=.4)

In [None]:
# Show only cells whose perturbation is among the first 30 entries of value_counts(),
# coloured by perturbation name.
perturbation_counts = adata.obs.perturbation_name.value_counts()
top_perturbations = perturbation_counts.index[:30]
in_top_perturbations = adata.obs.perturbation_name.isin(top_perturbations)
sc.pl.umap(adata[in_top_perturbations], color='perturbation_name')