<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Functions" data-toc-modified-id="Functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Functions</a></span><ul class="toc-item"><li><span><a href="#Preprocess-10X" data-toc-modified-id="Preprocess-10X-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Preprocess 10X</a></span></li><li><span><a href="#Load-10X" data-toc-modified-id="Load-10X-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Load 10X</a></span></li><li><span><a href="#Reduce-dimensions" data-toc-modified-id="Reduce-dimensions-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Reduce dimensions</a></span></li></ul></li><li><span><a href="#Main" data-toc-modified-id="Main-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Main</a></span><ul class="toc-item"><li><span><a href="#Load-data-and-preprocess" data-toc-modified-id="Load-data-and-preprocess-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Load data and preprocess</a></span></li><li><span><a href="#Reduce-dimensions" data-toc-modified-id="Reduce-dimensions-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Reduce dimensions</a></span></li><li><span><a href="#Display-PCA" data-toc-modified-id="Display-PCA-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Display PCA</a></span></li></ul></li></ul></div>

# Imports

In [14]:
import scprep
import matplotlib.pyplot as plt
# from functions import *

# Functions

## Preprocess 10X

In [2]:
def preprocess_10X(data, name='10X-project', percent_mt=20, max_features=5000, min_features=200):
    """Filter a 10X expression matrix with scprep.

    The filters are applied in this order: drop empty cells and empty genes,
    filter cells on mitochondrial gene-set expression, filter cells on
    library size, and finally drop rare genes (``cutoff=0, min_cells=3``).

    Parameters
    ----------
    data : array-like
        Expression matrix accepted by scprep's filter functions. The frame
        produced by ``load_10X`` in this notebook has cells as rows and
        genes as columns.
    name : str
        Currently unused; kept so existing callers keep working.
    percent_mt : number in [0, 100]
        Forwarded as ``percentile=100 - percent_mt`` to
        ``scprep.filter.filter_gene_set_expression``.
        NOTE(review): this is a percentile over cells, not a per-cell
        mitochondrial fraction threshold -- confirm this is the intent.
    max_features, min_features : number
        Upper and lower bounds passed as ``cutoff`` to
        ``scprep.filter.filter_library_size`` with ``keep_cells='between'``.
        NOTE(review): the names suggest a number of detected genes, but the
        bounds are applied to library size -- confirm this is the intent.

    Returns
    -------
    The filtered data as returned by scprep (not a Seurat object).
    """
    # Remove empty cells and empty genes
    filtered = scprep.filter.filter_empty_cells(data)
    filtered = scprep.filter.filter_empty_genes(filtered)

    # Look up mitochondrial genes on the *filtered* matrix, so that a gene
    # dropped by the step above is never requested from it afterwards.
    mt_genes = scprep.select.get_gene_set(filtered, starts_with=["MT-", "mt-"])
    # With no mitochondrial genes left there is nothing to filter on; skip the
    # step rather than filtering on an empty gene set.
    if len(mt_genes) > 0:
        filtered = scprep.filter.filter_gene_set_expression(
            data=filtered, genes=mt_genes, percentile=100 - percent_mt
        )

    # Keep cells whose library size lies between the two bounds
    filtered = scprep.filter.filter_library_size(
        data=filtered, cutoff=(min_features, max_features), keep_cells='between'
    )
    # Drop rare genes
    filtered = scprep.filter.filter_rare_genes(data=filtered, cutoff=0, min_cells=3)

    return filtered

## Load 10X

In [3]:
def load_10X(dir_path, name='10X-project', percent_mt=20, max_features=5000, min_features=200):
    """Load a 10X output folder with scprep and run ``preprocess_10X`` on it.

    Parameters
    ----------
    dir_path : str
        Path to the 10X output directory, handed to ``scprep.io.load_10X``.
    name : str
        Forwarded unchanged to ``preprocess_10X``.
    percent_mt : number in [0, 100]
        Forwarded unchanged to ``preprocess_10X``.
    max_features, min_features : number
        Forwarded unchanged to ``preprocess_10X``.

    Returns
    -------
    Whatever ``preprocess_10X`` returns for the loaded matrix.
    """
    # sparse=True and gene_labels='both' are the loader options this notebook
    # uses; the displayed columns look like "Symbol (EnsemblID)".
    raw = scprep.io.load_10X(dir_path, sparse=True, gene_labels='both')
    return preprocess_10X(
        data=raw,
        name=name,
        percent_mt=percent_mt,
        max_features=max_features,
        min_features=min_features,
    )


## Reduce dimensions

In [9]:
def reduce_dim(data, ndims=15, res=.1):
    """Return the library-size-normalized version of ``data``.

    Despite its name, this function currently performs only the normalization
    step via ``scprep.normalize.library_size_normalize``; no PCA, UMAP or
    clustering is run here. A square-root transform
    (``scprep.transform.sqrt``) was left disabled by the author.

    Parameters
    ----------
    data : array-like
        Expression matrix to normalize.
    ndims : int
        Currently unused.
    res : float
        Currently unused.

    Returns
    -------
    The value returned by ``scprep.normalize.library_size_normalize(data)``.
    """
    return scprep.normalize.library_size_normalize(data)

# Main

## Load data and preprocess

In [5]:

# Relative paths to 10X "filtered_feature_bc_matrix" folders -- presumably one
# per sample (two EAE, one WT, judging by the folder names; confirm).
data_paths = [
    "../data/14_EAE_all_cells.MPS12344479-A02.sorted.1229.merged.10x_outputs/filtered_feature_bc_matrix",
    "../data/15_EAE_all_cells.MPS12344479-B02.sorted.1229.merged.10x_outputs/filtered_feature_bc_matrix",
    "../data/16_WT_all_cells.MPS12344479-H01.sorted.1229.merged.10x_outputs/filtered_feature_bc_matrix"
]

# Alternative input kept for reference (disabled):
# seurat_path = "../../tutorials/seurat_guidedClustering/seurat_clusteringTutorial/filtered_gene_bc_matrices/hg19"
# Only the first entry of data_paths is loaded; the keyword values repeat
# load_10X's own defaults.
data = load_10X(dir_path=data_paths[0], percent_mt=20, max_features=5000, min_features=200)
# Show the first rows of the loaded, filtered matrix.
data.head()


Unnamed: 0_level_0,Sox17 (ENSMUSG00000025902),Mrpl15 (ENSMUSG00000033845),Lypla1 (ENSMUSG00000025903),Tcea1 (ENSMUSG00000033813),Rgs20 (ENSMUSG00000002459),Atp6v1h (ENSMUSG00000033793),Rb1cc1 (ENSMUSG00000025907),4732440D04Rik (ENSMUSG00000090031),St18 (ENSMUSG00000033740),Pcmtd1 (ENSMUSG00000051285),...,Vamp7 (ENSMUSG00000051412),Tmlhe (ENSMUSG00000079834),AC133103.1 (ENSMUSG00000079190),Csprs (ENSMUSG00000062783),AC125149.3 (ENSMUSG00000079800),AC168977.2 (ENSMUSG00000094915),AC168977.1 (ENSMUSG00000079808),AC149090.1 (ENSMUSG00000095041),CAAA01118383.1 (ENSMUSG00000063897),CAAA01147332.1 (ENSMUSG00000095742)
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGGTATCGCAT-1,3.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
AAACGGGAGAACTGTA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
AAACGGGGTTCTCATT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
AAACGGGGTTCTGAAC-1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
AAAGATGCAATGACCT-1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


## Reduce dimensions

In [10]:
# Displays the matrix returned by reduce_dim (library-size normalized).
# NOTE(review): the result is not assigned, so later cells that use `data`
# presumably still see the un-normalized counts -- confirm that
# library_size_normalize does not modify its input in place.
reduce_dim(data)

Unnamed: 0_level_0,Sox17 (ENSMUSG00000025902),Mrpl15 (ENSMUSG00000033845),Lypla1 (ENSMUSG00000025903),Tcea1 (ENSMUSG00000033813),Rgs20 (ENSMUSG00000002459),Atp6v1h (ENSMUSG00000033793),Rb1cc1 (ENSMUSG00000025907),4732440D04Rik (ENSMUSG00000090031),St18 (ENSMUSG00000033740),Pcmtd1 (ENSMUSG00000051285),...,Vamp7 (ENSMUSG00000051412),Tmlhe (ENSMUSG00000079834),AC133103.1 (ENSMUSG00000079190),Csprs (ENSMUSG00000062783),AC125149.3 (ENSMUSG00000079800),AC168977.2 (ENSMUSG00000094915),AC168977.1 (ENSMUSG00000079808),AC149090.1 (ENSMUSG00000095041),CAAA01118383.1 (ENSMUSG00000063897),CAAA01147332.1 (ENSMUSG00000095742)
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGGTATCGCAT-1,12.195122,0.000000,0.000000,4.065041,0.0,0.000000,8.130081,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.065041
AAACGGGAGAACTGTA-1,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,8.665511,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,8.665511
AAACGGGGTTCTCATT-1,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,5.065856,2.532928,0.000000
AAACGGGGTTCTGAAC-1,0.000000,0.000000,0.000000,2.319109,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,2.319109,4.638219
AAAGATGCAATGACCT-1,0.000000,2.428363,2.428363,0.000000,0.0,2.428363,0.000000,0.0,0.0,2.428363,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.856727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAAGTACCT-1,0.000000,16.863406,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
TTTGTCAAGGGTTCCC-1,0.000000,0.000000,0.000000,0.000000,0.0,4.242681,4.242681,0.0,0.0,0.000000,...,2.121341,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,2.121341
TTTGTCACATCTGGTA-1,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,3.484321
TTTGTCATCAAGAAGT-1,0.000000,0.000000,0.000000,4.378284,0.0,4.378284,0.000000,0.0,0.0,4.378284,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000


## Display PCA

In [16]:
# PCA with n_pca=100 on `data`.
# NOTE(review): `data` is the output of load_10X; the normalized matrix shown
# by the reduce_dim cell was not stored, so this presumably runs on
# un-normalized counts -- confirm this is intended.
data_pcs = scprep.reduce.pca(data, n_pca=100)



In [27]:
# fig, ax = plt.subplots(1, figsize=(5,5))

# data_pcs is rendered below as a DataFrame (cell barcodes x PC1..PC100), so a
# column has to be taken by position with .iloc; NumPy-style `data_pcs[:, 1]`
# is not valid DataFrame indexing.
print(data_pcs.iloc[:, 1])
# # ax.scatter(data_pcs.iloc[:,0], data_pcs.iloc[:,1], s=1)
# ax.set_xlabel('PC1')
# ax.set_ylabel('PC2')
# ax.set_title('T-cell - PCA')

# fig.tight_layout()
data_pcs

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC91,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99,PC100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGGTATCGCAT-1,54.571623,-42.050212,-32.301797,-8.644527,-9.892896,-30.296499,27.293738,11.196447,-1.921014,-4.859825,...,-1.970145,1.412500,1.623350,2.270855,-1.279885,-2.242077,-1.760964,-2.598679,0.620645,0.659460
AAACGGGAGAACTGTA-1,60.688941,-41.787415,-44.238570,11.359011,-18.103137,-7.897464,-0.406050,14.440889,-7.447427,-3.455910,...,1.836934,2.640584,-2.626710,-0.226265,0.133087,0.207336,-1.388068,-0.375155,-3.534828,1.846835
AAACGGGGTTCTCATT-1,-111.617975,70.515353,-14.399128,22.207315,-16.877472,33.618490,28.469850,-61.627316,3.280834,-22.714868,...,3.891487,3.335549,1.188549,-3.550460,-0.393050,0.970080,-2.392282,2.661967,-6.872816,1.971505
AAACGGGGTTCTGAAC-1,-158.290800,96.990258,4.189925,-15.649616,16.238163,-21.703642,-5.346266,37.771566,-10.970094,-14.929869,...,-0.652627,-1.364883,-3.179916,7.652325,6.842832,-1.049886,-8.763531,5.403565,3.025563,-0.708290
AAAGATGCAATGACCT-1,27.725490,-40.993366,88.790572,3.635568,2.338297,7.368429,-5.346816,-17.606005,-5.496045,10.525775,...,3.683723,-3.214429,2.331204,-1.784724,1.686061,-7.920170,0.683620,-3.630835,0.234583,3.080840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAAGTACCT-1,36.451231,-20.190025,-58.967005,11.653974,-10.222626,6.424215,-40.496598,8.145069,-1.675757,-11.896460,...,1.923180,-0.474638,-0.794694,2.797560,1.476609,0.422986,-1.487143,2.157046,-0.137277,-2.570250
TTTGTCAAGGGTTCCC-1,22.509996,-41.278538,106.585167,-0.060550,5.755612,10.047605,2.010885,-20.044406,-1.348159,10.721293,...,6.994986,-7.362923,6.771368,-2.706019,0.863807,-1.701758,0.274522,-5.718739,-7.409088,-1.262381
TTTGTCACATCTGGTA-1,-50.972907,14.946033,59.869345,5.712163,1.866353,25.170768,3.959365,-13.918282,-6.279613,-61.993980,...,8.435286,-1.652817,8.119613,6.665620,0.869893,-0.876310,-3.922431,0.932367,3.622807,2.775007
TTTGTCATCAAGAAGT-1,38.880842,-40.162480,40.526938,10.680056,-3.285649,8.432925,-24.593881,-1.201882,-7.349881,10.223630,...,2.937499,1.113081,-0.236544,2.596381,0.998135,1.825668,-0.370620,-4.071295,-0.770253,-5.165614
