# Using CellTypist for cell type classification
This notebook showcases the cell type classification for scRNA-seq query data by retrieving the most likely cell type labels from the built-in CellTypist models.

In [11]:
import celltypist
from celltypist import models
import pandas as pd

In [12]:
input_matrix="./dataset/peripheal-blood/matrix.mtx"
gene_id_file="./dataset/peripheal-blood/gene_ids.csv"
cell_id_file="./dataset/peripheal-blood/barcodes.tsv"

In [13]:
# Enabling `force_update = True` will overwrite existing (old) models.
models.download_models(force_update = False)

📂 Storing models in /Users/irenetesta/.celltypist/data/models
⏩ Skipping [1/44]: Immune_All_Low.pkl (file exists)
⏩ Skipping [2/44]: Immune_All_High.pkl (file exists)
⏩ Skipping [3/44]: Adult_CynomolgusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [4/44]: Adult_Human_PancreaticIslet.pkl (file exists)
⏩ Skipping [5/44]: Adult_Human_Skin.pkl (file exists)
⏩ Skipping [6/44]: Adult_Mouse_Gut.pkl (file exists)
⏩ Skipping [7/44]: Adult_Mouse_OlfactoryBulb.pkl (file exists)
⏩ Skipping [8/44]: Adult_Pig_Hippocampus.pkl (file exists)
⏩ Skipping [9/44]: Adult_RhesusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [10/44]: Autopsy_COVID19_Lung.pkl (file exists)
⏩ Skipping [11/44]: COVID19_HumanChallenge_Blood.pkl (file exists)
⏩ Skipping [12/44]: COVID19_Immune_Landscape.pkl (file exists)
⏩ Skipping [13/44]: Cells_Fetal_Lung.pkl (file exists)
⏩ Skipping [14/44]: Cells_Intestinal_Tract.pkl (file exists)
⏩ Skipping [15/44]: Cells_Lung_Airway.pkl (file exists)
⏩ Skipping [16/44]: Developing_Human

In [14]:
genes = pd.read_csv('./dataset/peripheal-blood/genes.tsv', sep='\t', header=None)
gene_ids = genes[0]
gene_ids.to_csv('./dataset/peripheal-blood/gene_ids.csv', index=False, header=False)

In [15]:
predictions = celltypist.annotate(input_matrix, model = 'Immune_All_Low.pkl', transpose_input = True, majority_voting = True, mode = 'best match', gene_file=gene_id_file, cell_file=cell_id_file)
predictions.to_table(folder = './dataset/peripheal-blood/', prefix = "PBMC_raw_Immune_All_Low_")

📁 Input file is './dataset/peripheal-blood/matrix.mtx'
⏳ Loading data
🔬 Input data has 5527 cells and 33570 genes
🔗 Matching reference genes in the model
🧬 11 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!


In [16]:
label_df= pd.read_csv('./dataset/peripheal-blood/PBMC_raw_Immune_All_Low_predicted_labels.csv')
label_df.head()

Unnamed: 0.1,Unnamed: 0,predicted_labels,over_clustering,majority_voting
0,AAACCCACAGGCTTGC-1,Double-positive thymocytes,24,Double-positive thymocytes
1,AAACCCAGTAGTTAGA-1,Double-positive thymocytes,46,Double-positive thymocytes
2,AAACGAAGTAACGATA-1,Double-positive thymocytes,34,Double-positive thymocytes
3,AAACGAAGTGGATCAG-1,Double-positive thymocytes,39,Double-positive thymocytes
4,AAACGAATCATGAGAA-1,Double-positive thymocytes,107,Double-positive thymocytes


In [18]:
label_df['predicted_labels'].value_counts()

Double-positive thymocytes    4783
Regulatory T cells             237
CD8a/a                         171
Tcm/Naive helper T cells       171
Epithelial cells               161
CD16+ NK cells                   4
Name: predicted_labels, dtype: int64