# Pancreatitis scRNA-seq Data - Reanalysis

In [2]:
import scanpy as sc
import scvi

In [10]:
# Read the GEO count table and transpose it; after `.T` the second axis of
# `adata` is the one treated as genes further down (`adata.shape[1]`).
counts_path = 'rawdata/GSE181276_genes.counts_for_GEO_uploading.txt'
adata = sc.read_text(counts_path).T

In [34]:
# Display the current state of `adata` as the cell output
adata

Unnamed: 0,n_cells,highly_variable,highly_variable_rank,means,variances,variances_norm
Sox17,995,True,952.0,0.070158,0.338813,0.778727
St18,471,True,1379.0,0.034263,0.126855,0.576267
Sbspon,1810,True,817.0,0.138208,0.707650,0.892606
Rdh10,9712,True,1294.0,0.988005,6.635307,0.605559
Gdap1,449,True,1776.0,0.027256,0.081145,0.464792
...,...,...,...,...,...,...
Adra2a,646,True,855.0,0.042962,0.234823,0.859430
Habp2,1600,True,1820.0,0.081322,0.224414,0.454574
Pnlip,25481,True,742.0,104.450907,234857.664343,0.966904
Pnliprp1,23241,True,720.0,45.855883,37249.587246,1.003806


In [16]:
# Dimensions of the count matrix; the second entry is what the next cells
# use as the gene count (`adata.shape[1]`).
adata.X.shape

(33681, 31053)

In [18]:
# Number of genes (variables of `adata`) before any filtering
Total_gene_number = adata.n_vars

In [19]:
# Drop genes that are detected in fewer than 10 cells (modifies `adata` in place)
min_cells_per_gene = 10
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)

In [21]:
# Number of genes (variables of `adata`) that survived the min-cells filter
Gene_Number_after_filtering = adata.n_vars

In [24]:
# Percentage of genes removed by the filter, rounded to two decimals
genes_removed = Total_gene_number - Gene_Number_after_filtering
Loss_of_genes = round(genes_removed / Total_gene_number * 100, 2)
Loss_of_genes

43.56

In [28]:
# Select the 2000 most variable genes with the 'seurat_v3' flavour;
# `subset=True` restricts `adata` itself to those genes.
sc.pp.highly_variable_genes(
    adata,
    flavor='seurat_v3',
    n_top_genes=2000,
    subset=True,
)

In [29]:
# Display `adata` again to inspect its dimensions and annotations after gene selection
adata

AnnData object with n_obs × n_vars = 33681 × 2000
    var: 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg'

In [30]:
# Train the SCVI model.
# Training is stochastic, so pin the scvi-tools global seed first; without it
# the fitted model (and every result derived from it) differs between reruns
# of the notebook.
scvi.settings.seed = 0

# Register `adata` with scvi-tools, build the model on it and fit with the
# library's default training settings.
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata)
vae.train()

  accelerator, lightning_devices, device = parse_device_args(
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/thorsten/.pyenv/versions/3.10.6/envs/SingleCell/lib/python3.10/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/thorsten/.pyenv/versions/3.10.6/envs/SingleCell/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training:   0%|          | 0/238 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=238` reached.


In [31]:
# Train the SOLO doublet classifier, built from the trained SCVI model `vae`.
# Training may end before the maximum number of epochs: the log below reports
# an early stop once `validation_loss` no longer improves.
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()

[34mINFO    [0m Creating doublets, preparing SOLO model.                                                                  


  accelerator, lightning_devices, device = parse_device_args(
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/thorsten/.pyenv/versions/3.10.6/envs/SingleCell/lib/python3.10/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/thorsten/.pyenv/versions/3.10.6/envs/SingleCell/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/thorsten/.pyenv/versions/3.10.6/envs/SingleCell/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` a

Training:   0%|          | 0/400 [00:00<?, ?it/s]

Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.209. Signaling Trainer to stop.


In [32]:
# Annotate every cell barcode with the trained classifier: the default call
# gives the per-class scores, `soft=False` gives the hard label, which is
# stored as a string ("doublet" / "singlet") in the `prediction` column.
df = solo.predict().assign(prediction=solo.predict(soft=False))
df

  return func(*args, **kwargs)
  return func(*args, **kwargs)


Unnamed: 0,doublet,singlet,prediction
WT_AAACCCAAGCATCTTG,0.006114,0.993886,singlet
WT_AAACCCAAGGGTTGCA,0.120150,0.879850,singlet
WT_AAACCCAGTCCGAAAG,0.188696,0.811304,singlet
WT_AAACCCAGTCCGTTTC,0.097345,0.902655,singlet
WT_AAACCCAGTGTGGTCC,0.003006,0.996994,singlet
...,...,...,...
D7_analysis_TTTGTTGGTGAGCAGT,0.027798,0.972202,singlet
D7_analysis_TTTGTTGGTGCCTGAC,0.266351,0.733649,singlet
D7_analysis_TTTGTTGGTTGAGAGC,0.101135,0.898865,singlet
D7_analysis_TTTGTTGTCCTTATAC,0.057880,0.942120,singlet


In [52]:
# Count how many cells are predicted as singlet or doublet.
# `counts` is kept as the per-label table; the summary numbers below are
# derived directly from `df` so the cell also works when no cell is labelled
# 'doublet' (indexing `counts` with .loc['doublet'] would raise a KeyError).
counts = df.groupby('prediction').count()

# Total number of cells (one row of `df` per cell barcode)
total = len(df)

# Number of predicted doublets
num_doublets = int((df['prediction'] == 'doublet').sum())

# Percentage of doublets (0.0 for an empty table instead of dividing by zero)
percent_doublets = (num_doublets / total) * 100 if total else 0.0

print(f"Doublets: {num_doublets} / {total} ({percent_doublets:.2f}%)")

Doublets: 3811 / 33681 (11.31%)


In [36]:
# Difference between doublet and singlet score per cell; values close to 0
# mark ambiguous cells whose two scores are nearly equal.
df['dif'] = df['doublet'].sub(df['singlet'])
df

Unnamed: 0,doublet,singlet,prediction,dif
WT_AAACCCAAGCATCTTG,0.006114,0.993886,singlet,-0.987771
WT_AAACCCAAGGGTTGCA,0.120150,0.879850,singlet,-0.759700
WT_AAACCCAGTCCGAAAG,0.188696,0.811304,singlet,-0.622608
WT_AAACCCAGTCCGTTTC,0.097345,0.902655,singlet,-0.805311
WT_AAACCCAGTGTGGTCC,0.003006,0.996994,singlet,-0.993988
...,...,...,...,...
D7_analysis_TTTGTTGGTGAGCAGT,0.027798,0.972202,singlet,-0.944404
D7_analysis_TTTGTTGGTGCCTGAC,0.266351,0.733649,singlet,-0.467298
D7_analysis_TTTGTTGGTTGAGAGC,0.101135,0.898865,singlet,-0.797729
D7_analysis_TTTGTTGTCCTTATAC,0.057880,0.942120,singlet,-0.884240


In [None]:
# Count how many cells are predicted as singlet or doublet.
# `counts` is kept as the per-label table; the summary numbers are taken
# straight from `df`, so a run without any 'doublet' label reports 0 instead
# of failing on counts.loc['doublet'] with a KeyError.
counts = df.groupby('prediction').count()

# Total number of cells (one row of `df` per cell barcode)
total = len(df)

# Number of predicted doublets
num_doublets = int((df['prediction'] == 'doublet').sum())

# Percentage of doublets (0.0 for an empty table instead of dividing by zero)
percent_doublets = (num_doublets / total) * 100 if total else 0.0

print(f"Doublets: {num_doublets} / {total} ({percent_doublets:.2f}%)")