In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# repository and data paths used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
## Try first when creating the notebook

%cd /content/drive/MyDrive/repos
import os
repo_path = "/content/drive/MyDrive/repos/Epilepsy_Microglia"
if not os.path.exists(repo_path):
    !git clone https://github.com/Unsogoodday/Epilepsy_Microglia.git {repo_path}
%cd {repo_path}

In [None]:
## remote check
# Switch into the repository and list its configured remotes together with
# their fetch/push URLs.
%cd /content/drive/MyDrive/repos/Epilepsy_Microglia
!git remote -v

/content/drive/MyDrive/repos/Epilepsy_Microglia


In [None]:
## If there is no origin/fetch, run this
!git remote add origin "https://github.com/Unsogoodday/Epilepsy_Microglia.git"
!git push -u origin main

In [None]:
## config
# Set the commit author identity. No --global flag is passed, so the values
# are written to this repository's own git config.
%cd /content/drive/MyDrive/repos/Epilepsy_Microglia
!git config user.email "242shizume@gmail.com"
!git config user.name "Unsogoodday"

/content/drive/MyDrive/repos/Epilepsy_Microglia


In [None]:
## Save to github

# Sync with the remote main branch first, then stage everything in the working
# tree and commit it.
# NOTE(review): the commit message is hard-coded; edit it before each save.
%cd /content/drive/MyDrive/repos/Epilepsy_Microglia
!git pull origin main --allow-unrelated-histories
!git add .
!git commit -m "Create 2. preprocess"

/content/drive/MyDrive/repos/Epilepsy_Microglia
From https://github.com/Unsogoodday/Epilepsy_Microglia
 * branch            main       -> FETCH_HEAD
Already up to date.
[main 306592f] Create 2. preprocess
 1 file changed, 1 insertion(+)
 create mode 100644 notebooks/2. preprocess


In [None]:
# Publish the local main branch to origin.
# NOTE(review): relies on the working directory already being the repository
# (set by a %cd in an earlier cell).
!git push origin main

Enumerating objects: 6, done.
Counting objects:  16% (1/6)Counting objects:  33% (2/6)Counting objects:  50% (3/6)Counting objects:  66% (4/6)Counting objects:  83% (5/6)Counting objects: 100% (6/6)Counting objects: 100% (6/6), done.
Delta compression using up to 2 threads
Compressing objects:  25% (1/4)Compressing objects:  50% (2/4)Compressing objects:  75% (3/4)Compressing objects: 100% (4/4)Compressing objects: 100% (4/4), done.
Writing objects:  25% (1/4)Writing objects:  50% (2/4)Writing objects:  75% (3/4)Writing objects: 100% (4/4)Writing objects: 100% (4/4), 2.08 KiB | 212.00 KiB/s, done.
Total 4 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/Unsogoodday/Epilepsy_Microglia.git
   e009ad7..306592f  main -> main


In [None]:
# 1. install packages
!git pull origin main --allow-unrelated-histories
%cd /content/drive/MyDrive/repos/Epilepsy_Microglia
%pip install -q -r requirements.txt

In [None]:
 #2. Import Packages

import torch

# Single Cell Libraries
import scvi
import scanpy as sc
import anndata as ad

# Data Processing and Plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import igraph
import leidenalg

# File grab
import os
import tempfile
import pooch
import shutil, subprocess, glob
import gzip

# Report the torch version, the scvi version, and the result of
# torch.cuda.is_available(), one value per line.
print(torch.__version__, scvi.__version__, torch.cuda.is_available(), sep="\n")

**2. Preprocess**

0. Check var_names, make names unique
1. QC


In [None]:
# Read the kumar_GSE201048_raw.h5ad file from Drive into `adata`.
KUMAR = "/content/drive/MyDrive/datas/epilepsy_microglia/raw/GSE201048_raw/kumar_GSE201048_raw.h5ad"
adata = sc.read_h5ad(KUMAR)
# Bare expression: display the object's summary as the cell output.
adata

In [None]:
# Directory for saved figures. Creating it alone has no effect on scanpy, which
# writes `save=` output to sc.settings.figdir (./figures/ by default, i.e.
# inside the current working directory), so point figdir at it as well.
tempdir = "/content/plots/"
os.makedirs(tempdir, exist_ok=True)
sc.settings.figdir = tempdir

In [None]:
# 0-1. Check var names (Ensembl IDs vs gene symbols)
# Gene identifiers live in var_names; obs_names holds the per-cell labels and
# says nothing about the gene naming scheme.
print(adata.var_names[:10])

In [None]:
# 0-2. make names unique
# De-duplicate var_names on the object itself (no reassignment), then display
# the object summary as the cell output.
adata.var_names_make_unique()
adata

In [None]:
# 1-1. QC
# Boolean gene flags built from var_names: names starting with "MT-" ("mt"),
# names starting with "RPS"/"RPL" ("ribo"), and names starting with "HB" whose
# third character is not "(", "P" or ")" ("hb").
qc_masks = {
    "mt": adata.var_names.str.startswith("MT-"),
    "ribo": adata.var_names.str.startswith(("RPS", "RPL")),
    "hb": adata.var_names.str.contains("^HB[^(P)]"),
}
for qc_name, qc_mask in qc_masks.items():
    adata.var[qc_name] = qc_mask

# Compute QC metrics on adata in place, using the three flags as qc_vars.
sc.pp.calculate_qc_metrics(adata, qc_vars=list(qc_masks), log1p=True, inplace=True)

# Violin plots of the three headline QC metrics, one panel each.
violin_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(adata, violin_keys, jitter=0.4, multi_panel=True, save="qc.png")

# Total counts against detected genes, colored by pct_counts_mt.
sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
    save="qc.png",
)


In [None]:
# 1-2. Filter, normalize, and select highly variable genes
# Drop cells with fewer than 100 detected genes, then genes seen in fewer than 3 cells.
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_genes(adata, min_cells=3)

# Keep a copy of X as it is after filtering and before normalization.
adata.layers["counts"] = adata.X.copy()

# Normalize each cell to a total of 1e4, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Stash the current state in .raw before the gene subset below.
adata.raw = adata

# HVG selection reads the "counts" layer (not the log-normalized X) and, with
# subset=True, keeps only the selected genes in adata.
hvg_options = dict(
    n_top_genes=1200,
    subset=True,
    layer="counts",
    flavor="seurat_v3",
)
sc.pp.highly_variable_genes(adata, **hvg_options)

adata