In [1]:
# Create the dataset folder (no error if it already exists) and make it the
# working directory, so the relative file names used by later cells resolve
# inside it.
# NOTE(review): the path lives under /content/drive — presumably Google Drive
# must already be mounted before this cell runs; confirm.
%mkdir -p /content/drive/MyDrive/GSE132080
%cd /content/drive/MyDrive/GSE132080

/content/drive/MyDrive/GSE132080


In [2]:
%%bash
# Download the GSE132080 supplementary files from NCBI GEO into the current
# working directory.
#
# Abort on the first failing command, on unset variables, and on failures
# inside pipelines, so a broken download fails the cell instead of being
# reported as "Downloaded".
set -euo pipefail

urls=(
"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE132080&format=file&file=GSE132080%5F10X%5Fbarcodes%2Etsv%2Egz"
"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE132080&format=file&file=GSE132080%5F10X%5Fgenes%2Etsv%2Egz"
"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE132080&format=file&file=GSE132080%5F10X%5Fmatrix%2Emtx%2Egz"
"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE132080&format=file&file=GSE132080%5Fcell%5Fidentities%2Ecsv%2Egz"
"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE132080&format=file&file=GSE132080%5FsgRNA%5Fbarcode%5Fsequences%5Fand%5Fphenotypes%2Ecsv%2Egz"
)

for url in "${urls[@]}"
do
   # Take everything after "file=" and decode the two percent-escapes that
   # occur in these URLs (%5F -> "_", %2E -> ".").
   filename=$(printf '%s\n' "$url" | sed -n 's/.*file=\(.*\)/\1/p' | sed -e 's/%5F/_/g' -e 's/%2E/./g')

   # An empty name means the URL had no "file=" parameter; stop rather than
   # hand curl an empty output path.
   if [ -z "$filename" ]; then
      echo "Could not derive a file name from: $url" >&2
      exit 1
   fi

   # --fail: exit non-zero on HTTP errors instead of saving the error page.
   # --location: follow redirects.
   # --silent --show-error: hide the progress meter but keep error messages.
   # The transfer goes to a temporary ".part" file that is renamed only after
   # curl succeeds, so a truncated download never sits under the final name.
   curl --fail --location --silent --show-error --retry 3 -o "$filename.part" "$url"
   mv -- "$filename.part" "$filename"

   echo "Downloaded: $filename"
done


Downloaded: GSE132080_10X_barcodes.tsv.gz
Downloaded: GSE132080_10X_genes.tsv.gz
Downloaded: GSE132080_10X_matrix.mtx.gz
Downloaded: GSE132080_cell_identities.csv.gz
Downloaded: GSE132080_sgRNA_barcode_sequences_and_phenotypes.csv.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 94141  100 94141    0     0   251k      0 --:--:-- --:--:-- --:--:--  251k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  258k  100  258k    0     0   821k      0 --:--:-- --:--:-- --:--:--  823k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0  335M    0  104k    0     0   349k      0  0:16:24 --:--:--  0:16:24  348k 14  335M   14 48.0M    0     0  36.6M      0  0:

In [3]:
%pip install scanpy python-dotenv -q

import os
import tarfile
import json
import gzip

import scanpy as sc
import anndata as ad
import pandas as pd

from dotenv import load_dotenv
# Load shared settings from the env file into os.environ.
load_dotenv(dotenv_path='/content/drive/MyDrive/.gse.env')

# Fail fast with a readable message when a setting is absent; otherwise
# json.loads(None) raises an opaque TypeError about NoneType.
_unset_settings = [
    name for name in ('OBS_COLS', 'ORGANISMS') if os.environ.get(name) is None
]
if _unset_settings:
    raise RuntimeError(
        "Missing environment variable(s) "
        f"{_unset_settings}; check that /content/drive/MyDrive/.gse.env "
        "exists and defines them as JSON values."
    )

# COLS: JSON-encoded collection of obs column names checked by the validation cell.
# ORGANISMS: JSON-encoded list of accepted organism names; its first entry is
# used as this dataset's organism.
COLS = json.loads(os.environ['OBS_COLS'])
ORGANISMS = json.loads(os.environ['ORGANISMS'])

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.4/124.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for session-info (setup.py) ... [?25l[?25hdone


In [7]:
# Read the MatrixMarket count matrix and transpose it, so that obs lines up
# with the barcode list and var with the gene list assigned below.
adata = sc.read_mtx('GSE132080_10X_matrix.mtx.gz').T

# Cell barcodes: first (only used) column of the header-less barcodes file.
barcodes = pd.read_csv('GSE132080_10X_barcodes.tsv.gz', sep='\t', header=None)
adata.obs.index = pd.Index(barcodes[0], name='cell_barcode')

# Gene table: the second column becomes the index (gene_name), and the
# remaining column is labelled gene_id.
genes = (
    pd.read_csv('GSE132080_10X_genes.tsv.gz', sep='\t', header=None, index_col=1)
    .rename_axis('gene_name')
    .set_axis(['gene_id'], axis=1)
)
adata.var = genes

In [12]:
# Load the two annotation tables; each uses its first CSV column as the index.
cell_identities, sgRNA_infos = (
    pd.read_csv(path, index_col=0)
    for path in (
        'GSE132080_cell_identities.csv.gz',
        'GSE132080_sgRNA_barcode_sequences_and_phenotypes.csv.gz',
    )
)

In [10]:
# Preview the first rows of the table read from GSE132080_cell_identities.csv.gz.
cell_identities.head()

Unnamed: 0_level_0,guide_identity,read_count,UMI_count,coverage,gemgroup,good_coverage,number_of_cells
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GGACAAGTCCCTGACT-3,neg_ctrl_non-targeting_00028,7452,457,16.306346,3,True,1
CGACTTCAGAAGGCCT-3,GNB2L1_GNB2L1_+_180670873.23-P1P2_13,6554,361,18.155125,3,True,1
TTAGGCAAGAAGGCCT-2,TUBB_TUBB_+_30688126.23-P1_00,4177,165,25.315152,2,True,2
CGTAGGCAGCCAGGAT-1,TUBB_TUBB_+_30688126.23-P1_01,4024,218,18.458716,1,True,1
GCGCAACTCACGATGT-2,HSPE1_HSPE1_+_198365089.23-P1P2_00,3923,134,29.276119,2,True,1


In [13]:
# Display the table read from GSE132080_sgRNA_barcode_sequences_and_phenotypes.csv.gz.
sgRNA_infos

Unnamed: 0_level_0,sequence,gene,gamma_day5,gamma_day10,relative_activity_day5,relative_activity_day10
sgRNA_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ALDOA_+_30077139.23-P1P2_00,GGTCACCAGGACCCCTTCTG,ALDOA,-0.412746,-0.366469,1.000000,1.000000
ALDOA_+_30077139.23-P1P2_06,GGTCACCAGGATCCCTTCTG,ALDOA,-0.396687,-0.348503,0.961091,0.950977
ALDOA_+_30077139.23-P1P2_07,GGTCACCAGGCCCCCTTCTG,ALDOA,-0.360892,-0.335059,0.874369,0.914291
ALDOA_+_30077139.23-P1P2_13,GGTCACCAGGACCCCTTTTG,ALDOA,0.017063,-0.000220,-0.041340,0.000601
ALDOA_+_30077139.23-P1P2_14,GGTCACCAGGACCGCTTCTG,ALDOA,-0.175243,-0.156611,0.424579,0.427353
...,...,...,...,...,...,...
TUBB_+_30688126.23-P1_00,GCGGCAGGAAGGTTCTGAGA,TUBB,-0.897047,-0.699905,1.000000,1.000000
TUBB_+_30688126.23-P1_01,GCAGCAGGAAGGTTCTGAGA,TUBB,-0.925988,-0.611949,1.032262,0.874332
TUBB_+_30688126.23-P1_03,GCGGCAGCAAGGTTCTGAGA,TUBB,-0.692569,-0.495873,0.772054,0.708486
TUBB_+_30688126.23-P1_06,GCGGCAGGACGGTTCTGAGA,TUBB,-0.730802,-0.554446,0.814676,0.792174


In [26]:
import numpy as np

def _perturbation_label(guide):
    """Return 'non-targeting' for neg_ctrl guides, else the text before the first underscore."""
    if guide.startswith('neg_ctrl'):
        return 'non-targeting'
    return guide.split('_')[0]


def _condition_label(perturbation):
    """Collapse a perturbation label to 'targeting' or 'non-targeting'."""
    if perturbation == 'non-targeting':
        return 'non-targeting'
    return 'targeting'


# Derive the per-cell perturbation label from the guide identity, then the
# coarse targeting / non-targeting condition from that label.
cell_identities['perturbation_name'] = cell_identities['guide_identity'].map(_perturbation_label)
cell_identities['condition'] = cell_identities['perturbation_name'].map(_condition_label)

# Keep only barcodes present in both the expression matrix and the identity
# table (np.intersect1d returns them sorted and de-duplicated), then attach
# the matching identity rows as the obs table.
mask = np.intersect1d(adata.obs_names, cell_identities.index)
adata = adata[mask]
adata.obs = cell_identities.loc[mask]

  utils.warn_names_duplicates("var")


In [27]:
# Preview the first rows of the obs table currently attached to adata.
adata.obs.head()

Unnamed: 0_level_0,guide_identity,read_count,UMI_count,coverage,gemgroup,good_coverage,number_of_cells,perturbation_name,condition
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAACCTGAGAGCAATT-2,RAN_RAN_+_131356438.23-P1P2_02,707,21,33.666667,2,True,1,RAN,targeting
AAACCTGAGAGTAATC-1,RAN_RAN_+_131356438.23-P1P2_12,544,34,16.0,1,True,1,RAN,targeting
AAACCTGAGCTAAGAT-2,CDC23_CDC23_-_137548987.23-P1P2_08,204,13,15.692308,2,True,1,CDC23,targeting
AAACCTGAGGGATCTG-1,neg_ctrl_non-targeting_00089,267,19,14.052632,1,True,1,non-targeting,non-targeting
AAACCTGAGGTCATCT-1,POLR2H_POLR2H_+_184081251.23-P1P2_08,622,34,18.294118,1,True,1,POLR2H,targeting


In [28]:
# Dataset-wide annotations: every cell receives the same value in each column.
# The organism is the first entry of the configured ORGANISMS list.
constant_annotations = {
    'crispr_type': 'CRISPRi',
    'cancer_type': 'Leukemia',
    'cell_type': 'leukemia cell',
    'organism': ORGANISMS[0],
}
for column, value in constant_annotations.items():
    adata.obs[column] = value

In [29]:
# Report which of the configured COLS entries are absent from the obs table.
obs_columns = adata.obs.columns
missing = list(filter(lambda col: col not in obs_columns, COLS))
print("Missing following columns: \n " + str(missing))

# When an organism column exists, every value must be one of ORGANISMS.
if 'organism' in obs_columns:
    organisms_valid = adata.obs['organism'].isin(ORGANISMS).all()
    assert organisms_valid, "Invalid organism naming"

Missing following columns: 
 []


In [None]:
# Save the annotated AnnData object as an .h5ad file; the path is relative,
# so it is written to the current working directory.
adata.write_h5ad('GSE132080.h5ad')