In [1]:
%mkdir -p /content/drive/MyDrive/GSE133344
%cd /content/drive/MyDrive/GSE133344

/content/drive/MyDrive/GSE133344


In [None]:
%%bash

#!/bin/bash

declare -A files=(
  ["https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344%5Ffiltered%5Fbarcodes.tsv.gz"]="GSE133344_filtered_barcodes.tsv.gz"
  ["https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344%5Ffiltered%5Fcell%5Fidentities.csv.gz"]="GSE133344_filtered_cell_identities.csv.gz"
  ["https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344%5Ffiltered%5Fgenes.tsv.gz"]="GSE133344_filtered_genes.tsv.gz"
  ["https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344%5Ffiltered%5Fmatrix.mtx.gz"]="GSE133344_filtered_matrix.mtx.gz"
)

for url in "${!files[@]}"; do
  wget -O "${files[$url]}" "$url"
done


Output hidden; open in https://colab.research.google.com to view.

In [None]:
%ls

GSE133344_filtered_barcodes.tsv.gz         GSE133344_filtered_genes.tsv.gz   GSE133344.ipynb
GSE133344_filtered_cell_identities.csv.gz  GSE133344_filtered_matrix.mtx.gz


In [None]:
!zcat GSE133344_filtered_genes.tsv.gz | head

ENSG00000243485	RP11-34P13.3
ENSG00000237613	FAM138A
ENSG00000186092	OR4F5
ENSG00000238009	RP11-34P13.7
ENSG00000239945	RP11-34P13.8
ENSG00000239906	RP11-34P13.14
ENSG00000241599	RP11-34P13.9
ENSG00000279928	FO538757.3
ENSG00000279457	FO538757.2
ENSG00000228463	AP006222.2


In [None]:
!zcat GSE133344_filtered_cell_identities.csv.gz | head

cell_barcode,guide_identity,read_count,UMI_count,coverage,gemgroup,good_coverage,number_of_cells
TTGAACGAGACTCGGA-2,ARID1A_NegCtrl0__ARID1A_NegCtrl0,28684,1809,15.856274184632394,2,True,1
CGTTGGGGTGTTTGTG-7,BCORL1_NegCtrl0__BCORL1_NegCtrl0,18367,896,20.498883928571427,7,True,1
GAACCTAAGTGTTAGA-6,FOSB_NegCtrl0__FOSB_NegCtrl0,16296,664,24.542168674698797,6,True,1
CCTTCCCTCCGTCATC-4,SET_KLF1__SET_KLF1,16262,850,19.131764705882354,4,True,1
TCAATCTGTCTTTCAT-2,OSR2_NegCtrl0__OSR2_NegCtrl0,16057,1067,15.048734770384256,2,True,2
TCCCGATGTCTCTTAT-8,KLF1_BAK1__KLF1_BAK1,15695,750,20.926666666666666,8,True,1
AAACCTGTCCAGAAGG-2,FOXA3_FOXL2__FOXA3_FOXL2,15145,950,15.942105263157895,2,True,1
CTGCCTAGTTCCACAA-4,TP73_NegCtrl0__TP73_NegCtrl0,14827,713,20.79523141654979,4,True,2
GAACCTATCCAGAAGG-3,HES7_NegCtrl0__HES7_NegCtrl0,14237,745,19.110067114093958,3,True,1


In [None]:
!cp GSE133344_filtered_genes.tsv.gz GSE133344_filtered_features.tsv.gz

In [2]:
%pip install scanpy python-dotenv -q

import os
import tarfile
import json
import gzip

import scanpy as sc
import anndata as ad
import pandas as pd

from dotenv import load_dotenv
load_dotenv(dotenv_path='/content/drive/MyDrive/.gse.env')
COLS = json.loads(os.environ.get('OBS_COLS'))
ORGANISMS = json.loads(os.environ.get('ORGANISMS'))

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.4/124.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for session-info (setup.py) ... [?25l[?25hdone


In [None]:
adata = sc.read_mtx('GSE133344_filtered_matrix.mtx.gz').T
adata.obs.index = pd.read_csv('GSE133344_filtered_barcodes.tsv.gz', sep='\t', header=None)[0]
adata.obs.index.name = 'cell_barcode'
genes = pd.read_csv('GSE133344_filtered_features.tsv.gz', sep='\t', index_col=1, header=None)
genes.index.name = 'gene_name'
genes.columns = ['gene_id']
adata.var = genes

obs = pd.read_csv('GSE133344_filtered_cell_identities.csv.gz', index_col=0)
adata = adata[obs.index]
adata.obs = obs
adata.obs.head()

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Unnamed: 0_level_0,guide_identity,read_count,UMI_count,coverage,gemgroup,good_coverage,number_of_cells
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TTGAACGAGACTCGGA-2,ARID1A_NegCtrl0__ARID1A_NegCtrl0,28684,1809,15.856274,2,True,1
CGTTGGGGTGTTTGTG-7,BCORL1_NegCtrl0__BCORL1_NegCtrl0,18367,896,20.498884,7,True,1
GAACCTAAGTGTTAGA-6,FOSB_NegCtrl0__FOSB_NegCtrl0,16296,664,24.542169,6,True,1
CCTTCCCTCCGTCATC-4,SET_KLF1__SET_KLF1,16262,850,19.131765,4,True,1
TCAATCTGTCTTTCAT-2,OSR2_NegCtrl0__OSR2_NegCtrl0,16057,1067,15.048735,2,True,2


In [8]:
adata.obs['perturbation_name'] = adata.obs['guide_identity'].str.split('__').str[0].str.split('_').str[0]
adata.obs['perturbation_name'] = adata.obs['perturbation_name'].apply(lambda x: "+".join(x.split('_')))
adata.obs['guides'] = adata.obs['guide_identity'].str.split('__').str[1]

In [9]:
adata.obs.head()

Unnamed: 0_level_0,guide_identity,read_count,UMI_count,coverage,gemgroup,good_coverage,number_of_cells,perturbation_name,guides
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TTGAACGAGACTCGGA-2,ARID1A_NegCtrl0__ARID1A_NegCtrl0,28684,1809,15.856274,2,True,1,ARID1A,ARID1A_NegCtrl0
CGTTGGGGTGTTTGTG-7,BCORL1_NegCtrl0__BCORL1_NegCtrl0,18367,896,20.498884,7,True,1,BCORL1,BCORL1_NegCtrl0
GAACCTAAGTGTTAGA-6,FOSB_NegCtrl0__FOSB_NegCtrl0,16296,664,24.542169,6,True,1,FOSB,FOSB_NegCtrl0
CCTTCCCTCCGTCATC-4,SET_KLF1__SET_KLF1,16262,850,19.131765,4,True,1,SET,SET_KLF1
TCAATCTGTCTTTCAT-2,OSR2_NegCtrl0__OSR2_NegCtrl0,16057,1067,15.048735,2,True,2,OSR2,OSR2_NegCtrl0


In [11]:
adata.obs['condition'] = adata.obs['perturbation_name'].apply(
    lambda x: 'non-targeting' if x.startswith('NegCtrl') and len(x.split('+')) == 1 else 'targeting'
)
adata.obs['condition'].value_counts()

Unnamed: 0_level_0,count
condition,Unnamed: 1_level_1
targeting,82172
non-targeting,29273


In [12]:
adata.obs['crispr_type'] = 'CRISPRi'
adata.obs['cancer_type'] = 'Leukemiaa'
adata.obs['cell_type'] = 'K562 chronic myelogenous leukemia cell'
adata.obs['organism'] = ORGANISMS[0]

In [13]:
missing = [col for col in COLS if col not in adata.obs]
print(f"Missing following columns: \n {missing}")

if 'organism' in adata.obs_keys():
    assert adata.obs['organism'].isin(ORGANISMS).all(), "Invalid organism naming"

Missing following columns: 
 []


In [14]:
adata.write_h5ad('GSE133344_processed.h5ad')