In [9]:
import os
from parse_soft import ParseSoft
from search_soft import SearchSoft
from enrich_soft import EnrichSoft
from label_data import LabelData

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# step 1: set defaults
geo = "GSE137811"
soft_dir = '/home/yuan/data'
meta_dir = '../data'
download_dir = '/home/yuan/rawdata/SRR'

In [11]:
# step 2: parse soft, and enrich

# analyze soft file
parser = ParseSoft(soft_dir)
soft_path = parser.soft_local_path(geo)
print('Retrieve meta data from soft file: ', soft_path)
softer = SearchSoft(soft_path)
data = softer.filter_data(softer.parse_rows)

# save to json
LabelData(meta_dir).save(data)

Retrieve meta data from soft file:  /home/yuan/data/ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137811/soft/GSE137811_family.soft.gz
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE137nnn/GSE137811/metadata.json


In [12]:
# step 3a: enrich data
data = LabelData(meta_dir).get_meta(geo)
enriched = EnrichSoft(data)()
LabelData(meta_dir).save(enriched)

Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE137nnn/GSE137811/metadata.json


In [13]:
# step 3b: label samples manually
from label_sample import LabelSample

def collator(sample):
    c = LabelSample(sample)
    c._protocol('scrna-seq')
    c._update('cell_type', 'induced pluripotent stem cells')
    
data = LabelData(meta_dir).get_meta(geo)
for sample_id, sample in data['samples'].items():
    collator(sample)
LabelData(meta_dir).save(data)

Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE137nnn/GSE137811/metadata.json


In [14]:
for s in data['samples'].values():
    print(s['labels'])

{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the lung primordial progenitor stage'}
{'tissue': 'ipsc derived cells sorted on nkx2-1^gfp at the l

In [None]:
# stop here

In [28]:
# step 4a: load some data
from utils import Utils

# data are determined previously
samn_srr = Utils.from_json('../results/samn_srr.json')
print(len(samn_srr))

47


In [29]:
# step 4b: Given BioSample, parse SRR accessions into samples
from parse_sra import ParseSra
from label_data import LabelData

data = LabelData(meta_dir).get_meta(geo)
enriched = ParseSra.parse_srr(data, samn_srr)
LabelData(meta_dir).save(enriched)

SRR=2725, samples=1090.
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE112nnn/GSE112274/metadata.json


In [30]:
# step 5: merge srr_urls into data

# sample.<sample_id>.SRR.<SRR accession>."ftp.sra.ebi.ac.uk"
file_name = 'srr_fastq_urls_simple.json'
urls_json = os.path.join('../results', file_name)
urls = Utils.from_json(urls_json)

# parse urls
data = LabelData(meta_dir).get_meta(geo)
enriched = ParseSra.parse_srr_urls(data, urls)
LabelData(meta_dir).save(enriched)

{'srr': 1090, 'available': 2725, 'unknown': 0, 'updated': 2725}
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE112nnn/GSE112274/metadata.json


In [31]:
# step 6a: download raw data using biosample accession using nextflow
from create_config import CreateConfig
from label_data import LabelData

sample_iter = LabelData(meta_dir).geo_sample_iter(geo)
# create bash script for downloading
CreateConfig(download_dir, geo).fetch_biosample(sample_iter)
# run the bash script showed here
# Outcome: nf download failed

Number of biosamples:  1090
bash /home/yuan/rawdata/SRR/GSE112274/fetch1.sh


In [32]:
# step 7: parse path of local fastq.gz with SRR in GEO data
# Note: run the process multiple times to make sure all SRRs are accessed
# print all samples if not any local *.fastq.gz is parsed
from parse_sra import ParseSra
from label_data import LabelData

data = LabelData(meta_dir).get_meta(geo)
data, unparsing = ParseSra.parse_local_fastq(data, download_dir)
if unparsing:
    print(geo, len(unparsing), unparsing[:6])
LabelData(meta_dir).save(data)

Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE112nnn/GSE112274/metadata.json
