In [None]:
# no paired file. only biological files
# Read1 contains the cell barcode and the Unique Molecular Identifier (UMI); read2 contains the biological reads.

In [1]:
import os
from parse_soft import ParseSoft
from search_soft import SearchSoft
from enrich_soft import EnrichSoft
from label_data import LabelData

%load_ext autoreload
%autoreload 2

In [2]:
# step 1: set defaults
# GEO series accession processed by this notebook.
geo = "GSE167977"
# Directory locations. The defaults are the original author's layout; each one
# can be overridden with an environment variable so the notebook runs on
# another machine without editing this cell.
# soft_dir: passed to ParseSoft to locate the SOFT file.
soft_dir = os.environ.get('SCRNASEQ_SOFT_DIR', '/home/yuan/data')
# meta_dir: passed to LabelData, which reads/saves metadata.json.
meta_dir = os.environ.get('SCRNASEQ_META_DIR', '../data')
# download_dir: passed to CreateConfig and ParseSra.parse_local_fastq.
download_dir = os.environ.get('SCRNASEQ_DOWNLOAD_DIR', '/home/yuan/rawdata/SRR')

In [3]:
# step 2: parse the SOFT file and save the extracted metadata
# (the enrichment itself is done in step 3a)

# Look up the local SOFT file path for `geo` under soft_dir.
parser = ParseSoft(soft_dir)
soft_path = parser.soft_local_path(geo)
print('Retrieve meta data from soft file: ', soft_path)
# NOTE(review): `softer.parse_rows` is passed without being called --
# presumably filter_data expects a callable (or parse_rows is a property); confirm.
softer = SearchSoft(soft_path)
data = softer.filter_data(softer.parse_rows)

# save to json
LabelData(meta_dir).save(data)

Retrieve meta data from soft file:  /home/yuan/data/ftp.ncbi.nlm.nih.gov/geo/series/GSE167nnn/GSE167977/soft/GSE167977_family.soft.gz
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE167nnn/GSE167977/metadata.json


In [4]:
# step 3a: enrich data
# Reload the metadata for `geo`, run the enrichment, and save the result.
data = LabelData(meta_dir).get_meta(geo)
enricher = EnrichSoft(data)
enriched = enricher()  # EnrichSoft instances are callable
LabelData(meta_dir).save(enriched)

Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE167nnn/GSE167977/metadata.json


In [8]:
# step 3b: label samples manually
from label_sample import LabelSample

def collator(sample):
    """Apply the manual labels shared by every sample of this series.

    Wraps ``sample`` in a ``LabelSample`` and sets the protocol
    ('scrna-seq', "10x3v2") and the patient disease ('breast cancer').

    Returns nothing. Presumably ``LabelSample`` updates ``sample`` in place
    (TODO confirm against label_sample).
    """
    labeler = LabelSample(sample)
    labeler._protocol('scrna-seq', "10x3v2")
    labeler._disease_patient('breast cancer')

# Reload the metadata, label every sample, then save.
data = LabelData(meta_dir).get_meta(geo)
# collator returns nothing; presumably it updates each sample dict in place,
# which is why `data` itself is saved afterwards (TODO confirm).
for sample_id, sample in data['samples'].items():
    collator(sample)
LabelData(meta_dir).save(data)

Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE167nnn/GSE167977/metadata.json


In [9]:
# Show each sample's title next to the labels it currently carries.
for sample in data['samples'].values():
    print(sample['sample_title'], sample['labels'])

MATADOR_RNAseq_X158 {'tissue': 'primairy tumor before treatment ffpe', 'treatment': '6 x ac dd', 'disease': 'breast cancer', 'group': 'patient'}
MATADOR_RNAseq_X184 {'tissue': 'primairy tumor before treatment ffpe', 'treatment': '6 x ac dd', 'disease': 'breast cancer', 'group': 'patient'}
MATADOR_RNAseq_X191 {'tissue': 'primairy tumor before treatment ffpe', 'treatment': '6 x ac dd', 'disease': 'breast cancer', 'group': 'patient'}
MATADOR_RNAseq_X203 {'tissue': 'primairy tumor before treatment ffpe', 'treatment': '6 x ac dd', 'disease': 'breast cancer', 'group': 'patient'}
MATADOR_RNAseq_X205 {'tissue': 'primairy tumor before treatment ffpe', 'treatment': '6 x ac dd', 'disease': 'breast cancer', 'group': 'patient'}
MATADOR_RNAseq_X229 {'tissue': 'primairy tumor before treatment ffpe', 'treatment': '6 x ac dd', 'disease': 'breast cancer', 'group': 'patient'}
MATADOR_RNAseq_X234 {'tissue': 'primairy tumor before treatment ffpe', 'treatment': '6 x ac dd', 'disease': 'breast cancer', 'grou

In [10]:
# step 4a: load some data
from utils import Utils

# Load the mapping that was generated in an earlier run and stored as JSON.
# NOTE(review): presumably keyed by BioSample (SAMN) accession with SRR
# accessions as values, judging by the name -- confirm.
samn_srr = Utils.from_json('../results/samn_srr.json')
# Number of top-level entries in the loaded JSON.
print(len(samn_srr))

47


In [11]:
# step 4b: given the BioSample of each sample, add its SRR accessions to the samples
from parse_sra import ParseSra
from label_data import LabelData

# Reload the metadata, attach SRR accessions using `samn_srr` (loaded in step 4a),
# and save the updated metadata.
data = LabelData(meta_dir).get_meta(geo)
enriched = ParseSra.parse_srr(data, samn_srr)
LabelData(meta_dir).save(enriched)

biosamples = 528, SRR accessions = 528.
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE167nnn/GSE167977/metadata.json


In [12]:
# step 5: merge srr_urls into data

# NOTE(review): presumably the location of the merged entries in the metadata:
# sample.<sample_id>.SRR.<SRR accession>."ftp.sra.ebi.ac.uk"
# `Utils` is imported in the step 4a cell.
file_name = 'srr_fastq_urls_simple.json'
urls_json = os.path.join('../results', file_name)
urls = Utils.from_json(urls_json)

# Merge the URLs into the metadata and save it.
data = LabelData(meta_dir).get_meta(geo)
enriched = ParseSra.parse_srr_urls(data, urls)
LabelData(meta_dir).save(enriched)

{'srr': 528, 'available': 0, 'unknown': 528, 'updated': 0}
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE167nnn/GSE167977/metadata.json


In [13]:
# step 6a: download raw data by BioSample accession using nextflow
from create_config import CreateConfig
from label_data import LabelData

sample_iter = LabelData(meta_dir).geo_sample_iter(geo)
# Create the bash script for downloading; its path is printed in the output.
CreateConfig(download_dir, geo).fetch_biosample(sample_iter)
# Run the bash script shown in the output.
# Outcome: nf download failed

Number of biosamples:  528
Number of biosamples with SRR:  528
bash /home/yuan/rawdata/SRR/GSE167977/fetch1.sh


In [14]:
# step 6b: collect all SRR without local_fastq, then run the generated bash script
# (printed in the output as fetch2.sh)
from create_config import CreateConfig
from label_data import LabelData

sample_iter = LabelData(meta_dir).geo_sample_iter(geo)
# Create the bash script for downloading; its path is printed in the output.
CreateConfig(download_dir, geo).fetch_ebi_srr(sample_iter)
# Run the bash script shown in the output.

Number of SRR:  528
bash /home/yuan/rawdata/SRR/GSE167977/fetch2.sh


In [15]:
# step 6c: collect all SRR without local_fastq, then run the generated bash script
# (printed in the output as fetch3.sh)
from create_config import CreateConfig
from label_data import LabelData

sample_iter = LabelData(meta_dir).geo_sample_iter(geo)
# Create the bash script for downloading; its path is printed in the output.
CreateConfig(download_dir, geo).fetch_srr(sample_iter)
# Run the bash script shown in the output.

Number of SRR:  528
bash /home/yuan/rawdata/SRR/GSE167977/fetch3.sh


In [16]:
# step 7: record the paths of local fastq.gz files for each SRR in the GEO metadata
# Note: run this step multiple times to make sure all SRRs are accessed
# If `unparsing` is non-empty, print its size and its first six entries
# (presumably the SRR accessions for which no local *.fastq.gz was found -- confirm).
from parse_sra import ParseSra
from label_data import LabelData

data = LabelData(meta_dir).get_meta(geo)
data, unparsing = ParseSra.parse_local_fastq(data, download_dir)
if unparsing:
    print(geo, len(unparsing), unparsing[:6])
# The metadata is saved whether or not anything was left unparsed.
LabelData(meta_dir).save(data)

GSE167977 528 ['SRR13816316', 'SRR13816317', 'SRR13816318', 'SRR13816319', 'SRR13816320', 'SRR13816321']
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE167nnn/GSE167977/metadata.json
