In [16]:
import os
import numpy as np
import pandas as pd
import Bio
import json

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Download SOFT data
- Download the SOFT data and the read-counts data.
- Use the Python scripts.
- Download GEO using nextflow (recommended).

In [20]:
# Settings shared by the SOFT download cells.
geo_local_dir = "/home/yuan/data/"  # local prefix directory handed to wget via -P
url = "ftp.ncbi.nlm.nih.gov/geo/series"  # host + base path; the cells prepend "ftp://"
acc_no = 287800  # exclusive upper bound: the cells iterate range(1, acc_no)

In [21]:
# Build ../results/soft_urls.txt (the list consumed by nextflow): one FTP
# "soft" directory URL per line for every accession number in range(1, acc_no).
n = 0
with open('../results/soft_urls.txt', 'w') as url_file:
    for n, number in enumerate(range(1, acc_no), start=1):
        digits = str(number)
        # Bucket directory: the accession digits minus the last three, then
        # 'nnn' (numbers with at most three digits fall into "GSEnnn").
        prefix = digits[:-3] if len(digits) > 3 else ''
        bucket = "GSE" + prefix + 'nnn'
        series = "GSE" + digits
        url_file.write(f"ftp://{url}/{bucket}/{series}/soft" + "\n")
print(f"Total {n} soft are downloaded")

Total 287799 soft are downloaded


In [26]:
# download through Python if that *.gz doesn't exist
#
# For every accession number in range(1, acc_no):
#   - skip it when its local soft directory already contains a *.soft.gz
#     (the file name is recorded in download_soft_skip.log);
#   - otherwise mirror the remote soft directory with wget. Successful
#     commands are recorded in download_soft.log, failures (with the actual
#     exception message) in download_soft.err.
import subprocess

n = m = 0  # n: accessions examined; m: accessions fetched by wget in this run
with open('../results/download_soft.log', 'w') as log, \
    open('../results/download_soft.err', 'w') as err, \
    open('../results/download_soft_skip.log', 'w') as skip:
    for i in range(1, acc_no):
        n += 1
        i = str(i)
        k1 = "GSE" + i
        # Bucket directory: accession digits minus the last three, then 'nnn'.
        k2 = i[:-3] if len(i) > 3 else ''
        k2 = "GSE" + k2 + 'nnn'
        # check if gz exists
        local_path = os.path.join(geo_local_dir, f"{url}/{k2}/{k1}/soft")
        existing = None
        if os.path.isdir(local_path):
            # Only the first matching archive is needed to decide to skip.
            existing = next(
                (name for name in os.listdir(local_path) if name.endswith('.soft.gz')),
                None,
            )
        if existing is not None:
            skip.write(existing + '\n')
            continue
        # download
        # Build the command outside the try so the error handler can always
        # reference it.
        cmd = ['wget', '-c', '-r', '-q',  f"ftp://{url}/{k2}/{k1}/soft" , '-P',  geo_local_dir]
        try:
            subprocess.run(cmd, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # CalledProcessError: wget exited non-zero; OSError: wget could not
            # be started. Keep going with the next accession (best effort).
            line = f"record={i}, command=" + ' '.join(cmd) + f", error={e}\n"
            err.write(line)
        else:
            log.write(' '.join(cmd) + '\n')
            m += 1
            if m % 100 == 0:
                print(f"{n}-{m}", end=', ')
print(f"Among{n}, the number of {m} soft are downloaded")

223645-100, 266166-200, 282650-300, Among287799, the number of 366 soft are downloaded


In [None]:
# validate soft download
from parse_soft import ParseSoft

local_dir = '/home/yuan/data/'
pcl = ParseSoft(local_dir)
pcl.validate_soft('../data/validate_soft.bash')

## parse soft data

In [2]:
from parse_soft import ParseSoft

# directory that stores the *.soft.gz files
local_dir = '/home/yuan/data/'
# directory that receives the exported JSON files pairing accessions
outdir = '../results'

In [2]:
# GEO~pubmed_id into geo_pmid.json
# The recorded run returned two paths: geo_pmid.json and pmid_geo.json
# (presumably the forward and reverse mappings — confirm in parse_soft).
pcl = ParseSoft(local_dir, outdir)
pcl.geo_pmid()

5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000, 90000, 95000, 100000, 105000, 110000, 115000, 120000, 125000, 130000, 135000, 140000, 145000, 150000, 155000, 160000, 165000, 170000, 175000, 180000, 185000, 190000, 195000, 200000, 205000, 210000, 215000, 220000, 225000, 230000, 235000, 240000, 245000, 

('../results/geo_pmid.json', '../results/pmid_geo.json')

In [3]:
# PMID~sample_id into pmid_samples.json
# The recorded run returned '../results/pmid_samples.json'.
pcl = ParseSoft(local_dir, outdir)
pcl.pmid_samples()

5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000, 90000, 95000, 100000, 105000, 110000, 115000, 120000, 125000, 130000, 135000, 140000, 145000, 150000, 155000, 160000, 165000, 170000, 175000, 180000, 185000, 190000, 195000, 200000, 205000, 210000, 215000, 220000, 225000, 230000, 235000, 240000, 245000, 

'../results/pmid_samples.json'

In [4]:
# GEO~sample_id into geo_sampleid.json
# (file name as returned by the recorded run: '../results/geo_sampleid.json')
pcl = ParseSoft(local_dir, outdir)
pcl.geo_sample_id()

5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000, 90000, 95000, 100000, 105000, 110000, 115000, 120000, 125000, 130000, 135000, 140000, 145000, 150000, 155000, 160000, 165000, 170000, 175000, 180000, 185000, 190000, 195000, 200000, 205000, 210000, 215000, 220000, 225000, 230000, 235000, 240000, 245000, 

'../results/geo_sampleid.json'

In [5]:
# GEO~sample_id into geo_samples.json
# NOTE(review): how this output differs from the one produced by
# geo_sample_id() is not visible in this notebook — confirm in parse_soft.
pcl = ParseSoft(local_dir, outdir)
pcl.geo_samples()

5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000, 90000, 95000, 100000, 105000, 110000, 115000, 120000, 125000, 130000, 135000, 140000, 145000, 150000, 155000, 160000, 165000, 170000, 175000, 180000, 185000, 190000, 195000, 200000, 205000, 210000, 215000, 220000, 225000, 230000, 235000, 240000, 245000, 

'../results/geo_samples.json'

## parse SRA data

In [3]:
from parse_sra import ParseSra
from slicer import Slicer

# Relies on local_dir and outdir being set by the parse-soft configuration cell.
ps = ParseSra(local_dir, outdir)
# Pair SRX accessions with SAMN accessions; the recorded run returned
# srx_samn.json and samn_srx.json.
ps.acc_samn('SRX', Slicer.SRX)

('../results/srx_samn.json', '../results/samn_srx.json')

In [4]:
ps = ParseSra(local_dir, outdir)
# Pair SRS accessions with SAMN accessions; the recorded run returned
# srs_samn.json and samn_srs.json.
# NOTE(review): the 'SRS' label is combined with Slicer.SRX, while the other
# cells use the matching slicer (SRX/SRX, SRR/SRR) — confirm this is intended.
ps.acc_samn('SRS', Slicer.SRX)

('../results/srs_samn.json', '../results/samn_srs.json')

In [5]:
ps = ParseSra(local_dir, outdir)
# Pair SRR accessions with SAMN accessions; the recorded run returned
# srr_samn.json and samn_srr.json.
ps.acc_samn('SRR', Slicer.SRR)

('../results/srr_samn.json', '../results/samn_srr.json')

In [10]:
# Spot check using the ParseSra instance left by the previous cell: the recorded
# output lists records whose 'BioSample' value is SAMN14266271 (a RUN and a SAMPLE record).
ps.search('BioSample', 'SAMN14266271')

{'Accession': 'SRR11218342', 'Submission': 'SRA1050378', 'Status': 'live', 'Updated': '2023-06-29T13:40:50Z', 'Published': '2020-04-13T21:16:19Z', 'Received': '2020-03-03T15:44:40Z', 'Type': 'RUN', 'Center': 'SUB7078846', 'Visibility': 'public', 'Alias': 'HYJJ7BGXY_370_normal_lungcancer_1-24-17_lib82_S1_L001_I1_001.fastq.gz', 'Experiment': 'SRX7830690', 'Sample': 'SRS6241293', 'Study': 'SRP251372', 'Loaded': '1', 'Spots': '108115853', 'Bases': '9622310917', 'Md5sum': 'f4e43adff9e8a24ddc4d35e0d416c1af', 'BioSample': 'SAMN14266271', 'BioProject': 'PRJNA609924', 'ReplacedBy': '-'}
{'Accession': 'SRS6241293', 'Submission': 'SRA1050378', 'Status': 'live', 'Updated': '2020-04-13T21:21:18Z', 'Published': '2020-04-13T21:16:27Z', 'Received': '2020-03-03T15:46:05Z', 'Type': 'SAMPLE', 'Center': 'pda|leaderam', 'Visibility': 'public', 'Alias': 'normal_370', 'Experiment': '-', 'Sample': '-', 'Study': '-', 'Loaded': '-', 'Spots': '-', 'Bases': '-', 'Md5sum': 'ed0a84569e43d05ade82c73cc6ca0ca7', 'BioS

## parse URLs of fastq.gz given SRR accessions

In [None]:
# TODO: failed many times due to timeout
from retrieve_url import RetrieveUrl
# data starts empty and is what gets saved below, so scan_sra_ebi presumably
# fills it in place — TODO confirm in retrieve_url.
data = {}
# res is not used again in this cell.
res = RetrieveUrl.scan_sra_ebi(data)
#save
# NOTE(review): Utils is not imported in any cell visible in this notebook;
# on a fresh kernel this line raises NameError unless Utils is imported first.
outfile = Utils.to_json(data, '../results', 'srr_fastq_urls.json')

# alternative:
```
python src/run_srr_fastq.py
```

## process fastq

fastq files downloaded from SRA are stored in ~/data/SRA

In [87]:
# collect fastq into another directory for organization
# Note: this rebinds local_dir and outdir, which earlier cells set to the
# SOFT data directory and '../results'; re-running those cells after this one
# would pick up the values below.
local_dir = '/home/yuan/data/fastq'
outdir = '/home/yuan/data/SRA/'
ps = ParseSra(local_dir, outdir)
# The recorded run reported that 0 files were moved.
ps.move_srr_fastq()

0 files are moved to /home/yuan/data/SRA/.
