In [6]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import Bio
import json

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## download data:
- download soft data, read counts data
- use python scripts
- download GEO using nextflow (recommended)

In [20]:
# Exclusive upper bound of the GSE accession numbers enumerated by range(1, acc_no).
acc_no = 287_800
# Host and path of the GEO series tree; the "ftp://" scheme is prepended where URLs are built.
url = "ftp.ncbi.nlm.nih.gov/geo/series"
# Local directory passed to wget as its download prefix (-P).
geo_local_dir = "/home/yuan/data/"

In [21]:
# Prepare soft_urls.txt consumed by nextflow: one "soft" directory URL per GSE
# accession in range(1, acc_no). Nothing is downloaded by this cell.
n = 0  # number of URLs written
with open('../results/soft_urls.txt', 'w') as f:
    for gse_num in range(1, acc_no):
        gse_id = str(gse_num)
        k1 = "GSE" + gse_id
        # Range folder: the accession with its last three digits replaced by
        # 'nnn'. Slicing gives '' for numbers of up to three digits, so those
        # map to "GSEnnn" without a separate length check.
        k2 = "GSE" + gse_id[:-3] + 'nnn'
        f.write(f"ftp://{url}/{k2}/{k1}/soft" + "\n")
        n += 1
# Report what actually happened: URLs were written, not downloaded.
print(f"Total {n} soft URLs are written to ../results/soft_urls.txt")

Total 287799 soft are downloaded


In [26]:
# Download each series' soft directory with wget, skipping any series whose
# expected local directory already contains a *.soft.gz file.
# Three files are written under ../results:
#   download_soft.log       - wget commands that completed successfully
#   download_soft.err       - failed commands together with the error text
#   download_soft_skip.log  - name of the existing *.soft.gz for skipped series
import subprocess

n = m = 0  # n: series examined; m: series downloaded in this run
with open('../results/download_soft.log', 'w') as log, \
    open('../results/download_soft.err', 'w') as err, \
    open('../results/download_soft_skip.log', 'w') as skip:
    for gse_num in range(1, acc_no):
        n += 1
        gse_id = str(gse_num)
        k1 = "GSE" + gse_id
        # Range folder: the accession with its last three digits replaced by
        # 'nnn' (slicing gives '' for numbers of up to three digits).
        k2 = "GSE" + gse_id[:-3] + 'nnn'
        remote = f"ftp://{url}/{k2}/{k1}/soft"
        # Where this cell expects wget (-r with prefix geo_local_dir) to have
        # placed the directory on an earlier run.
        local_path = os.path.join(geo_local_dir, f"{url}/{k2}/{k1}/soft")

        # Skip when a soft archive already exists; record the first one found.
        existing = None
        if os.path.isdir(local_path):
            existing = next(
                (name for name in os.listdir(local_path) if name.endswith('.soft.gz')),
                None,
            )
        if existing is not None:
            skip.write(existing + '\n')
            continue

        cmd = ['wget', '-c', '-r', '-q', remote, '-P', geo_local_dir]
        try:
            # check=True raises when wget exits with a non-zero status.
            subprocess.run(cmd, check=True)
        except Exception as e:
            # Best effort: record the failure (including the actual error text)
            # and carry on with the next series.
            err.write(f"record={gse_id}, command={' '.join(cmd)}, error={e}\n")
        else:
            log.write(' '.join(cmd) + '\n')
            m += 1
            # Light progress indicator: "<examined>-<downloaded>" every 100 downloads.
            if m % 100 == 0:
                print(f"{n}-{m}", end=', ')
print(f"Among {n}, the number of {m} soft are downloaded")

223645-100, 266166-200, 282650-300, Among287799, the number of 366 soft are downloaded


In [None]:
# validate soft download
from parse_soft import ParseSoft

local_dir = '/home/yuan/data/'
pcl = ParseSoft(local_dir)
pcl.validate_soft('../data/validate_soft.bash')

## parse data based on soft

In [40]:
# Directories handed to the ParseSoft / ParseSra constructors in the cells below.
local_dir = "/home/yuan/data/"  # same path as the wget download prefix geo_local_dir
outdir = "../results"

In [None]:
# ParseSoft is a project-local helper; local_dir and outdir come from the
# configuration cell above.
from parse_soft import ParseSoft
# GEO~pubmed_id into geo_pmid.json
# (presumably written under outdir — TODO confirm in parse_soft)
pcl = ParseSoft(local_dir, outdir)
pcl.geo_pmid()

In [None]:
# PMID~sample_id into pmid_samples.json
# Requires ParseSoft, local_dir and outdir from earlier cells.
# NOTE(review): whether this step needs geo_pmid.json from the previous cell
# is not visible here — confirm in parse_soft.
pcl = ParseSoft(local_dir, outdir)
pcl.pmid_samples()

In [None]:
# GEO~sample_id into geo_sample_id.json
# Requires ParseSoft, local_dir and outdir from earlier cells.
pcl = ParseSoft(local_dir, outdir)
pcl.geo_sample_id()

In [34]:
# GEO~sample_id into geo_samples.json
# Requires ParseSoft, local_dir and outdir from earlier cells.
# NOTE(review): the recorded output of this cell is '../data/geo_samples.json',
# which is not under outdir ('../results') — confirm the intended location.
pcl = ParseSoft(local_dir, outdir)
pcl.geo_samples()

5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000, 90000, 95000, 100000, 105000, 110000, 115000, 120000, 125000, 130000, 135000, 140000, 145000, 150000, 155000, 160000, 165000, 170000, 175000, 180000, 185000, 190000, 195000, 200000, 205000, 210000, 215000, 220000, 225000, 230000, 235000, 240000, 245000, 

'../data/geo_samples.json'

## parse data based on SRA data

In [62]:
# ParseSra is a project-local helper (parse_sra module); local_dir and outdir
# come from the configuration cell above.
from parse_sra import ParseSra

ps = ParseSra(local_dir, outdir)
# The recorded output shows a pair of paths being returned:
# srx_samn.json and samn_srx.json under outdir.
ps.srx_samn()

('../results/srx_samn.json', '../results/samn_srx.json')

In [63]:
# Requires ParseSra, local_dir and outdir from earlier cells.
ps = ParseSra(local_dir, outdir)
# The recorded output shows a pair of paths being returned:
# srr_samn.json and samn_srr.json under outdir.
ps.srr_samn()

('../results/srr_samn.json', '../results/samn_srr.json')

In [57]:
# Ad-hoc lookup of sample GSM4088789 through the ParseSra instance `ps`.
# NOTE(review): the meaning of the first argument (17) is not visible here — check
# ParseSra.search. This cell's execution count (57) is lower than that of the
# cells creating `ps` (62, 63), so it was last run out of order; it produced no
# recorded output.
ps.search(17, 'GSM4088789')

## process fastq

FASTQ files downloaded from SRA are stored in `~/data/SRA`.

In [75]:
# collect fastq into another directory for organization
# NOTE(review): this rebinds the notebook-wide local_dir / outdir to different
# paths than the parsing cells above use; re-running those cells after this one
# would pick up these values instead.
local_dir = '/home/yuan/data/scRNAseq'
outdir = '/home/yuan/data/SRA/'
# Requires ParseSra imported by an earlier cell.
ps = ParseSra(local_dir, outdir)
# The recorded output reports how many files were moved to outdir (0 on the saved run).
ps.move_srr_fastq()

0 files are moved to /home/yuan/data/SRA/.
