In [125]:
import json
import os
import numpy as np
import pandas as pd

from utils import Utils
from label_data import LabelData
from parse_soft import ParseSoft
from search_soft import SearchSoft
from enrich_soft import EnrichSoft
from parse_sra import ParseSra

%reload_ext autoreload
%autoreload 2


In [143]:
# Load the pre-computed pairing tables that were saved as JSON.
# NOTE(review): judging by the file names these pair PMID<->samples,
# PMID<->GEO, GEO<->sample id and BioSample (SAMN)<->SRR -- confirm the
# exact schema against the code that wrote them.
pmid_samples, pmid_geo, geo_sampleid, samn_srr = (
    Utils.from_json(json_path)
    for json_path in (
        '../results/pmid_samples.json',
        '../results/pmid_geo.json',
        '../results/geo_sampleid.json',
        '../results/samn_srr.json',
    )
)

In [148]:
# set directory

# Root data directory; passed to ParseSoft when locating local SOFT files.
data_dir = '/home/yuan/data'

# Directory handed to LabelData for the label records.
# makedirs(..., exist_ok=True) also creates a missing parent ('../data') and
# avoids the check-then-create race of os.path.isdir() followed by os.mkdir().
label_dir = '../data/labels'
os.makedirs(label_dir, exist_ok=True)

# Directory passed to ParseSra.parse_srr when looking for local fastq files.
fastq_dir = '/home/yuan/data/SRA'

In [None]:
# Attribute names of interest.
# NOTE(review): presumably the sample attributes to be labelled; no cell
# visible here reads `attr` -- confirm where it is used.
attr = [
    'disease',
    'tissue',
    'cell_type',
    'cell_line',
    'treated',
    'cultivation',
]

In [100]:
# Smoke test of the parse -> filter -> enrich chain on a single GEO series.
parser = ParseSoft(data_dir)
geo = 'GSE286399'

# Locate the local SOFT file of the series and open it for searching.
soft_path = parser.soft_local_path(geo)
softer = SearchSoft(soft_path)

# Filter the parsed rows, then enrich the result (EnrichSoft instances are
# called directly to produce the enriched record).
data = softer.filter_data(softer.parse_rows)
enriched = EnrichSoft(data)()

# Last expression of the cell: display the enriched record.
enriched

{'GEO': 'GSE286399',
 'geo_http': 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE286399',
 'PMID': None,
 'taxid': '9606',
 'platform': [{'platform_id': 'GPL24676',
   'platform_title': 'Illumina NovaSeq 6000 (Homo sapiens)',
   'platform_technology': 'high-throughput sequencing'}],
 'samples': {'GSM8726430': {'sample_id': 'GSM8726430',
   'tissue': 'Lung cancer cell line',
   'cell line': 'NCI-H1299',
   'cell type': 'Lung cancer cell line',
   'treatment': 'DMSO',
   'Sample_description': ['Library name: sample1'],
   'BioSample': 'https://www.ncbi.nlm.nih.gov/biosample/SAMN46202173',
   'SRA': 'SRX27313887',
   'SRA_url': 'https://www.ncbi.nlm.nih.gov/sra?term=SRX27313887'},
  'GSM8726431': {'sample_id': 'GSM8726431',
   'tissue': 'Lung cancer cell line',
   'cell line': 'NCI-H1299',
   'cell type': 'Lung cancer cell line',
   'treatment': 'JTE-607',
   'Sample_description': ['Library name: sample2'],
   'BioSample': 'https://www.ncbi.nlm.nih.gov/biosample/SAMN46202172',
   

## H1299

In [167]:
# GEO series that were selected for the H1299 cell line.
geo_h1299 = [
    'GSE286399',
    'GSE280041',
    'GSE144357',
    'GSE121309',
    'GSE183590',
    'GSE148729',
]

parser = ParseSoft(data_dir)
cl = LabelData(label_dir)

# For each series: locate its local SOFT file, filter the parsed rows,
# enrich the result and hand it to LabelData.save().
for geo in geo_h1299:
    print(geo, end=', ')  # one-line progress trace
    softer = SearchSoft(parser.soft_local_path(geo))
    enriched = EnrichSoft(softer.filter_data(softer.parse_rows))()
    cl.save(enriched)

GSE286399, GSE280041, GSE144357, GSE121309, GSE183590, GSE148729, 

## cell marker

In [95]:
# Read the 'seq' sheet of the cell-marker workbook into the DataFrame `cm`.
# The context manager closes the workbook handle as soon as the sheet has
# been parsed; a bare pd.ExcelFile(...) would keep the file open.
with pd.ExcelFile('../results/Cell_marker_Seq.xlsx') as cellmarker:
    cm = cellmarker.parse('seq')

In [96]:
# get PMID
# Keep the rows whose species is 'Human' and whose tissue type mentions
# 'Lung', then collect their distinct PMIDs (np.unique returns them sorted).
is_human = cm['species'] == 'Human'
is_lung = cm['tissue_type'].str.contains('Lung')
lung_cm = cm[is_human & is_lung]
pmid_list = np.unique(lung_cm['PMID'])
print(pmid_list)

[30259978 30523199 30554520 30650190 30784054 31221805 31233341 31289132
 31299246 31333652 31405848 31834999 31840053 31892341 31996486 32004478
 32072637 32109386 32112047 32122885 32203281 32246845 32317009 32317643
 32373206 32398875 32405060 32497778 32580738 32603599 32810439 32822576
 32832598 32832599 32849643 32882007 32968798 32973742 33057196 33083004
 33123174 33144684 33178221 33377642 33382972 33500718 33514641 33571124
 33598101 33657410 33705361 33717172 33822772 33879239 33953163 33972311
 34017124 34030460 34049947 34247147 34313733 34330889 34475869 34504485
 34603282 34624218 34663877 34715018 34764257 34780851 34804043 34876692
 34914922 34916290 35078977 35108060 35126365 35184398 35213222 35216676
 35354645 35430336]


In [168]:
# retrieve soft data
# For every PMID of the lung cell-marker subset, look up its GEO series in
# `pmid_geo`, then parse, enrich and save each series.
parser = ParseSoft(data_dir)
cl = LabelData(label_dir)

for pmid in pmid_list:
    geo_pool = Utils.pmid_get(pmid_geo, pmid)
    for geo in geo_pool:
        print(geo, end=', ')  # one-line progress trace
        softer = SearchSoft(parser.soft_local_path(geo))
        enriched = EnrichSoft(softer.filter_data(softer.parse_rows))()
        cl.save(enriched)

GSE112274, GSE121611, GSE122960, GSE128033, GSE128169, GSE133747, GSE137799, GSE137805, GSE137811, GSE124885, GSE132771, GSE147066, GSE145926, GSE140819, GSE135851, GSE135893, GSE136831, GSE217722, GSE162936, GSE162498, GSE162499, GSE162500, GSE158055, GSE166033, GSE166034, GSE166035, GSE166036, GSE166037, GSE166059, GSE168710, GSE148071, GSE164829, GSE156311, GSE168299, 

In [169]:
# For every stored label record, let ParseSra.parse_srr resolve the SRR
# accessions of its BioSamples (and the local fastq.gz when one exists),
# then save the returned record.
cl = LabelData(label_dir)
for record in cl.json_iter():
    data = ParseSra.parse_srr(record, samn_srr, fastq_dir)
    cl.save(data)

In [177]:
# prepare SRR id list for download
#
# Walks every stored label record and prints one `fastq-dump` command for each
# SRR accession of a human scRNA-seq sample that has no local path recorded.
# Finally prints the number of human series and of qualifying samples.

# NCBI taxonomy id that the records use for human.
HUMAN_TAXID = '9606'
# Target directory written into the generated commands.
# NOTE(review): this differs from `fastq_dir` ('/home/yuan/data/SRA'), the
# directory given to ParseSra.parse_srr -- confirm that this is intentional.
DOWNLOAD_DIR = '/home/yuan/results/fastq'

num_geo = num_sample = 0
cl = LabelData(label_dir)
for data in cl.json_iter():
    # .get(): a record without 'taxid' is skipped instead of raising KeyError.
    if data.get('taxid') != HUMAN_TAXID:
        continue
    num_geo += 1
    for sample_id, sample in (data.get('samples') or {}).items():
        # 'scrnaseq' and 'SRR' are only present after the enrichment and
        # SRR-parsing cells have been run on the record; treat a missing key
        # like an empty value rather than aborting the whole listing.
        srr_files = sample.get('SRR')
        if not (sample.get('scrnaseq') and srr_files):
            continue
        num_sample += 1
        for srr_acc, local_path in srr_files.items():
            # An empty local path presumably means the run is not downloaded yet.
            if not local_path:
                print(f"fastq-dump {srr_acc} -O {DOWNLOAD_DIR} --gzip")
print(num_geo, num_sample)

fastq-dump SRR9970044 -O /home/yuan/results/fastq --gzip
fastq-dump SRR9970045 -O /home/yuan/results/fastq --gzip
fastq-dump SRR9970046 -O /home/yuan/results/fastq --gzip
fastq-dump SRR9970047 -O /home/yuan/results/fastq --gzip
fastq-dump SRR9970048 -O /home/yuan/results/fastq --gzip
fastq-dump SRR9970049 -O /home/yuan/results/fastq --gzip
fastq-dump SRR9970050 -O /home/yuan/results/fastq --gzip
fastq-dump SRR10974768 -O /home/yuan/results/fastq --gzip
fastq-dump SRR10975280 -O /home/yuan/results/fastq --gzip
fastq-dump SRR10975281 -O /home/yuan/results/fastq --gzip
fastq-dump SRR10975282 -O /home/yuan/results/fastq --gzip
fastq-dump SRR10975283 -O /home/yuan/results/fastq --gzip
fastq-dump SRR10975284 -O /home/yuan/results/fastq --gzip
fastq-dump SRR10975286 -O /home/yuan/results/fastq --gzip
fastq-dump SRR10975290 -O /home/yuan/results/fastq --gzip
fastq-dump SRR11549937 -O /home/yuan/results/fastq --gzip
fastq-dump SRR11549945 -O /home/yuan/results/fastq --gzip
fastq-dump SRR1154994