In [None]:
# only biological reads without technical reads

In [1]:
import os
from parse_soft import ParseSoft
from search_soft import SearchSoft
from enrich_soft import EnrichSoft
from label_data import LabelData

%load_ext autoreload
%autoreload 2

In [91]:
# step 1: set defaults
# GEO series accession processed by this notebook.
geo = "GSE154826"
# The directories default to the original author's layout. Each one can be
# overridden with an environment variable so the notebook can run on another
# machine without editing the code.
# Root under which the GEO SOFT files are looked up.
soft_dir = os.environ.get('SCRNASEQ_SOFT_DIR', '/home/yuan/data')
# Directory handed to LabelData for reading/saving metadata.
meta_dir = os.environ.get('SCRNASEQ_META_DIR', '../data')
# Directory used for the download script and the local fastq lookup.
download_dir = os.environ.get('SCRNASEQ_DOWNLOAD_DIR', '/home/yuan/rawdata/SRR')

In [92]:
# step 2: parse the SOFT file and save the extracted metadata
# (enrichment happens in step 3)

# analyze soft file
# Resolve the local path of the SOFT file for this GEO series under soft_dir.
parser = ParseSoft(soft_dir)
soft_path = parser.soft_local_path(geo)
print('Retrieve meta data from soft file: ', soft_path)
softer = SearchSoft(soft_path)
# NOTE(review): `softer.parse_rows` is passed without call parentheses; it may
# be a bound method or a property - confirm in search_soft.
data = softer.filter_data(softer.parse_rows)

# save to json
# The "Save meta data into json" line in the output appears to come from save().
LabelData(meta_dir).save(data)

Retrieve meta data from soft file:  /home/yuan/data/ftp.ncbi.nlm.nih.gov/geo/series/GSE154nnn/GSE154826/soft/GSE154826_family.soft.gz
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE154nnn/GSE154826/metadata.json


In [93]:
# step 3: enrich data
# Reload the metadata for this series, pass it through EnrichSoft (the instance
# is called directly and its return value is the enriched result), then save.
# NOTE(review): the save path printed here is the same as in step 2, so this
# appears to overwrite the same metadata.json - confirm that is intended.
data = LabelData(meta_dir).get_meta(geo)
enriched = EnrichSoft(data)()
LabelData(meta_dir).save(enriched)

Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE154nnn/GSE154826/metadata.json


In [94]:
# step 3a: load the per-sample annotation table (its 'disease' and
# 'patient_ID' columns are used to label samples in step 3b)
import pandas as pd

# Open the workbook in a context manager so the underlying file handle is
# closed as soon as the sheet has been read, instead of staying open for the
# lifetime of the kernel.
with pd.ExcelFile('./constants/GSE154826.xlsx') as xl_file:
    df = xl_file.parse('sample_table')
# Index rows by library name; the 'old_lib_name' column itself is kept.
df.index = df['old_lib_name']
df.head()

Unnamed: 0_level_0,sample_ID,patient_ID,amp_batch_ID,old_lib_name,HTO,tissue,disease,Use.in.Clustering.Model.,library_chemistry,prime,vdj_kit,prep,metadata_indicator,biopsy_site
old_lib_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
558T,36,558,36,558T,,Tumor,LUAD,Yes,V2,3,,beads,,
564N,37,564,37,564N,,Normal,LUAD,Yes,V2,3,,beads,,
564T,38,564,38,564T,,Tumor,LUAD,Yes,V2,3,,beads,,
569N,39,569,39,569N,,Normal,LUAD,Yes,V2,3,,beads,,
569T,40,569,40,569T,,Tumor,LUAD,Yes,V2,3,,beads,,


In [95]:
# Map each patient ID (as a string) to its disease label (as a string).
# Rows are visited in table order, so if a patient appears on several rows
# the value from the last row is the one kept.
disease_info = {
    str(row['patient_ID']): str(row['disease'])
    for row in df.to_dict(orient='records')
}
disease_info

{'558': 'LUAD',
 '564': 'LUAD',
 '569': 'LUAD',
 '570': 'LUAD',
 '571': 'LUAD',
 '572': 'LUAD',
 '578': 'LUAD',
 '581': 'LUAD',
 '584': 'LUSC',
 '593': 'LUAD',
 '596': 'LUAD',
 '630': 'LUAD',
 '626': 'LUAD',
 '695': 'LUAD',
 '338': 'LUAD',
 '370': 'LUAD',
 '371': 'LUAD',
 '377': 'LUSC',
 '378': 'LUAD',
 '393': 'LUAD',
 '403': 'LUAD',
 '406': 'LUAD',
 '408': 'LUAD',
 '410': 'LUAD',
 '458': 'LUAD',
 '460': 'LUAD',
 '464': 'LUAD',
 '514': 'LUAD',
 '522': 'LUSC',
 '532': 'LUAD',
 '714': 'LUSC',
 '729': 'LUAD',
 '725': 'LUAD',
 '706': 'LUSC',
 '800': 'LUAD',
 'Lambrechts_2': 'LUSC',
 'Lambrechts_1': 'LUSC',
 'Lambrechts_3': 'LUAD',
 'Lambrechts_4': 'LUAD',
 'Lambrechts_5': 'Large cell',
 'Lambrechts_7': 'LUSC',
 'Lambrechts_8': 'Pleiomorphic',
 'Lambrechts_6': 'LUAD',
 'zilionis_1': 'LUSC',
 'zilionis_2': 'LUSC',
 'zilionis_3': 'LUAD',
 'zilionis_4': 'LUAD',
 'zilionis_5': 'LUAD',
 'zilionis_6': 'LUAD',
 'zilionis_7': 'LUAD'}

In [96]:
# step 3b: label samples manually
from label_sample import LabelSample

def collator(sample, disease_info):
    """Attach disease and protocol labels to one sample via LabelSample.

    Args:
        sample: sample record wrapped in ``LabelSample``. The wrapper's
            ``sample['characteristics']`` must provide ``'patient_id'`` and
            ``'10x_chromium_encapsulation_kit'``.
        disease_info: mapping of patient ID to disease code. A missing
            patient ID raises ``KeyError``.

    The codes ``LUAD`` and ``LUSC`` are expanded to their full names; any
    other value is passed to ``_disease`` unchanged. A kit that is not one of
    the three listed chemistries results in no ``_protocol`` call.
    """
    labeler = LabelSample(sample)

    # disease: expand the abbreviations used in the sample table
    code = disease_info[labeler.sample['characteristics']['patient_id']]
    for abbreviation, full_name in (
        ("LUAD", 'lung adenocarcinoma, non-small cell lung cancer'),
        ('LUSC', 'lung squamous cell carcinoma, non-small cell lung cancer'),
    ):
        if code == abbreviation:
            code = full_name
            break
    labeler._disease(code)

    # kit: first matching chemistry decides the protocol arguments
    chemistry = labeler.sample['characteristics']["10x_chromium_encapsulation_kit"]
    for kit_name, protocol_args in (
        ("3' v2", ('scrna-seq', '10x3v2')),
        ("5' v1", ('tcr-seq', '10x5v1')),
        ("3' v3", ('scrna-seq', '10x3v3')),
    ):
        if chemistry == kit_name:
            labeler._protocol(*protocol_args)
            break
    
# Reload the metadata, run collator over every sample record, then save.
# collator returns nothing, so the labels are expected to be written onto the
# sample records themselves before the save.
data = LabelData(meta_dir).get_meta(geo)
for sample in data['samples'].values():
    collator(sample, disease_info)
LabelData(meta_dir).save(data)

Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE154nnn/GSE154826/metadata.json


In [97]:
# Sanity check: list the labels stored on each sample.
for record in data['samples'].values():
    print(record['sample_id'], record['labels'])

GSM4680740 {'group': 'normal', 'tissue': 'lung', 'disease': 'lung adenocarcinoma, non-small cell lung cancer'}
GSM4680741 {'group': 'tumor', 'tissue': 'lung', 'disease': 'lung adenocarcinoma, non-small cell lung cancer'}
GSM4680742 {'group': 'normal', 'tissue': 'lung', 'disease': 'lung adenocarcinoma, non-small cell lung cancer'}
GSM4680743 {'group': 'tumor', 'tissue': 'lung', 'disease': 'lung adenocarcinoma, non-small cell lung cancer'}
GSM4680744 {'group': 'normal', 'tissue': 'lung', 'disease': 'lung squamous cell carcinoma, non-small cell lung cancer'}
GSM4680745 {'group': 'tumor', 'tissue': 'lung', 'disease': 'lung squamous cell carcinoma, non-small cell lung cancer'}
GSM4680746 {'group': 'normal', 'tissue': 'lung', 'disease': 'lung adenocarcinoma, non-small cell lung cancer'}
GSM4680747 {'group': 'tumor', 'tissue': 'lung', 'disease': 'lung adenocarcinoma, non-small cell lung cancer'}
GSM4680748 {'group': 'tumor', 'tissue': 'lung', 'disease': 'lung adenocarcinoma, non-small cell lu

In [98]:
# step 4a: load the precomputed samn_srr data (presumably BioSample -> SRR
# accessions, as consumed by step 4b - confirm)
from utils import Utils

# data are determined previously
# NOTE(review): nothing in this notebook writes ../results/samn_srr.json, so it
# must already exist; record which step or notebook produces it.
samn_srr = Utils.from_json('../results/samn_srr.json')
# Number of top-level entries loaded.
print(len(samn_srr))

47


In [99]:
# step 4b: Given BioSample, parse SRR accessions into samples
from parse_sra import ParseSra
from label_data import LabelData  # repeat of the import in the first cell

# Reload the metadata, let ParseSra.parse_srr combine it with samn_srr
# (loaded in step 4a), and save the returned result.
# The "biosamples = ..., SRR accessions = ..." output line presumably comes
# from parse_srr - confirm.
data = LabelData(meta_dir).get_meta(geo)
enriched = ParseSra.parse_srr(data, samn_srr)
LabelData(meta_dir).save(enriched)

biosamples = 166, SRR accessions = 250.
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE154nnn/GSE154826/metadata.json


In [100]:
# step 5: merge srr_urls into data

# NOTE(review): the next line looks like the key path inside the metadata
# where each URL is stored - confirm against ParseSra.parse_srr_urls.
# sample.<sample_id>.SRR.<SRR accession>."ftp.sra.ebi.ac.uk"
file_name = 'srr_fastq_urls_simple.json'
urls_json = os.path.join('../results', file_name)
# Utils is imported in the step 4a cell, which must have been run first.
urls = Utils.from_json(urls_json)

# parse urls
# Reload the metadata, merge the URLs into it, and save the returned result.
data = LabelData(meta_dir).get_meta(geo)
enriched = ParseSra.parse_srr_urls(data, urls)
LabelData(meta_dir).save(enriched)

{'srr': 166, 'available': 250, 'unknown': 0, 'updated': 250}
Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE154nnn/GSE154826/metadata.json


In [101]:
# step 6a: download raw data by BioSample accession using nextflow
from create_config import CreateConfig
from label_data import LabelData

# Iterator over the samples of this GEO series, consumed by fetch_biosample.
sample_iter = LabelData(meta_dir).geo_sample_iter(geo)
# create bash script for downloading
CreateConfig(download_dir, geo).fetch_biosample(sample_iter)
# Run the bash command shown in the output manually.
# Outcome: the nextflow download failed.

Number of biosamples:  166
Number of biosamples with SRR:  156
bash /home/yuan/rawdata/SRR/GSE154826/fetch1.sh


In [102]:
# step 7: match local fastq.gz paths under download_dir to the SRRs in the GEO data
# Note: run this cell multiple times to make sure all SRRs are covered
# If anything is returned in `unparsing`, print the GEO id, the number of
# unparsed entries, and the first six of them.
from parse_sra import ParseSra
from label_data import LabelData

data = LabelData(meta_dir).get_meta(geo)
# parse_local_fastq returns the updated metadata plus the entries it could
# not resolve.
data, unparsing = ParseSra.parse_local_fastq(data, download_dir)
if unparsing:
    print(geo, len(unparsing), unparsing[:6])
LabelData(meta_dir).save(data)

Save meta data into json:  /home/yuan/bio/scrnaseq_reference/data/labels/GSE154nnn/GSE154826/metadata.json


## nf-core/scrnaseq pipeline
Create samplesheet.csv and params.config.

In [118]:
# Build the nf-core/scrnaseq sample sheet for the samples that pass the
# `name` filter, then write the sheet and print the nextflow command.
# Relies on LabelSample and Utils imported in earlier cells.
name = 'scrnaseq_lung_tissue_patient'
output_dir = '/home/yuan/output'
labeler = LabelData(meta_dir, name, output_dir, True)

sample_sheet = {}
data = LabelData(meta_dir).get_meta(geo)
# The series id yielded by fastq_iter is bound to `sample_geo`, not `geo`, so
# the notebook-level `geo` set in step 1 is not overwritten by this loop and
# earlier cells stay safe to re-run.
for sample, sample_geo, run_acc, fastq_sample in labeler.fastq_iter(data):
    c = LabelSample(sample)
    # Evaluate the filter once so the printed flag and the branch always agree.
    selected = c.filter(name)
    print(selected, c.disease, c.tissue, sample['sample_id'])
    if selected:
        Utils.key_update(sample_sheet, [sample_geo,], fastq_sample['sample_sheet'])
# export
if sample_sheet:
    # Count the entries collected for the configured series; .get avoids a
    # KeyError if entries were filed under a different series id.
    print("Number of samples in samplesheet.csv: ", len(sample_sheet.get(geo, [])))
    labeler.to_sample_sheet(sample_sheet, 'paired')
    labeler.nf_cmd()
else:
    print(f'No samples meet creteria {name}')

False lung adenocarcinoma, non-small cell lung cancer lung GSM4680840
False lung adenocarcinoma, non-small cell lung cancer lung GSM4680842
True lung adenocarcinoma, non-small cell lung cancer lung GSM4680843
True lung adenocarcinoma, non-small cell lung cancer lung GSM4680844
False lung squamous cell carcinoma, non-small cell lung cancer lung GSM4680846
False lung squamous cell carcinoma, non-small cell lung cancer lung GSM4680848
False lung squamous cell carcinoma, non-small cell lung cancer lung GSM4680901
False lung squamous cell carcinoma, non-small cell lung cancer lung GSM4680904
Number of samples in samplesheet.csv:  2
cd /home/yuan/output/scrnaseq_lung_tissue_patient/paired_GSE154826 && nextflow run nf-core/scrnaseq -r 3.0.0 -c params.config
