# label data
- determine biosamples related to lung and lung cancer
- prepare ids.csv for nextflow pipeline downloading FASTQ

In [None]:
import json
import os
import numpy as np
import pandas as pd

from utils import Utils
from label_data import LabelData
from parse_soft import ParseSoft
from search_soft import SearchSoft
from enrich_soft import EnrichSoft
from parse_sra import ParseSra
from slicer import Slicer

%reload_ext autoreload
%autoreload 2


## prepare

In [11]:
# load the pairing table; later cells look up GEO accessions in it by PMID-derived keys
pmid_geo = Utils.from_json('../results/pmid_geo.json')


In [3]:
# directory that stores the SOFT files
data_dir = '/home/yuan/data'

# directory the labeling data is exported to
label_dir = '../data/labels'
# makedirs also creates a missing parent ('../data') and, with exist_ok=True,
# is a no-op when the directory is already there (no check-then-create race)
os.makedirs(label_dir, exist_ok=True)

# NOTE(review): not referenced by the cells visible here; presumably the FASTQ store — confirm
fastq_dir = '/home/yuan/rawdata/SRX'

In [4]:
# smoke test on one GEO series: locate its SOFT file, parse/filter it, then enrich the result
geo ='GSE286399'
parser = ParseSoft(data_dir)
# local path of the SOFT file for this accession
soft_path = parser.soft_local_path(geo)
softer = SearchSoft(soft_path)
# parse_rows is passed uncalled; filter_data decides how to use it
data = softer.filter_data(softer.parse_rows)
# EnrichSoft instances are callable; the call returns the enriched record
enriched = EnrichSoft(data)()
# last expression: display the enriched record
enriched

{'GEO': 'GSE286399',
 'geo_http': 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE286399',
 'title': 'single cell RNA-seq analysis of lung cancer cell lines treated with JTE-607',
 'PMID': None,
 'taxid': '9606',
 'platform': [{'platform_id': 'GPL24676',
   'platform_title': 'Illumina NovaSeq 6000 (Homo sapiens)',
   'platform_technology': 'high-throughput sequencing'}],
 'samples': {'GSM8726430': {'sample_id': 'GSM8726430',
   'characteristics': {'tissue': 'Lung cancer cell line',
    'cell_line': 'NCI-H1299',
    'cell_type': 'Lung cancer cell line',
    'treatment': 'DMSO'},
   'scrnaseq': False,
   'Sample_description': ['Library name: sample1'],
   'BioSample': 'SAMN46202173',
   'SRA': 'SRX27313887',
   'SRA_url': 'https://www.ncbi.nlm.nih.gov/sra?term=SRX27313887',
   'BioSample_url': 'https://www.ncbi.nlm.nih.gov/biosample/SAMN46202173'},
  'GSM8726431': {'sample_id': 'GSM8726431',
   'characteristics': {'tissue': 'Lung cancer cell line',
    'cell_line': 'NCI-H1299',
  

## select GEO

### H1299

In [14]:
# GEO series selected by hand for the H1299 cell line
geo_h1299 = [
    'GSE286399', 'GSE280041', 'GSE144357',
    'GSE121309', 'GSE183590', 'GSE148729',
]

parser = ParseSoft(data_dir)
cl = LabelData(label_dir)

# n stays 0 when the list is empty; otherwise enumerate leaves it at the number processed
n = 0
for n, geo in enumerate(geo_h1299, start=1):
    print(geo, end=', ')
    # locate the SOFT file, parse/filter it, enrich it, then persist the labeled record
    softer = SearchSoft(parser.soft_local_path(geo))
    enriched = EnrichSoft(softer.filter_data(softer.parse_rows))()
    cl.save(enriched)
print(f'\n{n} GEO data are created.')

GSE286399, GSE280041, GSE144357, GSE121309, GSE183590, GSE148729, 
6 GEO data are created.


### cell marker

In [6]:
# Read the 'seq' sheet of the cell marker workbook.
# The context manager closes the workbook's file handle as soon as the sheet is parsed.
with pd.ExcelFile('../results/Cell_marker_Seq.xlsx') as cellmarker:
    cm = cellmarker.parse('seq')

In [15]:
# PMIDs of human entries whose tissue_type contains 'Lung' (case-sensitive match)
# na=False: rows with a missing tissue_type count as "no match", so the mask is strictly boolean
lung_cm = cm[(cm['species'] == 'Human') & cm['tissue_type'].str.contains('Lung', na=False)]
# np.unique returns the distinct PMIDs in sorted order
pmid_list = np.unique(lung_cm['PMID'])
print(pmid_list)

[30259978 30523199 30554520 30650190 30784054 31221805 31233341 31289132
 31299246 31333652 31405848 31834999 31840053 31892341 31996486 32004478
 32072637 32109386 32112047 32122885 32203281 32246845 32317009 32317643
 32373206 32398875 32405060 32497778 32580738 32603599 32810439 32822576
 32832598 32832599 32849643 32882007 32968798 32973742 33057196 33083004
 33123174 33144684 33178221 33377642 33382972 33500718 33514641 33571124
 33598101 33657410 33705361 33717172 33822772 33879239 33953163 33972311
 34017124 34030460 34049947 34247147 34313733 34330889 34475869 34504485
 34603282 34624218 34663877 34715018 34764257 34780851 34804043 34876692
 34914922 34916290 35078977 35108060 35126365 35184398 35213222 35216676
 35354645 35430336]


In [16]:
# For every lung-related PMID, label and save each GEO series paired with it
parser = ParseSoft(data_dir)
cl = LabelData(label_dir)
n = 0
for pmid in pmid_list:
    # Slicer.PMID turns the PMID into the key path used to look it up in pmid_geo
    geo_pool = Utils.key_get(pmid_geo, Slicer.PMID(pmid))
    for geo in geo_pool:
        print(geo, end=', ')
        softer = SearchSoft(parser.soft_local_path(geo))
        enriched = EnrichSoft(softer.filter_data(softer.parse_rows))()
        cl.save(enriched)
        n = n + 1
print(f'\n{n} GEO data are created.')

GSE112274, GSE121611, GSE122960, GSE128033, GSE128169, GSE133747, GSE137811, GSE137805, GSE137799, GSE124885, GSE132771, GSE147066, GSE145926, GSE140819, GSE135851, GSE135893, GSE136831, GSE217722, GSE162936, GSE162499, GSE162500, GSE162498, GSE158055, GSE166059, GSE166034, GSE166036, GSE166033, GSE166035, GSE166037, GSE168710, GSE148071, GSE164829, GSE156311, GSE168299, 
34 GEO data are created.


In [17]:
# Widen the selection: human rows where ANY text column mentions "lung" (case-insensitive)
human_cm = cm.loc[cm['species'] == 'Human']
is_lung = human_cm.apply(
    lambda row: row.str.lower().str.contains('lung').any(),
    axis=1,
)
human_cm = human_cm.loc[is_lung]
# distinct PMIDs, sorted
human_pmid_list = np.unique(human_cm['PMID'])
print(len(human_pmid_list))

110


In [18]:
# Same labeling pass as for the lung-only list, now over the wider human PMID list
parser = ParseSoft(data_dir)
cl = LabelData(label_dir)
n = 0
for pmid in human_pmid_list:
    # GEO accessions paired with this PMID
    geo_pool = Utils.key_get(pmid_geo, Slicer.PMID(pmid))
    for geo in geo_pool:
        print(geo, end=', ')
        softer = SearchSoft(parser.soft_local_path(geo))
        filtered = softer.filter_data(softer.parse_rows)
        cl.save(EnrichSoft(filtered)())
        n = n + 1
print(f'\n{n} GEO data are created.')

GSE112274, GSE121611, GSE122960, GSE127472, GSE127462, GSE127813, GSE127471, GSE130148, GSE128033, GSE128169, GSE133747, GSE137811, GSE137805, GSE137799, GSE124885, GSE132771, GSE147066, GSE125188, GSE145926, GSE140819, GSE135851, GSE135893, GSE136831, GSE158127, GSE161089, GSE217722, GSE155249, GSE162936, GSE162499, GSE162500, GSE162498, GSE158055, GSE166059, GSE166034, GSE166036, GSE166033, GSE166035, GSE166037, GSE168710, GSE148071, GSE164829, GSE156311, GSE155515, GSE180864, GSE185044, GSE185043, GSE185045, GSE154826, GSE168299, GSE190510, GSE196303, GSE180908, GSE180063, 
53 GEO data are created.


## parse SRR given BioSample and SRX

In [None]:
# both lookup tables were produced in earlier steps and are only loaded here
# samn_srr is passed to ParseSra.parse_srr below — presumably BioSample (SAMN) -> SRR; confirm
samn_srr = Utils.from_json('../results/samn_srr.json')
# srr_fastq is passed to ParseSra.parse_ftp_fastq below — presumably SRR -> FASTQ URLs; confirm
srr_fastq = Utils.from_json('../results/srr_fastq_urls.json')

In [20]:
# Given BioSample, parse SRR accessions if they exist
# NOTE(review): relies on `cl` created in an earlier cell; run the cells in order
for data in Utils.json_iter(label_dir):
    data = ParseSra.parse_srr(data, samn_srr)
    # write the updated record back through the label store
    cl.save(data)

In [None]:
# retrieve FTP URLs of *gz files given their SRR accessions
# NOTE(review): relies on `cl` created in an earlier cell; run the cells in order
for data in Utils.json_iter(label_dir):
    data = ParseSra.parse_ftp_fastq(data, srr_fastq)
    # write the updated record back through the label store
    cl.save(data)

Start GSE133747...	0 out of 8 SRR are updated.
Start GSE128033...	0 out of 0 SRR are updated.
Start GSE128169...	0 out of 0 SRR are updated.
Start GSE154826...	0 out of 249 SRR are updated.
Start GSE112274...	0 out of 2721 SRR are updated.
Start GSE280041...	0 out of 155 SRR are updated.
Start GSE168710...	0 out of 0 SRR are updated.
Start GSE168299...	0 out of 7 SRR are updated.
Start GSE140819...	0 out of 0 SRR are updated.
Start GSE166059...	0 out of 92 SRR are updated.
Start GSE166034...	0 out of 13 SRR are updated.
Start GSE166033...	0 out of 9 SRR are updated.
Start GSE166037...	0 out of 4 SRR are updated.
Start GSE166035...	0 out of 14 SRR are updated.
Start GSE166036...	0 out of 48 SRR are updated.
Start GSE145926...	0 out of 20 SRR are updated.
Start GSE183590...	0 out of 1455 SRR are updated.
Start GSE125188...	0 out of 8 SRR are updated.
Start GSE217722...	0 out of 0 SRR are updated.
Start GSE135851...	0 out of 7 SRR are updated.
Start GSE135893...	0 out of 227 SRR are updat

## select biosamples and collect SRR accessions

In [82]:
# Collect the characteristics and SRR runs of human (taxid 9606) samples that are
# flagged scrnaseq and have at least one SRR entry.
features, sample_srr = {}, {}
cl = LabelData(label_dir)
for data in cl.json_iter():
    if data['taxid'] != '9606':
        continue
    for sample_id, sample in data['samples'].items():
        # .get: a sample with no 'SRR' key is skipped instead of raising KeyError
        srr_runs = sample.get('SRR')
        if sample.get('scrnaseq') and srr_runs:
            features[sample_id] = sample['characteristics']
            sample_srr[sample_id] = srr_runs

# one row per sample, one column per characteristic key, columns in alphabetical order
df = pd.DataFrame.from_dict(features).T
df = df[sorted(df.columns)]
df.to_csv('../data/features.csv')
df

Unnamed: 0,age,cd34+_hsc_donor,cell_line,cell_type,disease_state,fetal_lung_tissue_donor,genotype/variation,group,inoculation_dose,library_type,...,mouse_strain,patient_id,sample_type,samplename_old,sex,technology,time_point,tissue,tissue_type,treatment
GSM3926539,9 weeks,,,,,,,,,,...,,,,,male,,,,,
GSM3926540,9 weeks,,,,,,,,,,...,,,,,female,,,,,
GSM3926541,9 weeks,,,,,,,,,,...,,,,,male,,,,,
GSM3926542,9 weeks,,,,,,,,,,...,,,,,female,,,,,
GSM3926543,3.5 months,,,,,,,,,,...,,,,,male,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM4889468,,,,airway basal stem cells,,,,,,,...,,,,,,10X Genomics Single Cell RNA Sequencing,,,,cigarette smoke
GSM3557941,,,,CD45+CD14-CD16-CD3E+,,,,,,,...,,,,,,,,,,
GSM3557942,,,,CD45+CD14-CD16-CD3E+,,,,,,,...,,,,,,,,,,
GSM3557943,,,,CD45+CD14-CD16-CD3E-,,,,,,,...,,,,,,,,,,


In [65]:
# Prepare the SRR id list for download: one accession per line.
# num_geo counts human (taxid 9606) GEO records; num_sample counts their samples
# that are flagged scrnaseq and have at least one SRR entry.
num_geo = num_sample = 0
cl = LabelData(label_dir)
with open('../results/srr_ids.txt', 'w') as f:
    for data in cl.json_iter():
        if data['taxid'] != '9606':
            continue
        num_geo += 1
        for sample_id, sample in data['samples'].items():
            # .get: a sample with no 'SRR' key is skipped instead of raising KeyError
            srr_runs = sample.get('SRR')
            if not (sample.get('scrnaseq') and srr_runs):
                continue
            num_sample += 1
            for srr_acc, local_path in srr_runs.items():
                # only runs with an empty value are listed — presumably not downloaded yet; confirm
                if not local_path:
                    f.write(srr_acc + '\n')
print(num_geo, num_sample)

52 400


## manual label

### lung cancer key=cell_line
- NCI-H1299 is an epithelial-like cell that was isolated from the lung of a White, 43-year-old, male patient with carcinoma.
- A549 cells were isolated from the lung tissue of a White, male, lung cancer patient.
- Calu-3 epithelial cells are isolated from lung tissue derived from a 25-year-old, White, male patient with lung adenocarcinoma who received prior therapy with cytoxan, bleomycin, and adriamycin. This cell line is valuable for SARS-CoV-2 propagation in vitro, is a suitable transfection host, and has applications in cancer and toxicology research.

In [87]:
def lung_cancer_cell_line(df, names=('H1299', 'Calu3', 'A549'), srr_map=None):
    """Build a FASTQ sample sheet for samples of the given cell lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Sample features indexed by sample id; must have a 'cell_line' column.
    names : iterable of str, optional
        Cell-line names matched exactly against df['cell_line'].
        Defaults to the three lung-cancer lines used in this notebook.
    srr_map : dict, optional
        Maps sample id -> {SRR accession: FASTQ info}. FASTQ info is a mapping
        that may hold 'R1' and 'R2'; empty or None entries are skipped.
        When omitted, the notebook-level `sample_srr` is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'sample', 'fastq_1', 'fastq_2'; one row per SRR run that has
        FASTQ info. 'sample' is '<cell line>_<SRR accession>'.

    Raises
    ------
    KeyError
        If df has no 'cell_line' column, or a matched sample id is not in srr_map.
    """
    if srr_map is None:
        # backward compatible: fall back to the mapping built earlier in the notebook
        srr_map = sample_srr

    rows = []
    for name in names:
        matched = df[df['cell_line'] == name]
        # report how many samples carry this cell-line name
        print(name, len(matched))
        for sample_id in matched.index:
            for srr_acc, fq in srr_map[sample_id].items():
                if fq:
                    rows.append((f"{name}_{srr_acc}", fq.get('R1'), fq.get('R2')))
    return pd.DataFrame(rows, columns=['sample', 'fastq_1', 'fastq_2'])
# build the sample sheet for the lung-cancer cell lines from the feature table
df1 = lung_cancer_cell_line(df)
# export without the index column
df1.to_csv('../data/samplesheet_cellline_lungcancer.csv', index=False)
# preview the first rows
df1.head()

H1299 40
Calu3 90
A549 1


Unnamed: 0,sample,fastq_1,fastq_2
0,H1299_SRR11549938,/home/yuan/data/SRA/SRR115/038/SRR11549938.fas...,
1,H1299_SRR11549939,/home/yuan/data/SRA/SRR115/039/SRR11549939.fas...,
2,H1299_SRR11549940,/home/yuan/data/SRA/SRR115/040/SRR11549940.fas...,
3,H1299_SRR11549941,/home/yuan/data/SRA/SRR115/041/SRR11549941.fas...,
4,H1299_SRR11549942,/home/yuan/data/SRA/SRR115/042/SRR11549942.fas...,


### lung cell line
- The WI-38 cell line is the first human diploid cell line to be used in human vaccine preparation. WI-38 cells were isolated from the lung tissue of a 3-month-old female embryo. WI-38 is used in virucide testing.
 

### colon cancer, key=cell_line
- HCT 116 is an adherent cell line isolated from the colon of a patient with colon cancer. It has a mutation in codon 13 of the ras-proto-oncogene. This cell line is near-diploid and has a relatively stable genetic profile, making it a valuable in vitro model. This line can be utilized in cancer research and gastrointestinal (GI) research.
- Caco-2 [Caco2] is an adherent cell line isolated from colon tissue derived from a patient with colorectal adenocarcinoma. 

### normal cell line
- cell_line="RUES2 human embryonic stem cell". RUES2 is a human embryonic stem cell line derived from a de-identified frozen embryo that was originally generated for reproductive purposes.
- cell_type = "airway basal stem cells"
- cell_type = "CD45+", group="healthy control"

### lung tissue
Normal
- tissue=lung, tissue_type=Normal
- tissue=lung, group=healthy control
- tissue = Peripheral blood mononuclear cell, disease_state=Health donors
patient
- tissue=lung, tissue_type=Tumor
- tissue=lung, disease_state = Coronavirus infected disease-19 (COVID-19)
- tissue=lung, group = severe COVID-19 patient
- tissue=lung, disease_state = Lymphangioleiomyomatosis (LAM) patient
- tissue = Peripheral blood mononuclear cell, disease_state=Patient with ASS-ILD