# prepare dataset
lung cancer, cell line H1299
https://www.cytion.com/NCI-H1299-Cells/300485

In [None]:
import os
import json
import Bio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
# Seed PyTorch's random number generator so torch-based randomness is repeatable.
torch.manual_seed(1)

# IPython magics: load the autoreload extension and use mode 2, which re-imports
# edited modules before each cell executes.
%load_ext autoreload
%autoreload 2

## raw data

Prepare the GEO accessions and put them into the file data/GEO_H1299.csv. These datasets are all from scRNA-seq experiments.
- Non-small cell lung cancer (NSCLC) cell line: H1299
GSE286399: SRR31958404 SRR31958403	
GSE280041: SRR31072090	SRR31072089	SRR31072128	SRR31072129 SRR31072127 SRR31072132	
GSE144357: GSM4286770
GSE121309: SRR8064284 SRR8064285 SRR8064286 SRR8064287
GSE183590 : GSM5562430-GSM5562449
GSE148729: GSM4477968-GSM4477983	

- control:


Download the raw data from GEO:
```
cd ~/bio/nextflow
nextflow run nf-core/fetchngs -r 1.12.0 -profile docker --input ~/bio/scrnaseq_reference/data/GEO_H1299.csv --outdir ~/data/test
```

In [31]:
from parse_soft import ParseSoft
from create_config import CreateConfig

# Locations used by this cell: the directory handed to ParseSoft, the directory
# handed to CreateConfig, and the directory handed to fetch_geo for each series.
data_dir = '/home/yuan/data'
config_dir = '../data'
raw_dir = '/home/yuan/data/lung_cancer'

softer = ParseSoft(data_dir)
configer = CreateConfig(config_dir)

# GEO series for the H1299 cell line.
# NOTE(review): the original comment marked 'GSE286399' as "failed", yet it is
# still listed here — confirm whether it should stay.
geo_list = [
    'GSE286399',
    'GSE280041',
    'GSE144357',
    'GSE121309',
    'GSE183590',
    'GSE148729',
]

# For every series: collect its sample ids, then print whatever fetch_geo
# returns for it (the captured output shows one nextflow command per series).
for geo in geo_list:
    samples = softer.sample_ids(geo)
    cmd = configer.fetch_geo(geo, samples, raw_dir)
    print(cmd)
    

nextflow run nf-core/fetchngs -r 1.12.0 -profile docker -resume -c /home/yuan/bio/scrnaseq_reference/data/ids/GSE286399.config
nextflow run nf-core/fetchngs -r 1.12.0 -profile docker -resume -c /home/yuan/bio/scrnaseq_reference/data/ids/GSE280041.config
nextflow run nf-core/fetchngs -r 1.12.0 -profile docker -resume -c /home/yuan/bio/scrnaseq_reference/data/ids/GSE144357.config
nextflow run nf-core/fetchngs -r 1.12.0 -profile docker -resume -c /home/yuan/bio/scrnaseq_reference/data/ids/GSE121309.config
nextflow run nf-core/fetchngs -r 1.12.0 -profile docker -resume -c /home/yuan/bio/scrnaseq_reference/data/ids/GSE183590.config
nextflow run nf-core/fetchngs -r 1.12.0 -profile docker -resume -c /home/yuan/bio/scrnaseq_reference/data/ids/GSE148729.config


### Analyze CellMarker 2.0

In [38]:
# Open the Cell Marker workbook and read its 'seq' sheet into a DataFrame,
# then preview the first rows.
cellmarker = pd.ExcelFile('../data/Cell_marker_Seq.xlsx')
cm = cellmarker.parse(sheet_name='seq')
cm.head()

Unnamed: 0,species,tissue_class,tissue_type,uberonongology_id,cancer_type,cell_type,cell_name,cellontology_id,marker,Symbol,GeneID,Genetype,Genename,UNIPROTID,technology_seq,marker_source,PMID,Title,journal,year
0,Human,Abdomen,Abdominal fat pad,,Normal,Normal cell,Brown adipocyte,CL_0000449,FABP4,FABP4,2167.0,protein_coding,fatty acid binding protein 4,E7DVW4,sci-RNA-seq,Experiment,32355218,Single-cell transcriptional networks in differ...,Nature communications,2020
1,Human,Abdomen,Abdominal fat pad,,Normal,Normal cell,Brown adipocyte,CL_0000449,PDGFRα,,,,,,sci-RNA-seq,Experiment,32355218,Single-cell transcriptional networks in differ...,Nature communications,2020
2,Human,Abdomen,Abdominal fat pad,,Normal,Normal cell,Brown adipocyte,CL_0000449,UCP1,UCP1,7350.0,protein_coding,uncoupling protein 1,P25874,sci-RNA-seq,Experiment,32355218,Single-cell transcriptional networks in differ...,Nature communications,2020
3,Mouse,Abdomen,Muscle,UBERON_0001630,Normal,Normal cell,Fibro-adipogenic progenitor cell,,Wisp1,Ccn4,22402.0,protein_coding,cellular communication network factor 4,O54775,10x Chromium,Experiment,35439171,An estrogen-sensitive fibroblast population dr...,JCI insight,2022
4,Mouse,Abdomen,Muscle,UBERON_0001630,Normal,Normal cell,Myoblast,CL_0000056,Myod1,Myod1,17927.0,protein_coding,myogenic differentiation 1,P10085,10x Chromium,Experiment,35439171,An estrogen-sensitive fibroblast population dr...,JCI insight,2022


In [54]:
# Number of marker records per sequencing technology, most frequent first.
cm.loc[:, 'technology_seq'].value_counts()

technology_seq
10x Chromium                                                                13402
Single-cell sequencing                                                       8886
sci-RNA-seq                                                                   972
Smart-seq2                                                                    829
CyTOF                                                                         541
Drop-seq                                                                      503
snRNA-seq                                                                     347
10x Chromium/Smart-Seq2                                                       314
Seq-Well                                                                      145
inDrops                                                                       144
scATAC-seq                                                                    132
CEL-Seq2                                                                       84
S

In [44]:
# Column names of the marker table, as a plain Python list.
cm.columns.tolist()

['species',
 'tissue_class',
 'tissue_type',
 'uberonongology_id',
 'cancer_type',
 'cell_type',
 'cell_name',
 'cellontology_id',
 'marker',
 'Symbol',
 'GeneID',
 'Genetype',
 'Genename',
 'UNIPROTID',
 'technology_seq',
 'marker_source',
 'PMID',
 'Title',
 'journal',
 'year']

In [55]:
# Select human marker records whose tissue_type contains 'Lung' and whose
# cell_type contains 'Normal'.
# na=False makes rows with a missing tissue_type / cell_type count as
# "no match", so each mask is strictly boolean instead of carrying NaN into
# the `&` combination and the row filter.
is_human = cm['species'] == 'Human'
is_lung = cm['tissue_type'].str.contains('Lung', na=False)
is_normal = cm['cell_type'].str.contains('Normal', na=False)
normal_cm = cm[is_human & is_lung & is_normal]

# Sorted, de-duplicated PMID values of the selected records.
pmid_list = np.unique(normal_cm['PMID'])
print(pmid_list)

[30554520 30784054 31221805 31233341 31299246 31333652 31405848 31834999
 31840053 31892341 31996486 32004478 32109386 32112047 32122885 32246845
 32317643 32398875 32497778 32603599 32810439 32832598 32832599 32882007
 32968798 32973742 33057196 33083004 33178221 33382972 33500718 33657410
 33705361 33717172 33822772 33879239 34030460 34049947 34313733 34330889
 34504485 34603282 34715018 34876692 34914922 35078977 35213222 35216676
 35354645]


In [None]:
from parse_soft import ParseSoft

data_dir = '/home/yuan/data'
softer = ParseSoft(data_dir)
# Pass the PMIDs gathered from the marker table (pmid_list, built in an earlier
# cell) to ParseSoft.pmid_to_geo and print the result.
# NOTE(review): presumably this maps PubMed IDs to GEO accessions; the return
# type is not visible from this notebook — confirm against parse_soft.
normal_geo = softer.pmid_to_geo(pmid_list)
print(normal_geo)

5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000, 90000, 

In [None]:
# NOTE(review): geo_pool is not assigned in any cell visible in this notebook.
# Unless it is defined elsewhere, this cell fails with NameError on a fresh
# kernel — confirm where it should come from.
geo_pool

In [None]:
# lung normal cell
# Same flow as the cancer cell-line cell: for each GEO series, collect its
# sample ids and print whatever CreateConfig.fetch_geo returns for it.
from parse_soft import ParseSoft
from create_config import CreateConfig

data_dir = '/home/yuan/data'
softer = ParseSoft(data_dir)
config_dir = '../data'
configer = CreateConfig(config_dir)

# NOTE(review): this is the same directory used for the cancer cell line —
# confirm that normal-lung data should be stored under 'lung_cancer' too.
raw_dir = '/home/yuan/data/lung_cancer'
# TODO: replace the empty placeholders with the normal-lung GEO series accessions.
geo_list = ['', '', '', '', '', '']
for geo in geo_list:
    if not geo.strip():
        # Unfilled placeholder: there is no accession to look up, so skip it
        # instead of passing an empty id to sample_ids / fetch_geo.
        continue
    samples = softer.sample_ids(geo)
    cmd = configer.fetch_geo(geo, samples, raw_dir)
    print(cmd)

In [57]:
# Scratch check: a dict mapping each of 0..3 to an initial value of 0.
dict.fromkeys(range(4), 0)

{0: 0, 1: 0, 2: 0, 3: 0}