In [5]:
from patternq.dataset import sample_measurements
%%capture
%load_ext autoreload
%autoreload 2
import pandas as pd
import patternq.query as pqq
import patternq.dataset as pqd
import patternq.reference as pqr
import patternq.schema as pqs

## PatternQ Query Wrappers Example

This notebook shows an example of access data in the Pattern Data Commons,
using the query wrapping convience routines in `patternq.dataset` and `patternq.reference`.

In [6]:
%%capture
import os
os.environ
# confirm PATTERNQ_API_KEY is set to your API key here.

In [13]:
# using tcga data
dataset = 'tcga-brca'
database = 'tcga-brca'

In [15]:
# verify schema version
pqs.schema_info(db_name=database)

SchemaInfo(name=':org.rcrf/candel', version='1.3.1')

In [16]:
subjects = pqd.subjects('tcga-brca', db_name='tcga-brca')
subjects

Unnamed: 0,subject-HLA-A-type,subject-uid,subject-race,subject-freetext-disease,subject-HLA-B-type,db-id,subject-age,subject-sex,subject-HLA-C-type,subject-dead,subject-id,subject-ethnicity
0,[hla_a_68_01_02],"[tcga-brca, 48958e0a033b76dd2cde746ec340d7b8]",:subject.race/white,Invasive breast carcinoma,"[hla_b_40_01_02, hla_b_44_02_01_03]",17592187509997,50.0,:subject.sex/female,"[hla_c_03_04_01_01, hla_c_07_04_01]",True,TCGA-AR-A1AR,
1,"[hla_a_25_01_01, hla_a_31_01_02]","[tcga-brca, cdff6ccae7b3a01e06583b6eb6935647]",:subject.race/white,Invasive breast carcinoma,"[hla_b_18_01_01_02, hla_b_51_08]",17592187510001,68.0,:subject.sex/female,"[hla_c_12_03_01_01, hla_c_16_02_01]",True,TCGA-BH-A1EO,
2,[hla_a_03_01_01_01],"[tcga-brca, 58de8decc5806306e122d7d2e0361077]",:subject.race/white,Invasive breast carcinoma,[hla_b_07_02_01],17592187510005,35.0,:subject.sex/female,[hla_c_07_02_01_03],True,TCGA-BH-A1ES,
3,[hla_a_26_01_03],"[tcga-brca, e33c10ac862066f01799593f93226d63]",:subject.race/white,Invasive breast carcinoma,"[hla_b_38_01_01, hla_b_40_02_01]",17592187510009,55.0,:subject.sex/female,"[hla_c_02_02_02, hla_c_12_03_01_01]",True,TCGA-BH-A1ET,
4,[hla_a_02_01_01_01],"[tcga-brca, 33c8d0592a63e6d929493c6fee83049a]",:subject.race/white,Invasive breast carcinoma,"[hla_b_13_02_01, hla_b_18_01_01_02]",17592187510013,83.0,:subject.sex/female,"[hla_c_06_02_01_01, hla_c_07_01_01_01]",True,TCGA-BH-A1EU,
...,...,...,...,...,...,...,...,...,...,...,...,...
1093,"[hla_a_01_01_01_01, hla_a_26_01_01]","[tcga-brca, 28355fea1611cf76788e36f7d8d1e63b]",:subject.race/white,Invasive breast carcinoma,"[hla_b_07_02_01, hla_b_08_01_07]",17592187514126,44.0,:subject.sex/female,"[hla_c_07_01_01_01, hla_c_07_02_01_03]",False,TCGA-E2-A1B6,
1094,"[hla_a_01_01_01_01, hla_a_24_02_01_01]","[tcga-brca, b49d76c1ed3ce2f807c30aca593700e5]",,Invasive breast carcinoma,"[hla_b_08_01_01, hla_b_44_05_01]",17592187514130,63.0,:subject.sex/female,"[hla_c_02_02_02, hla_c_07_01_01_01]",False,TCGA-E2-A1BC,
1095,[hla_a_02_01_01_01],"[tcga-brca, 679b389e478544f624e010f2fba4951c]",:subject.race/white,Invasive breast carcinoma,"[hla_b_07_02_01, hla_b_35_01_01_02]",17592187514133,53.0,:subject.sex/female,"[hla_c_04_01_01_01, hla_c_07_02_01_03]",False,TCGA-E2-A1BD,
1096,"[hla_a_31_01_02, hla_a_80_01_01_01]","[tcga-brca, ac707e57a213016c5f7f1a643c938bc0]",:subject.race/african-american,Invasive breast carcinoma,"[hla_b_27_05_02, hla_b_40_01_02]",17592187517522,,:subject.sex/female,"[hla_c_03_04_01_01, hla_c_06_02_01_01]",False,TCGA-OL-A66H,:subject.ethnicity/not-hispanic


Because a patient can have multiple HLA A, B, or C types, we explode the column here
(create multiple rows when we have multiple values in a list in a single cell in the DataFrame).

In [17]:
by_hla = subjects[['subject-id', 'subject-HLA-A-type', 'subject-HLA-B-type', 'subject-HLA-C-type']].explode(column='subject-HLA-A-type')
by_hla = by_hla.explode(column='subject-HLA-B-type')
by_hla = by_hla.explode(column='subject-HLA-C-type')
by_hla

Unnamed: 0,subject-id,subject-HLA-A-type,subject-HLA-B-type,subject-HLA-C-type
0,TCGA-AR-A1AR,hla_a_68_01_02,hla_b_40_01_02,hla_c_03_04_01_01
0,TCGA-AR-A1AR,hla_a_68_01_02,hla_b_40_01_02,hla_c_07_04_01
0,TCGA-AR-A1AR,hla_a_68_01_02,hla_b_44_02_01_03,hla_c_03_04_01_01
0,TCGA-AR-A1AR,hla_a_68_01_02,hla_b_44_02_01_03,hla_c_07_04_01
1,TCGA-BH-A1EO,hla_a_25_01_01,hla_b_18_01_01_02,hla_c_12_03_01_01
...,...,...,...,...
1096,TCGA-OL-A66H,hla_a_80_01_01_01,hla_b_27_05_02,hla_c_03_04_01_01
1096,TCGA-OL-A66H,hla_a_80_01_01_01,hla_b_27_05_02,hla_c_06_02_01_01
1096,TCGA-OL-A66H,hla_a_80_01_01_01,hla_b_40_01_02,hla_c_03_04_01_01
1096,TCGA-OL-A66H,hla_a_80_01_01_01,hla_b_40_01_02,hla_c_06_02_01_01


Here we use the HLA B data to limit follow-on queries to patients who are HLA B27 positive, a marker common in autoimmune disease.

In [18]:
by_hla['subject-HLA-B-type'].value_counts()

subject-HLA-B-type
hla_b_07_02_01       841
hla_b_08_01_01       508
hla_b_15_01_01_01    423
hla_b_44_02_01_01    393
hla_b_35_01_01_02    332
                    ... 
hla_b_51_07_02         2
hla_b_39_01_03         1
hla_b_44_02_15         1
hla_b_14_03            1
hla_b_07_02_06         1
Name: count, Length: 102, dtype: int64

In [19]:
b27_matches = by_hla['subject-HLA-B-type'].astype('string').str.contains("b_27")
b27_matches.value_counts()
b27_sub = by_hla[b27_matches]
b27_sub

Unnamed: 0,subject-id,subject-HLA-A-type,subject-HLA-B-type,subject-HLA-C-type
15,TCGA-E2-A15I,hla_a_03_01_19,hla_b_27_05_02,hla_c_01_02_01
15,TCGA-E2-A15I,hla_a_03_01_19,hla_b_27_05_02,hla_c_15_02_01
15,TCGA-E2-A15I,hla_a_26_01_03,hla_b_27_05_02,hla_c_01_02_01
15,TCGA-E2-A15I,hla_a_26_01_03,hla_b_27_05_02,hla_c_15_02_01
33,TCGA-B6-A1KN,hla_a_03_01_01_01,hla_b_27_05_02,hla_c_02_02_02
...,...,...,...,...
1064,TCGA-AR-A1AQ,hla_a_02_06_05,hla_b_27_02_01,hla_c_16_01_01
1096,TCGA-OL-A66H,hla_a_31_01_02,hla_b_27_05_02,hla_c_03_04_01_01
1096,TCGA-OL-A66H,hla_a_31_01_02,hla_b_27_05_02,hla_c_06_02_01_01
1096,TCGA-OL-A66H,hla_a_80_01_01_01,hla_b_27_05_02,hla_c_03_04_01_01


Then, for patients in this group, we find which assays have been run on their samples.

In [22]:
sub_ids = b27_sub['subject-id'].unique().tolist()
assays_for_sub = pqd.patient_assays(dataset, sub_ids, db_name='tcga-brca')
assays_for_sub

Unnamed: 0,subject-id,sample-id,assay-tech,assay-name,measurement-set-name
0,TCGA-A2-A0ER,tcga-a2-a0er-10a,:assay.technology/RNA-seq,RNA-seq data,immune-cell-deconvolution
1,TCGA-A2-A0ST,tcga-a2-a0st-10a,:assay.technology/RNA-seq,RNA-seq data,immune-cell-deconvolution
2,TCGA-A7-A13H,tcga-a7-a13h-01a,:assay.technology/SNP-array,cnv and variants,snp CNV data
3,TCGA-AN-A0AL,tcga-an-a0al-01a,:assay.technology/SNP-array,cnv and variants,snp CNV data
4,TCGA-A8-A099,tcga-a8-a099-01a,:assay.technology/WES,WES,baseline mutations
...,...,...,...,...,...
596,TCGA-A2-A04V,tcga-a2-a04v-01a,:assay.technology/RNA-seq,RNA-seq data,gx
597,TCGA-AC-A3W6,tcga-ac-a3w6-01a,:assay.technology/WES,WES,baseline mutations
598,TCGA-OL-A66H,tcga-ol-a66h-01a,:assay.technology/miscellaneous,Tumor purity,tumor purity
599,TCGA-E9-A1R6,tcga-e9-a1r6-01a,:assay.technology/RNA-seq,RNA-seq data,immune-cell-deconvolution


In [23]:
samples = pqd.samples("tcga-brca", db_name='tcga-brca')
samples

Unnamed: 0,sample-id,sample-tumor-type,sample-specimen,sample-metastasis,sample-uid,db-id,sample-timepoint-timepoint-id,sample-subject-subject-id,sample-study-day-study-day-id,sample-type-db-id,sample-gdc-anatomic-site-gdc-anatomic-site-name
0,tcga-a2-a0eu-01a,breast,:sample.specimen/fresh-frozen,False,"[tcga-brca, 1083184581b3c24ac85291d75a832b9a]",17592187518720,sample procurement,TCGA-A2-A0EU,35,17592186045420,Breast
1,tcga-a2-a0eu-10a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, e96e9d5e9e41bba015ba8e4daefda967]",17592187518803,sample procurement,TCGA-A2-A0EU,35,17592186045421,Breast
2,tcga-a7-a0d9-01a,breast,:sample.specimen/fresh-frozen,False,"[tcga-brca, 8c1dbb6ba31a61b3b978a1f93dccb86c]",17592187518823,sample procurement,TCGA-A7-A0D9,10,17592186045420,Breast
3,tcga-a7-a0d9-10a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, ba8add19b462211346377f6c7835cb6a]",17592187519258,sample procurement,TCGA-A7-A0D9,10,17592186045421,Breast
4,tcga-ao-a0jf-01a,breast,:sample.specimen/fresh-frozen,False,"[tcga-brca, 0bffbcada51f70f1978d207e22089d60]",17592187519626,sample procurement,TCGA-AO-A0JF,57,17592186045420,Breast
...,...,...,...,...,...,...,...,...,...,...,...
2291,tcga-b6-a402-10a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 9ede5e348369b0d13d4c7989eb565e3c]",17592188110342,sample procurement,TCGA-B6-A402,0,17592186045421,Breast
2292,tcga-e9-a1ne-11a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 85358f998e91a477db9bc868554ea46c]",17592188110466,sample procurement,TCGA-E9-A1NE,0,17592186045421,Breast
2293,tcga-ol-a5da-10a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 7882611517385fe91546199a73b523c7]",17592188111439,sample procurement,TCGA-OL-A5DA,33,17592186045421,Breast
2294,tcga-bh-a5j0-10a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 934e529d67c8a4b6e7840615131ce315]",17592188112629,sample procurement,TCGA-BH-A5J0,104,17592186045421,Breast


In [24]:
samples_sub = samples[samples["sample-subject-subject-id"].apply(lambda x: x in sub_ids)]
samples_sub

Unnamed: 0,sample-id,sample-tumor-type,sample-specimen,sample-metastasis,sample-uid,db-id,sample-timepoint-timepoint-id,sample-subject-subject-id,sample-study-day-study-day-id,sample-type-db-id,sample-gdc-anatomic-site-gdc-anatomic-site-name
69,tcga-e2-a15i-01a,breast,:sample.specimen/fresh-frozen,False,"[tcga-brca, 6418023f5ea1c0660b0ae8e57363a1d8]",17592187539851,sample procurement,TCGA-E2-A15I,0,17592186045420,Breast
70,tcga-e2-a15i-10a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 3e3a7b86768f0bff5fabc547366d9091]",17592187540016,sample procurement,TCGA-E2-A15I,0,17592186045421,Breast
71,tcga-e2-a15i-11a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 9538fcf5d7e50fa6e8304367b61d704a]",17592187540116,sample procurement,TCGA-E2-A15I,0,17592186045421,Breast
105,tcga-b6-a1kn-01a,breast,:sample.specimen/fresh-frozen,False,"[tcga-brca, 83fe2368405c6e3d2a08288d83f8f0db]",17592187547478,sample procurement,TCGA-B6-A1KN,36,17592186045420,Breast
106,tcga-b6-a1kn-10a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 870c4800388cc94b29d58203db3a065b]",17592187548097,sample procurement,TCGA-B6-A1KN,36,17592186045421,Breast
...,...,...,...,...,...,...,...,...,...,...,...
2135,tcga-ar-a1aq-01a,breast,:sample.specimen/fresh-frozen,False,"[tcga-brca, ba15ce01b2d7006adcf93c4c164fd668]",17592188092973,sample procurement,TCGA-AR-A1AQ,8,17592186045420,Breast
2136,tcga-ar-a1aq-10a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 55cfb9a0147ff218289d4a58f76624a9]",17592188093360,sample procurement,TCGA-AR-A1AQ,8,17592186045421,Breast
2256,tcga-d8-a27e-10a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 3c207e60055076c99b42219f1cc01442]",17592188109129,sample procurement,TCGA-D8-A27E,41,17592186045421,Breast
2284,tcga-e9-a1n9-11a,,:sample.specimen/fresh-frozen,False,"[tcga-brca, 93a83f1dd48b8968e49cd72d2eedac35]",17592188110096,sample procurement,TCGA-E9-A1N9,0,17592186045421,Breast


#### Clinical Data

Because this is open access TCGA data, we only ahve access to the PFS, OS, and recurrence clinical data.

In [25]:
clin_sum = pqd.clinical_summary("tcga-brca", db_name='tcga-brca')
clin_sum

Unnamed: 0,clinical-observation-set-name
0,pfs os recurrence


In [26]:
samples_for_meas = assays_for_sub[assays_for_sub['measurement-set-name'] == 'baseline mutations']
samples_for_meas

Unnamed: 0,subject-id,sample-id,assay-tech,assay-name,measurement-set-name
4,TCGA-A8-A099,tcga-a8-a099-01a,:assay.technology/WES,WES,baseline mutations
28,TCGA-A7-A0DA,tcga-a7-a0da-01a,:assay.technology/WES,WES,baseline mutations
51,TCGA-AO-A12G,tcga-ao-a12g-01a,:assay.technology/WES,WES,baseline mutations
62,TCGA-A2-A0EO,tcga-a2-a0eo-01a,:assay.technology/WES,WES,baseline mutations
71,TCGA-A2-A04X,tcga-a2-a04x-01a,:assay.technology/WES,WES,baseline mutations
...,...,...,...,...,...
586,TCGA-E9-A1R4,tcga-e9-a1r4-01a,:assay.technology/WES,WES,baseline mutations
588,TCGA-A8-A07B,tcga-a8-a07b-01a,:assay.technology/WES,WES,baseline mutations
591,TCGA-A8-A06T,tcga-a8-a06t-01a,:assay.technology/WES,WES,baseline mutations
597,TCGA-AC-A3W6,tcga-ac-a3w6-01a,:assay.technology/WES,WES,baseline mutations


In [54]:
assays_for_sub['measurement-set-name'].value_counts()

measurement-set-name
immune-cell-deconvolution    173
snp CNV data                 170
gx                            91
tumor purity                  85
baseline mutations            82
Name: count, dtype: int64

#### Samples and Measurements

This query selects the measurements for all samples that were included in a particular measurement set. In this case, the WES
(whole exome sequencing) data at baseline, with measurement set name 'baseline mutations').

In [27]:
meas_samples = pqd.sample_measurements("tcga-brca", "baseline mutations",
                                       samples_for_meas['sample-id'].tolist(), db_name='tcga-brca',
                                       timeout=120)
meas_samples

Unnamed: 0,db-id,measurement-id,measurement-uid,measurement-sample-sample-id,measurement-variant-variant-id
0,17592234968693,tcga-a1-a0sh-01a-GRCh37:chr23:+:108911440:1089...,"[tcga-brca, a98f5fd31e7eb25a11aabcc29df6ebd7]",tcga-a1-a0sh-01a,GRCh37:chr23:+:108911440:108911440/C/T
1,17592234968695,tcga-a1-a0sh-01a-GRCh37:chr1:+:247040513:24704...,"[tcga-brca, 04a0aebe4f1e1662f62cee204cbc74b8]",tcga-a1-a0sh-01a,GRCh37:chr1:+:247040513:247040513/C/A
2,17592234968697,tcga-a1-a0sh-01a-GRCh37:chr6:+:85400295:854002...,"[tcga-brca, 2743c86cf741a47ec9aa882d755f01d8]",tcga-a1-a0sh-01a,GRCh37:chr6:+:85400295:85400295/G/A
3,17592234968699,tcga-a1-a0sh-01a-GRCh37:chr2:+:61965629:619656...,"[tcga-brca, edbd0160cf703fb987f0810ebcb98d62]",tcga-a1-a0sh-01a,GRCh37:chr2:+:61965629:61965629/T/C
4,17592234968701,tcga-a1-a0sh-01a-GRCh37:chr19:+:117865025:1178...,"[tcga-brca, cf54bb287c1d40d26e2b4af7551038de]",tcga-a1-a0sh-01a,GRCh37:chr19:+:117865025:117865025/C/G
...,...,...,...,...,...
8028,17592235128883,tcga-ol-a66h-01a-GRCh37:chr3:+:12447485:124474...,"[tcga-brca, 62f30b4472c51f6dedf626fa89c3cf34]",tcga-ol-a66h-01a,GRCh37:chr3:+:12447485:12447485/C/G
8029,17592235128885,tcga-ol-a66h-01a-GRCh37:chr8:+:10464643:104646...,"[tcga-brca, d584100cdf5d68946a6a47e3254d550e]",tcga-ol-a66h-01a,GRCh37:chr8:+:10464643:10464643/T/A
8030,17592235128887,tcga-ol-a66h-01a-GRCh37:chr17:+:19584880:19584...,"[tcga-brca, 23aa10c05ed127c8ea6ce17a3b3a6f29]",tcga-ol-a66h-01a,GRCh37:chr17:+:19584880:19584880/A/G
8031,17592235128889,tcga-ol-a66h-01a-GRCh37:chr4:+:88411538:884115...,"[tcga-brca, 985874c25fcacc499fb1b59e6eecf464]",tcga-ol-a66h-01a,GRCh37:chr4:+:88411538:88411538/C/G


#### Variant Data

Here we access variant data for each of the samples 

In [28]:
var = pqr.variants(meas_samples['measurement-variant-variant-id'].tolist(), db_name='tcga-brca')
var

Unnamed: 0,variant-type,variant-ref-allele,variant-id,variant-alt-allele,db-id,variant-classification,variant-gene-gene-hgnc-symbol,variant-genomic-coordinates-genomic-coordinate-id,unify-import-most-recent-db-id
0,:variant.type/snp,C,GRCh37:chr23:+:108911440:108911440/C/T,T,17592187348366,:variant.classification/missense,CNTN4,GRCh37:chr23:+:108911440:108911440,17592186177812
1,:variant.type/snp,C,GRCh37:chr1:+:247040513:247040513/C/A,A,17592187348368,:variant.classification/nonsense,COL14A1,GRCh37:chr1:+:247040513:247040513,17592186177812
2,:variant.type/snp,G,GRCh37:chr6:+:85400295:85400295/G/A,A,17592187348370,:variant.classification/missense,COL7A1,GRCh37:chr6:+:85400295:85400295,17592186177812
3,:variant.type/snp,T,GRCh37:chr2:+:61965629:61965629/T/C,C,17592187348372,:variant.classification/missense,CPEB2,GRCh37:chr2:+:61965629:61965629,17592186177812
4,:variant.type/snp,C,GRCh37:chr19:+:117865025:117865025/C/G,G,17592187348374,:variant.classification/missense,CSMD1,GRCh37:chr19:+:117865025:117865025,17592186177812
...,...,...,...,...,...,...,...,...,...
8014,:variant.type/snp,C,GRCh37:chr3:+:12447485:12447485/C/G,G,17592187508556,:variant.classification/missense,PARP8,GRCh37:chr3:+:12447485:12447485,17592186177812
8015,:variant.type/snp,T,GRCh37:chr8:+:10464643:10464643/T/A,A,17592187508558,:variant.classification/missense,ASB9,GRCh37:chr8:+:10464643:10464643,17592186177812
8016,:variant.type/snp,A,GRCh37:chr17:+:19584880:19584880/A/G,G,17592187508560,:variant.classification/missense,ZNF660,GRCh37:chr17:+:19584880:19584880,17592186177812
8017,:variant.type/snp,C,GRCh37:chr4:+:88411538:88411538/C/G,G,17592187508562,:variant.classification/missense,SIPA1L2,GRCh37:chr4:+:88411538:88411538,17592186177812


As is often the case with the query wrappers, we now have two tables (sets of tuples) that need to be
joined together with the `merge` method in `pandas.DataFrame`, so that all measurement, variant, and
sample data is linked.

In [29]:
w_var = meas_samples.merge(
    var[['variant-id',
         'variant-ref-allele',
         'variant-alt-allele',
         'variant-gene-gene-hgnc-symbol',
         'variant-genomic-coordinates-genomic-coordinate-id',
         'variant-type',
         'variant-classification']],
    left_on='measurement-variant-variant-id',
    right_on='variant-id',
    how='left'
)
w_var

Unnamed: 0,db-id,measurement-id,measurement-uid,measurement-sample-sample-id,measurement-variant-variant-id,variant-id,variant-ref-allele,variant-alt-allele,variant-gene-gene-hgnc-symbol,variant-genomic-coordinates-genomic-coordinate-id,variant-type,variant-classification
0,17592234968693,tcga-a1-a0sh-01a-GRCh37:chr23:+:108911440:1089...,"[tcga-brca, a98f5fd31e7eb25a11aabcc29df6ebd7]",tcga-a1-a0sh-01a,GRCh37:chr23:+:108911440:108911440/C/T,GRCh37:chr23:+:108911440:108911440/C/T,C,T,CNTN4,GRCh37:chr23:+:108911440:108911440,:variant.type/snp,:variant.classification/missense
1,17592234968695,tcga-a1-a0sh-01a-GRCh37:chr1:+:247040513:24704...,"[tcga-brca, 04a0aebe4f1e1662f62cee204cbc74b8]",tcga-a1-a0sh-01a,GRCh37:chr1:+:247040513:247040513/C/A,GRCh37:chr1:+:247040513:247040513/C/A,C,A,COL14A1,GRCh37:chr1:+:247040513:247040513,:variant.type/snp,:variant.classification/nonsense
2,17592234968697,tcga-a1-a0sh-01a-GRCh37:chr6:+:85400295:854002...,"[tcga-brca, 2743c86cf741a47ec9aa882d755f01d8]",tcga-a1-a0sh-01a,GRCh37:chr6:+:85400295:85400295/G/A,GRCh37:chr6:+:85400295:85400295/G/A,G,A,COL7A1,GRCh37:chr6:+:85400295:85400295,:variant.type/snp,:variant.classification/missense
3,17592234968699,tcga-a1-a0sh-01a-GRCh37:chr2:+:61965629:619656...,"[tcga-brca, edbd0160cf703fb987f0810ebcb98d62]",tcga-a1-a0sh-01a,GRCh37:chr2:+:61965629:61965629/T/C,GRCh37:chr2:+:61965629:61965629/T/C,T,C,CPEB2,GRCh37:chr2:+:61965629:61965629,:variant.type/snp,:variant.classification/missense
4,17592234968701,tcga-a1-a0sh-01a-GRCh37:chr19:+:117865025:1178...,"[tcga-brca, cf54bb287c1d40d26e2b4af7551038de]",tcga-a1-a0sh-01a,GRCh37:chr19:+:117865025:117865025/C/G,GRCh37:chr19:+:117865025:117865025/C/G,C,G,CSMD1,GRCh37:chr19:+:117865025:117865025,:variant.type/snp,:variant.classification/missense
...,...,...,...,...,...,...,...,...,...,...,...,...
8028,17592235128883,tcga-ol-a66h-01a-GRCh37:chr3:+:12447485:124474...,"[tcga-brca, 62f30b4472c51f6dedf626fa89c3cf34]",tcga-ol-a66h-01a,GRCh37:chr3:+:12447485:12447485/C/G,GRCh37:chr3:+:12447485:12447485/C/G,C,G,PARP8,GRCh37:chr3:+:12447485:12447485,:variant.type/snp,:variant.classification/missense
8029,17592235128885,tcga-ol-a66h-01a-GRCh37:chr8:+:10464643:104646...,"[tcga-brca, d584100cdf5d68946a6a47e3254d550e]",tcga-ol-a66h-01a,GRCh37:chr8:+:10464643:10464643/T/A,GRCh37:chr8:+:10464643:10464643/T/A,T,A,ASB9,GRCh37:chr8:+:10464643:10464643,:variant.type/snp,:variant.classification/missense
8030,17592235128887,tcga-ol-a66h-01a-GRCh37:chr17:+:19584880:19584...,"[tcga-brca, 23aa10c05ed127c8ea6ce17a3b3a6f29]",tcga-ol-a66h-01a,GRCh37:chr17:+:19584880:19584880/A/G,GRCh37:chr17:+:19584880:19584880/A/G,A,G,ZNF660,GRCh37:chr17:+:19584880:19584880,:variant.type/snp,:variant.classification/missense
8031,17592235128889,tcga-ol-a66h-01a-GRCh37:chr4:+:88411538:884115...,"[tcga-brca, 985874c25fcacc499fb1b59e6eecf464]",tcga-ol-a66h-01a,GRCh37:chr4:+:88411538:88411538/C/G,GRCh37:chr4:+:88411538:88411538/C/G,C,G,SIPA1L2,GRCh37:chr4:+:88411538:88411538,:variant.type/snp,:variant.classification/missense


With that additional context, we can narrow the query down to look at missense mutations only.

In [30]:
w_var[w_var['variant-classification'] == ':variant.classification/missense']

Unnamed: 0,db-id,measurement-id,measurement-uid,measurement-sample-sample-id,measurement-variant-variant-id,variant-id,variant-ref-allele,variant-alt-allele,variant-gene-gene-hgnc-symbol,variant-genomic-coordinates-genomic-coordinate-id,variant-type,variant-classification
0,17592234968693,tcga-a1-a0sh-01a-GRCh37:chr23:+:108911440:1089...,"[tcga-brca, a98f5fd31e7eb25a11aabcc29df6ebd7]",tcga-a1-a0sh-01a,GRCh37:chr23:+:108911440:108911440/C/T,GRCh37:chr23:+:108911440:108911440/C/T,C,T,CNTN4,GRCh37:chr23:+:108911440:108911440,:variant.type/snp,:variant.classification/missense
2,17592234968697,tcga-a1-a0sh-01a-GRCh37:chr6:+:85400295:854002...,"[tcga-brca, 2743c86cf741a47ec9aa882d755f01d8]",tcga-a1-a0sh-01a,GRCh37:chr6:+:85400295:85400295/G/A,GRCh37:chr6:+:85400295:85400295/G/A,G,A,COL7A1,GRCh37:chr6:+:85400295:85400295,:variant.type/snp,:variant.classification/missense
3,17592234968699,tcga-a1-a0sh-01a-GRCh37:chr2:+:61965629:619656...,"[tcga-brca, edbd0160cf703fb987f0810ebcb98d62]",tcga-a1-a0sh-01a,GRCh37:chr2:+:61965629:61965629/T/C,GRCh37:chr2:+:61965629:61965629/T/C,T,C,CPEB2,GRCh37:chr2:+:61965629:61965629,:variant.type/snp,:variant.classification/missense
4,17592234968701,tcga-a1-a0sh-01a-GRCh37:chr19:+:117865025:1178...,"[tcga-brca, cf54bb287c1d40d26e2b4af7551038de]",tcga-a1-a0sh-01a,GRCh37:chr19:+:117865025:117865025/C/G,GRCh37:chr19:+:117865025:117865025/C/G,C,G,CSMD1,GRCh37:chr19:+:117865025:117865025,:variant.type/snp,:variant.classification/missense
5,17592234968703,tcga-a1-a0sh-01a-GRCh37:chr7:+:28506628:285066...,"[tcga-brca, 5d45e3e29c3ffaf60e4314c87ecb5241]",tcga-a1-a0sh-01a,GRCh37:chr7:+:28506628:28506628/G/T,GRCh37:chr7:+:28506628:28506628/G/T,G,T,CUBN,GRCh37:chr7:+:28506628:28506628,:variant.type/snp,:variant.classification/missense
...,...,...,...,...,...,...,...,...,...,...,...,...
8028,17592235128883,tcga-ol-a66h-01a-GRCh37:chr3:+:12447485:124474...,"[tcga-brca, 62f30b4472c51f6dedf626fa89c3cf34]",tcga-ol-a66h-01a,GRCh37:chr3:+:12447485:12447485/C/G,GRCh37:chr3:+:12447485:12447485/C/G,C,G,PARP8,GRCh37:chr3:+:12447485:12447485,:variant.type/snp,:variant.classification/missense
8029,17592235128885,tcga-ol-a66h-01a-GRCh37:chr8:+:10464643:104646...,"[tcga-brca, d584100cdf5d68946a6a47e3254d550e]",tcga-ol-a66h-01a,GRCh37:chr8:+:10464643:10464643/T/A,GRCh37:chr8:+:10464643:10464643/T/A,T,A,ASB9,GRCh37:chr8:+:10464643:10464643,:variant.type/snp,:variant.classification/missense
8030,17592235128887,tcga-ol-a66h-01a-GRCh37:chr17:+:19584880:19584...,"[tcga-brca, 23aa10c05ed127c8ea6ce17a3b3a6f29]",tcga-ol-a66h-01a,GRCh37:chr17:+:19584880:19584880/A/G,GRCh37:chr17:+:19584880:19584880/A/G,A,G,ZNF660,GRCh37:chr17:+:19584880:19584880,:variant.type/snp,:variant.classification/missense
8031,17592235128889,tcga-ol-a66h-01a-GRCh37:chr4:+:88411538:884115...,"[tcga-brca, 985874c25fcacc499fb1b59e6eecf464]",tcga-ol-a66h-01a,GRCh37:chr4:+:88411538:88411538/C/G,GRCh37:chr4:+:88411538:88411538/C/G,C,G,SIPA1L2,GRCh37:chr4:+:88411538:88411538,:variant.type/snp,:variant.classification/missense


In [41]:
assays_for_sub[['sample-id', 'measurement-set-name']]

Unnamed: 0,sample-id,measurement-set-name
0,tcga-a2-a0er-10a,immune-cell-deconvolution
1,tcga-a2-a0st-10a,immune-cell-deconvolution
2,tcga-a7-a13h-01a,snp CNV data
3,tcga-an-a0al-01a,snp CNV data
4,tcga-a8-a099-01a,baseline mutations
...,...,...
596,tcga-a2-a04v-01a,gx
597,tcga-ac-a3w6-01a,baseline mutations
598,tcga-ol-a66h-01a,tumor purity
599,tcga-e9-a1r6-01a,immune-cell-deconvolution


In [44]:
purity_df = pqd.sample_measurements(dataset, 'tumor purity', assays_for_sub['sample-id'].unique().tolist(), db_name=database, timeout=120)
purity_df

Unnamed: 0,db-id,measurement-id,measurement-uid,measurement-tumor-purity,measurement-sample-sample-id
0,17592188233843,tcga-a1-a0sh-01a,"[tcga-brca, fafc777825d48b83c123f20c2ae222e4]",0.6574,tcga-a1-a0sh-01a
1,17592188233873,tcga-a2-a04v-01a,"[tcga-brca, 526790b7270e3350267f925389583cf1]",0.8341,tcga-a2-a04v-01a
2,17592188233877,tcga-a2-a04x-01a,"[tcga-brca, 24013d8c7199288318687bdecf318186]",0.7223,tcga-a2-a04x-01a
3,17592188233913,tcga-a2-a0d1-01a,"[tcga-brca, 778a6e9db87be8c19f5883381102c1bc]",0.8980,tcga-a2-a0d1-01a
4,17592188233926,tcga-a2-a0eo-01a,"[tcga-brca, 11e54d89b3f3f9a87cda81ce059aa0e6]",0.6086,tcga-a2-a0eo-01a
...,...,...,...,...,...
80,17592188235806,tcga-ew-a3u0-01a,"[tcga-brca, 0d92a32853dade598a90bd563ef3c903]",0.4688,tcga-ew-a3u0-01a
81,17592188235923,tcga-ll-a9q3-01a,"[tcga-brca, 4ddfd382a8225ddfe0abf2c535d152a3]",0.7531,tcga-ll-a9q3-01a
82,17592188235938,tcga-ol-a5da-01a,"[tcga-brca, e4e430917944ef4c38f4f8012b6622f2]",0.8184,tcga-ol-a5da-01a
83,17592188235954,tcga-ol-a66h-01a,"[tcga-brca, 4c586df7b791fa2a0f97977e43725789]",0.7466,tcga-ol-a66h-01a


In [51]:
cnv_samples = assays_for_sub[assays_for_sub['measurement-set-name'] == 'snp CNV data']['sample-id'].unique().tolist()
cnv_samples = cnv_samples[:10]
cnv_samples

['tcga-a7-a13h-01a',
 'tcga-an-a0al-01a',
 'tcga-d8-a1xm-10a',
 'tcga-a2-a04v-10a',
 'tcga-ar-a0tx-01a',
 'tcga-ll-a9q3-10a',
 'tcga-ol-a66h-10a',
 'tcga-d8-a27r-01a',
 'tcga-bh-a18t-11a',
 'tcga-a8-a06t-01a']

In [52]:
cnv_df = pqd.sample_measurements(dataset, 'snp CNV data', cnv_samples, db_name=database, timeout=120)
cnv_df

Unnamed: 0,db-id,measurement-id,measurement-uid,measurement-segment-mean-lrr,measurement-sample-sample-id,measurement-cnv-cnv-id
0,17592187609808,tcga-d8-a1xm-10a-GRCh37:chr1:+:3218610:84897981,"[tcga-brca, 323d056054bd78b823e7ac6973981543]",0.0036,tcga-d8-a1xm-10a,GRCh37:chr1:+:3218610:84897981
1,17592187609811,tcga-d8-a1xm-10a-GRCh37:chr1:+:84900968:84901637,"[tcga-brca, acbb98d687078bcf45dd768576ae9d8d]",-1.0001,tcga-d8-a1xm-10a,GRCh37:chr1:+:84900968:84901637
2,17592187609813,tcga-d8-a1xm-10a-GRCh37:chr1:+:84903286:247813706,"[tcga-brca, c1668ce6474fe9fb500f22703539a370]",0.0031,tcga-d8-a1xm-10a,GRCh37:chr1:+:84903286:247813706
3,17592187609815,tcga-d8-a1xm-10a-GRCh37:chr2:+:484222:242476062,"[tcga-brca, 24eb05f2c699e686fdd6777cc0c16863]",-0.0029,tcga-d8-a1xm-10a,GRCh37:chr2:+:484222:242476062
4,17592187609817,tcga-d8-a1xm-10a-GRCh37:chr3:+:2212571:197538677,"[tcga-brca, 48837855b213115d0ed01e14446117e8]",-0.0004,tcga-d8-a1xm-10a,GRCh37:chr3:+:2212571:197538677
...,...,...,...,...,...,...
1155,17592188068986,tcga-bh-a18t-11a-GRCh37:chr21:+:15347621:47678774,"[tcga-brca, c7c47fc3654f1a6e755edb75eb960d09]",-0.0017,tcga-bh-a18t-11a,GRCh37:chr21:+:15347621:47678774
1156,17592188068988,tcga-bh-a18t-11a-GRCh37:chr22:+:17423930:49331012,"[tcga-brca, 3c442f95d5d4fb264f97bde372e783fa]",0.0008,tcga-bh-a18t-11a,GRCh37:chr22:+:17423930:49331012
1157,17592188068990,tcga-bh-a18t-11a-GRCh37:chr23:+:3157107:26896610,"[tcga-brca, 8ed0f1b84badf371bb6bf5602da3f5a5]",0.0013,tcga-bh-a18t-11a,GRCh37:chr23:+:3157107:26896610
1158,17592188068992,tcga-bh-a18t-11a-GRCh37:chr23:+:26897763:26903542,"[tcga-brca, 8de24161f49478184f97ba904af60543]",-1.1763,tcga-bh-a18t-11a,GRCh37:chr23:+:26897763:26903542


In [63]:
gx_samples = assays_for_sub[['sample-id', 'measurement-set-name']]
gx_samples = gx_samples[gx_samples['measurement-set-name'] == 'gx']
gx_samples_sub = gx_samples['sample-id'].unique().tolist()[:10]
gx_samples_sub[:1]

['tcga-ac-a3w6-01a']