In [31]:
import pandas as pd
import numpy as np
import os
from glob import glob
pd.set_option('display.max_rows', 500)

In [32]:
# bioportal
df_bp = pd.concat([pd.read_csv(filename, sep='\t') for filename in 
                ['bioportal_CRC.txt', 'bioportal_STAD.txt', 'bioportal_ESCA.txt', 'bioportal_UCEC.txt']])
df_bp['cohort'] = df_bp.Subtype.str[:4]
df_bp['Subtype'] = df_bp['Subtype'].replace({'UCEC_CN_HIGH': 'UCEC_CIN', 'UCEC_CN_LOW':'UCEC_GS'})
df_bp['subtype'] = df_bp.Subtype.str[5:]
df_bp['patient_id'] = df_bp['Patient ID']
df_bp.head()

Unnamed: 0,Study ID,Patient ID,Subtype,cohort,subtype,patient_id
0,coadread_tcga_pan_can_atlas_2018,TCGA-AF-2687,READ_CIN,READ,CIN,TCGA-AF-2687
1,coadread_tcga_pan_can_atlas_2018,TCGA-AF-2690,READ_CIN,READ,CIN,TCGA-AF-2690
2,coadread_tcga_pan_can_atlas_2018,TCGA-AF-2693,READ_CIN,READ,CIN,TCGA-AF-2693
3,coadread_tcga_pan_can_atlas_2018,TCGA-AF-3911,READ_CIN,READ,CIN,TCGA-AF-3911
4,coadread_tcga_pan_can_atlas_2018,TCGA-AF-4110,READ_CIN,READ,CIN,TCGA-AF-4110


In [33]:
df_bp.patient_id.is_unique

True

In [34]:
df_bp.cohort.unique(), df_bp.subtype.unique()

(array(['READ', 'COAD', 'STAD', 'ESCA', 'UCEC'], dtype=object),
 array(['CIN', 'GS', 'POLE', 'MSI', 'EBV', 'ESCC'], dtype=object))

In [35]:
pd.pivot_table(df_bp, values='patient_id', index=['cohort'],
                       columns=['subtype'], aggfunc='count', margins=True)

subtype,CIN,EBV,ESCC,GS,MSI,POLE,All
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
COAD,226.0,,,49.0,60.0,6.0,341
ESCA,74.0,,90.0,1.0,2.0,2.0,169
READ,102.0,,,9.0,3.0,4.0,118
STAD,223.0,30.0,,50.0,73.0,7.0,383
UCEC,163.0,,,147.0,148.0,49.0,507
All,788.0,30.0,90.0,256.0,286.0,68.0,1518


In [36]:
df_bp = df_bp[df_bp.subtype.isin(['CIN', 'GS', 'MSI', 'POLE'])]
df_bp.shape

(1398, 6)

In [37]:
df_manifest = pd.read_csv('manifest_all_dx_COAD_READ_STAD_UCEC_ESCA.txt', sep='\t')
df_manifest['patient_id'] = df_manifest.filename.str[:12]
df_manifest['slide_id'] = df_manifest['filename'].apply(lambda f: f.split('.')[0])
display(df_manifest.head(3))
display(df_manifest.shape)

Unnamed: 0,id,filename,md5,size,state,patient_id,slide_id
0,03f6946c-d9c3-4bd5-b767-710f5e1efb30,TCGA-B5-A1MX-01Z-00-DX1.FAC60C1A-5927-403E-BCC...,60873a549fefc93c36827bc9a670d1c2,1533140919,released,TCGA-B5-A1MX,TCGA-B5-A1MX-01Z-00-DX1
1,da60edeb-60c8-4fbb-a69e-4ae04528fdfc,TCGA-D1-A179-01Z-00-DX1.088EEE68-F1C0-4877-BD6...,f9ed9e977bd6be59894c9c9f8382fee6,2245479755,released,TCGA-D1-A179,TCGA-D1-A179-01Z-00-DX1
2,4981d0e8-0de2-494e-9f49-71ef7203591e,TCGA-D1-A0ZQ-01Z-00-DX1.B2F36A73-674A-4083-997...,2c15d65c8fab9c1261a1b85f22df4b7e,2195885857,released,TCGA-D1-A0ZQ,TCGA-D1-A0ZQ-01Z-00-DX1


(1790, 7)

In [39]:
df_merged_manifest = df_bp.merge(df_manifest, on='patient_id', how='inner')
display(df_merged_manifest.head(3))
display(df_merged_manifest.shape)

Unnamed: 0,Study ID,Patient ID,Subtype,cohort,subtype,patient_id,id,filename,md5,size,state,slide_id
0,coadread_tcga_pan_can_atlas_2018,TCGA-AF-2687,READ_CIN,READ,CIN,TCGA-AF-2687,2c0da95f-22b6-42c4-ad5d-1e509735b7f7,TCGA-AF-2687-01Z-00-DX1.bbcd88f6-11d5-4b57-969...,a14971966fac6442d49bf7f8850dfcd1,237001551,released,TCGA-AF-2687-01Z-00-DX1
1,coadread_tcga_pan_can_atlas_2018,TCGA-AF-2690,READ_CIN,READ,CIN,TCGA-AF-2690,242947d0-a9ff-49d7-888e-8195c86186fe,TCGA-AF-2690-01Z-00-DX1.22cb3fa6-519d-449f-b48...,889390a6c1d40e3d3542ab51d84769bb,722835723,released,TCGA-AF-2690-01Z-00-DX1
2,coadread_tcga_pan_can_atlas_2018,TCGA-AF-2693,READ_CIN,READ,CIN,TCGA-AF-2693,2f21d4ce-cc8c-4ccb-b91e-3780cce55249,TCGA-AF-2693-01Z-00-DX1.620a9998-65df-4024-b71...,e50fb9b8e5f481bb7231fad4c3dc2a22,147031181,released,TCGA-AF-2693-01Z-00-DX1


(1389, 12)

In [40]:
df_merged_manifest[df_merged_manifest.subtype.isin(['CIN', 'GS'])].shape

(1038, 12)

In [15]:
folder = '/mnt/data/users/sharonpe/slides'
slide_paths = glob(f"{folder}/**/*.svs", recursive=True)
len(slide_paths)

1386

In [41]:
df_slide_ids = pd.DataFrame({'slide_path': slide_paths})
df_slide_ids['slide_id'] = df_slide_ids['slide_path'].apply(lambda f: f.split('/')[-1].split('.')[0])
df_slide_ids.head(2)

Unnamed: 0,slide_path,slide_id
0,/mnt/data/users/sharonpe/slides/5cc73ca6-b47f-...,TCGA-VR-AA4D-01Z-00-DX1
1,/mnt/data/users/sharonpe/slides/3107b1c0-d416-...,TCGA-PG-A914-01Z-00-DX2


In [43]:
df_slide_ids['slide_id'].isin(df_merged_manifest.slide_id).all()

True

In [45]:
(~df_merged_manifest['slide_id'].isin(df_slide_ids.slide_id)).sum() # failed to download

3

In [46]:
df_merged_manifest = df_merged_manifest.merge(df_slide_ids, on='slide_id', how='inner')
display(df_merged_manifest.head(1))
display(df_merged_manifest.shape)

Unnamed: 0,Study ID,Patient ID,Subtype,cohort,subtype,patient_id,id,filename,md5,size,state,slide_id,slide_path
0,coadread_tcga_pan_can_atlas_2018,TCGA-AF-2687,READ_CIN,READ,CIN,TCGA-AF-2687,2c0da95f-22b6-42c4-ad5d-1e509735b7f7,TCGA-AF-2687-01Z-00-DX1.bbcd88f6-11d5-4b57-969...,a14971966fac6442d49bf7f8850dfcd1,237001551,released,TCGA-AF-2687-01Z-00-DX1,/mnt/data/users/sharonpe/slides/2c0da95f-22b6-...
1,coadread_tcga_pan_can_atlas_2018,TCGA-AF-2690,READ_CIN,READ,CIN,TCGA-AF-2690,242947d0-a9ff-49d7-888e-8195c86186fe,TCGA-AF-2690-01Z-00-DX1.22cb3fa6-519d-449f-b48...,889390a6c1d40e3d3542ab51d84769bb,722835723,released,TCGA-AF-2690-01Z-00-DX1,/mnt/data/users/sharonpe/slides/242947d0-a9ff-...
2,coadread_tcga_pan_can_atlas_2018,TCGA-AF-2693,READ_CIN,READ,CIN,TCGA-AF-2693,2f21d4ce-cc8c-4ccb-b91e-3780cce55249,TCGA-AF-2693-01Z-00-DX1.620a9998-65df-4024-b71...,e50fb9b8e5f481bb7231fad4c3dc2a22,147031181,released,TCGA-AF-2693-01Z-00-DX1,/mnt/data/users/sharonpe/slides/2f21d4ce-cc8c-...


(1386, 13)

In [47]:
df_merged_manifest.to_csv("manifest_labeled_dx_molecular_subtype.tsv", sep='\t', index=False)