In [9]:
import pandas as pd
import os
import json
pd.options.display.max_columns=100
import re

In [10]:
df_meta = pd.read_csv("../../data/SuppTable1_sample_metadata - metadata.tsv",sep="\t") # Supp. Table 1 
df_meta=df_meta[df_meta["is_blacklisted"]==False]
df_meta_hmf=df_meta[df_meta["cohort"]=="Hartwig"]
df_meta_pcawg=df_meta[df_meta["cohort"]=="PCAWG"]
df_pre = pd.read_csv("/home/fran/Documents/cuppen/HPC/tunnel/cuppen/shared_resources/HMF_data/DR-104-update5/pre_biopsy_drugs.tsv",sep="\t") # This information is part of the clinical data of Hartwig, not avaliable unless you have approved access
df_pre.shape

(9659, 6)

In [11]:
# Match sample ID with patient treatment
def get_patient_identifier(row):
    return re.search("([A-Z]+[0-9]+)[TI]+",row["sample_id"]).group(1)
df_meta_hmf["patientIdentifier"] = df_meta_hmf.apply(lambda row: get_patient_identifier(row),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Select cancer types with, at least, 15 samples

In [12]:
# ttypes primary
tmp  = df_meta_pcawg["cancer_type_code"].value_counts()
ttypes_represented_primary = list(tmp[tmp>14].index)
# ttypes met
tmp  = df_meta_hmf["cancer_type_code"].value_counts()
ttypes_represented_met = list(tmp[tmp>14].index)

ttypes_selected = set(ttypes_represented_met).intersection(ttypes_represented_primary)

In [13]:
len(ttypes_selected)

23

### Select patients from Hartwig

In [14]:
samples_id = set(df_meta_hmf["patient_id"].values)
df_pre=df_pre.merge(df_meta_hmf[["cancer_type","cancer_type_code","patientIdentifier","sample_id"]])

In [15]:
df_pre.to_csv("../data/info_treatment_patients_selected.tsv",sep="\t",index=False)

In [16]:
len(set(df_pre["patientIdentifier"]))

1902

In [17]:
df_pre[["mechanism","cancer_type_code"]].drop_duplicates().shape

(323, 2)

### mechanism 

In [18]:
d_mechanisms = {}
eqv_names = {}

### Create them

In [19]:
d_selected_mechanisms = {"Platinum":[],"Pyrimidine (ant)agonist":[],"Taxane":[],"Anthracycline":[],"Alkylating":[],"Selective ER modulator":["Anti-ER"],
                        "Aromatase inhibitor":[],"Anti-VEGF":[],"Anti-AR/GnRH":["Androgen inhibitor","GnRH (ant)agonist"],"Topoisomerase inhibitor":[],"Alkaloid":[],"Folate antagonist":["Antifolate"],"Anti-EGFR":[],
                        "Anti-HER2":[],"mTOR inhibitor":["mTOR"],"CDK4/6 inhibitor":[],"Multikinase inhibitor":["tyrosine kinase inhibitor"],
                        "Antitumor antibiotic":[],"Immunotherapy":["Anti-CTLA-4","Anti-PD-1","Immunetherapy","Anti-PD-L1"],"Vinca Alkaloid":[],"Microtubule inhibitor":[],
                        "MEK inhibitor":[],"BRAF inhibitor":[],"PARP inhibitor":[],"Anti-PDGFR":[],"ALK/ROS1 inhibitor":["ALK inhibitor"]}
s=set()
for ttype in ttypes_selected:
    patients_ttype=set(df_meta_hmf[(df_meta_hmf["cancer_type_code"]==ttype)]["patientIdentifier"].values)
    d_mechanisms[ttype] = {}
    for m in d_selected_mechanisms:
        patients = set(df_pre[(df_pre["mechanism"].str.contains(m,na=False,case=False,regex=False))&(~df_pre["mechanism"].str.contains("placebo",na=False,case=False))]["patientIdentifier"].unique())
        # syn patients
        for syn in d_selected_mechanisms[m]:
            t=set(df_pre[(df_pre["mechanism"].str.contains(syn,na=False,case=False,regex=False))&(~df_pre["mechanism"].str.contains("placebo",na=False,case=False))]["patientIdentifier"].unique())
            patients=patients.union(t)
        new_name=m.replace("(","").replace(")","").replace(" ","_").replace("/","__").replace("-","_")
        # total patients with ttype
        selected_pat = patients.intersection(patients_ttype)
        if len(selected_pat) >= 10:
            samples = list(df_meta_hmf[(df_meta_hmf["patientIdentifier"].isin(selected_pat))]["sample_id"].values)
            d_mechanisms[ttype][new_name] = samples
            s.add(new_name)
            print (new_name,len(samples),len(selected_pat))
        eqv_names[m] = new_name
        

Platinum 64 64
Pyrimidine_antagonist 288 288
Taxane 367 367
Anthracycline 387 387
Alkylating 385 385
Selective_ER_modulator 398 398
Aromatase_inhibitor 354 354
Anti_VEGF 36 36
Anti_AR__GnRH 78 78
Alkaloid 48 48
Folate_antagonist 34 34
Anti_HER2 78 78
mTOR_inhibitor 67 67
CDK4__6_inhibitor 68 68
Microtubule_inhibitor 22 22
Immunotherapy 43 43
MEK_inhibitor 11 11
BRAF_inhibitor 13 13
Platinum 274 274
Pyrimidine_antagonist 314 314
Anti_VEGF 195 195
Topoisomerase_inhibitor 136 136
Anti_EGFR 44 44
Platinum 18 18
Pyrimidine_antagonist 17 17
Platinum 13 13
Pyrimidine_antagonist 13 13
Multikinase_inhibitor 22 22
Platinum 75 75
Pyrimidine_antagonist 68 68
Platinum 64 64
Pyrimidine_antagonist 29 29
Taxane 45 45
Platinum 18 18
Platinum 21 21
Pyrimidine_antagonist 16 16
Platinum 10 10
Pyrimidine_antagonist 22 22
Topoisomerase_inhibitor 11 11
Anthracycline 13 13
Platinum 71 71
Taxane 12 12
Folate_antagonist 65 65
Anti_EGFR 41 41
Immunotherapy 11 11
Platinum 21 21
Taxane 13 13
Taxane 229 229
Anti_AR

In [20]:
c=0
for i,x in d_mechanisms.items():
    if len(x) == 0:
        print (i)
c

PANET
DLBCL
LPS
LIHC
THCA


0

### Untreated patients

In [21]:
d_untreated={}
for ttype in ttypes_selected:
    patients_ttype=set(df_meta_pcawg[(df_meta_pcawg["cancer_type_code"]==ttype)]["sample_id"].values)
    d_untreated[ttype] = {}
    d_untreated[ttype]["untreated"] = list(patients_ttype)
        

### Create subgroups by subtypes in BRCA


##### BRCA

In [22]:
hmf=df_meta[(df_meta["cancer_type_code"]=="BRCA")&(df_meta["cohort"]=="Hartwig")]["cancer_subtype"].value_counts()>=10
pcawg=df_meta[(df_meta["cancer_type_code"]=="BRCA")&(df_meta["cohort"]=="PCAWG")]["cancer_subtype"].value_counts()>=10
subtypes_selected=hmf[hmf&pcawg].index

In [23]:
subtypes_selected

Index(['ER+/HER2-', 'TNBC', 'ER+/HER2+'], dtype='object')

In [24]:
subtypes_short_name={"ER+/HER2-":"ERpos","TNBC":"TNB","ER+/HER2+":"HER2pos"}

In [25]:
ttype="BRCA"
for subtype in subtypes_selected:
    samples_subtype = df_meta[df_meta["cancer_subtype"]==subtype]["sample_id"]
    patients_ttype=set(df_meta_hmf[(df_meta_hmf["cancer_type_code"]==ttype)&(df_meta_hmf["sample_id"].isin(samples_subtype))]["patientIdentifier"].values)
    d_mechanisms[ttype+"_"+subtypes_short_name[subtype]] = {}
    for m in d_selected_mechanisms:
        patients = set(df_pre[(df_pre["mechanism"].str.contains(m,na=False,case=False,regex=False))&(~df_pre["mechanism"].str.contains("placebo",na=False,case=False))]["patientIdentifier"].unique())
        # syn patients
        for syn in d_selected_mechanisms[m]:
            t=set(df_pre[(df_pre["mechanism"].str.contains(syn,na=False,case=False,regex=False))&(~df_pre["mechanism"].str.contains("placebo",na=False,case=False))]["patientIdentifier"].unique())
            patients=patients.union(t)
        new_name=m.replace("(","").replace(")","").replace(" ","_").replace("/","__").replace("-","_")
        # total patients with ttype
        selected_pat = patients.intersection(patients_ttype)
        if len(selected_pat) >= 10:
            samples = list(df_meta_hmf[(df_meta_hmf["patientIdentifier"].isin(selected_pat))]["sample_id"].values)
            d_mechanisms[ttype+"_"+subtypes_short_name[subtype]][new_name] = samples
            s.add(new_name)
            print (new_name,len(samples),len(selected_pat))
        eqv_names[m] = new_name

Platinum 27 27
Pyrimidine_antagonist 212 212
Taxane 230 230
Anthracycline 265 265
Alkylating 264 264
Selective_ER_modulator 328 328
Aromatase_inhibitor 288 288
Anti_VEGF 25 25
Anti_AR__GnRH 63 63
Alkaloid 29 29
Folate_antagonist 26 26
Anti_HER2 10 10
mTOR_inhibitor 57 57
CDK4__6_inhibitor 62 62
Microtubule_inhibitor 15 15
Platinum 27 27
Pyrimidine_antagonist 35 35
Taxane 64 64
Anthracycline 65 65
Alkylating 63 63
Pyrimidine_antagonist 20 20
Taxane 47 47
Anthracycline 34 34
Alkylating 36 36
Selective_ER_modulator 44 44
Aromatase_inhibitor 40 40
Anti_AR__GnRH 10 10
Anti_HER2 44 44


##### Untreated of BRCA subtypes

In [26]:
ttype="BRCA"
for subtype in subtypes_selected:
    samples_subtype = samples_subtype = df_meta[(df_meta["cancer_subtype"]==subtype)&(df_meta["cohort"]=="PCAWG")]["sample_id"]
    patients_ttype=set(df_meta_pcawg[(df_meta_pcawg["cancer_type_code"]==ttype)&(df_meta_pcawg["sample_id"].isin(samples_subtype))]["sample_id"].values)
    d_untreated[ttype+"_"+subtypes_short_name[subtype]] = {}
    d_untreated[ttype+"_"+subtypes_short_name[subtype]]["untreated"] = list(patients_ttype)


### Colorectal MSI/POLE groups

In [27]:
hmf=df_meta[(df_meta["cancer_type_code"]=="COREAD")&(df_meta["cohort"]=="Hartwig")]["cancer_subtype"].value_counts()>=10
pcawg=df_meta[(df_meta["cancer_type_code"]=="COREAD")&(df_meta["cohort"]=="PCAWG")]["cancer_subtype"].value_counts()>=10
subtypes_selected=hmf[hmf&pcawg].index

In [28]:
subtypes_selected

Index(['MSS'], dtype='object')

In [29]:
subtypes_short_name={"MSS":"MSS","MSI/POLE":"MSI_POLE"}

In [30]:
ttype="COREAD"
for subtype in subtypes_selected:
    samples_subtype = df_meta[(df_meta["cancer_subtype"]==subtype)&(df_meta["cancer_type_code"]=="COREAD")]["sample_id"]
    patients_ttype=set(df_meta_hmf[(df_meta_hmf["cancer_type_code"]==ttype)&(df_meta_hmf["sample_id"].isin(samples_subtype))]["patientIdentifier"].values)
    
    for m in d_selected_mechanisms:
        patients = set(df_pre[(df_pre["mechanism"].str.contains(m,na=False,case=False,regex=False))&(~df_pre["mechanism"].str.contains("placebo",na=False,case=False))]["patientIdentifier"].unique())
        # syn patients
        for syn in d_selected_mechanisms[m]:
            t=set(df_pre[(df_pre["mechanism"].str.contains(syn,na=False,case=False,regex=False))&(~df_pre["mechanism"].str.contains("placebo",na=False,case=False))]["patientIdentifier"].unique())
            patients=patients.union(t)
        new_name=m.replace("(","").replace(")","").replace(" ","_").replace("/","__").replace("-","_")
        # total patients with ttype
        selected_pat = patients.intersection(patients_ttype)
        if len(selected_pat) >= 10:
            if not(ttype+"_"+subtypes_short_name[subtype])  in d_mechanisms:
                d_mechanisms[ttype+"_"+subtypes_short_name[subtype]] = {}
            samples = list(df_meta_hmf[(df_meta_hmf["patientIdentifier"].isin(selected_pat))]["sample_id"].values)
            d_mechanisms[ttype+"_"+subtypes_short_name[subtype]][new_name] = samples
            s.add(new_name)
            print (new_name,len(samples),len(selected_pat))
        eqv_names[m] = new_name

Platinum 269 269
Pyrimidine_antagonist 305 305
Anti_VEGF 192 192
Topoisomerase_inhibitor 132 132
Anti_EGFR 43 43


##### Untreated of COREAD subtypes

In [31]:
ttype="COREAD"
for subtype in subtypes_selected:
    samples_subtype = df_meta[(df_meta["cancer_subtype"]==subtype)&(df_meta["cancer_type_code"]=="COREAD")&(df_meta["cohort"]=="PCAWG")]["sample_id"]
    patients_ttype=set(df_meta_pcawg[(df_meta_pcawg["cancer_type_code"]==ttype)&(df_meta_pcawg["sample_id"].isin(samples_subtype))]["sample_id"].values)
    d_untreated[ttype+"_"+subtypes_short_name[subtype]] = {}
    d_untreated[ttype+"_"+subtypes_short_name[subtype]]["untreated"] = list(patients_ttype)


##### This data will be then used for the positive selection analysis

In [32]:
with open("../data/samples_mechanisms_updated.json",'w') as f: # This json will be used by the positive selection analysis in groups of patients treated with same treatement
    json.dump(d_mechanisms,f)

In [33]:
with open("../data/samples_untreated_updated.json",'w') as f: # This json will be used by the positive selection analysis in groups of untreated patients
    json.dump(d_untreated,f)

In [None]:
!scp ../data/samples_mechanisms_updated.json   gw2hpcs03:/home/cog/fmartinez/scripts/resistance/data/control_hmf/ # Transfer to HPC perform the discovery
!scp ../data/samples_untreated_updated.json   gw2hpcs03:/home/cog/fmartinez/scripts/resistance/data/control_hmf/ # Transfer to HPC perform the discovery
