Look into Chowell 2018 data

    -> find all HLA-I alleles from Chowell 2018 data that either belong to or have match in the 85 HLA-I alleles with pseudo sequences, and output a file of all these 146 HLA-I alleles to submit to CLAIRE server for prediction
    
    -> combine two sheets from Chowell 2018 data into one data frame, process, add matched HLA-I alleles and save out.
    
Note that the mutcnt from sheet 1 should be divided by 30 before matching combining
with the results from sheet 2

In [3]:
import pandas as pd
import numpy as np

from collections import defaultdict
from collections import Counter

import re

In [2]:
import matplotlib.pyplot as plt

In [4]:
data_dir = "../data/"
chowell_folder = "Chowell_2018/"

In [5]:
spreadsheet = "aao4572_chowell_sm-table-s9.xlsx"

df_2018_1 = pd.read_excel(data_dir + chowell_folder + spreadsheet, 'Cohort 1')
df_2018_1.shape

  warn(msg)


(371, 15)

In [6]:
df_2018_1[:3]

Unnamed: 0,Sample,OS_Months,OS_Event,Drug Class,Stage M,Stage,Age,Gender,MutCnt,Cancer Type,Reference,HLA Class I Alleles,HLA Class I Supertypes,Homozygous (1=Yes; 0=No),LOH (1=Yes; 0=No)
0,CR0095,67.936885,0.0,CTLA-4,M1b,Stage 4,74.0,M,282.0,Melanoma,Snyder et al. 2014,"A0201,A3101,B3502,B3906,C0702,C0401","A02,A03,B07,B27,UNK,UNK",0.0,0.0
1,CR04885,25.617525,0.0,CTLA-4,M0,Stage 3,49.0,F,2102.0,Melanoma,Snyder et al. 2014,"A0201,A3201,B0702,B1801,C0701,C0702","A02,A01,B07,B44,UNK,UNK",0.0,0.0
2,CR06670,44.430394,1.0,CTLA-4,M0,Stage 3,79.0,F,590.0,Melanoma,Snyder et al. 2014,"A2402,A2601,B1801,B4403,C1601,C1203","A24,A01,B44,B44,UNK,UNK",0.0,0.0


In [7]:
df_2018_1.isna().sum()

Sample                       0
OS_Months                    2
OS_Event                     1
Drug Class                   0
Stage M                     13
Stage                       13
Age                          0
Gender                       0
MutCnt                      67
Cancer Type                  0
Reference                    0
HLA Class I Alleles          1
HLA Class I Supertypes       0
Homozygous (1=Yes; 0=No)     1
LOH (1=Yes; 0=No)           69
dtype: int64

In [8]:
Counter(df_2018_1.Gender)

Counter({'M': 222, 'F': 149})

In [9]:
keep_1 = [i for i, (a, b, c) in enumerate(zip(df_2018_1["OS_Months"].tolist(), 
                                              df_2018_1["OS_Event"].tolist(), 
                                              df_2018_1["HLA Class I Alleles"].tolist()))
          if ((not pd.isna(a)) and (not pd.isna(b)) and (not pd.isna(c)))]

len(keep_1)

369

In [10]:
df_2018_1_kept = df_2018_1.iloc[keep_1]
df_2018_1_kept.shape

(369, 15)

In [11]:
df_2018_1_kept.isna().sum()

Sample                       0
OS_Months                    0
OS_Event                     0
Drug Class                   0
Stage M                     13
Stage                       13
Age                          0
Gender                       0
MutCnt                      66
Cancer Type                  0
Reference                    0
HLA Class I Alleles          0
HLA Class I Supertypes       0
Homozygous (1=Yes; 0=No)     0
LOH (1=Yes; 0=No)           67
dtype: int64

In [12]:
Counter(df_2018_1_kept["Cancer Type"])

Counter({'Melanoma': 269, 'Non-Small Cell Lung Cancer': 100})

In [13]:
df_2018_2 = pd.read_excel(data_dir + chowell_folder + spreadsheet, 'Cohort 2')
df_2018_2.shape

  warn(msg)


(1166, 11)

In [14]:
# there are four subjects with unknown cancer type in cohort 2
df_2018_2.isna().sum()

Sample                      0
Age Group                   0
OS_Months                   0
OS_Event                    0
IMPACT-MutCnt               0
Drug Class                  0
Cancer Type                 4
Reference                   0
HLA Class I Alleles         0
HLA Class I Supertypes      0
Homozygous (1=Yes; 0=No)    0
dtype: int64

In [15]:
Counter(df_2018_2['Age Group'])

Counter({'31-50': 492, '<30': 28, '>71': 137, '61-70': 200, '50-60': 309})

In [16]:
Counter(df_2018_2["Cancer Type"])

Counter({'Non-Small Cell Lung Cancer': 271,
         'Melanoma': 246,
         'Bladder Cancer': 87,
         'Glioma': 82,
         'Renal Cell Carcinoma': 129,
         'Head and Neck Cancer': 58,
         'Esophagogastric Cancer': 29,
         'Soft Tissue Sarcoma': 14,
         'Gastrointestinal Neuroendocrine Tumor': 2,
         'Mesothelioma': 9,
         'Skin Cancer; Non-Melanoma': 15,
         'Cancer of Unknown Primary': 27,
         'Colorectal Cancer': 44,
         'Hepatobiliary Cancer': 16,
         'Small Cell Lung Cancer': 15,
         'Breast Cancer': 29,
         'Adrenocortical Carcinoma': 12,
         'Non-Hodgkin Lymphoma': 16,
         'Pancreatic Cancer': 22,
         'Gastrointestinal Stromal Tumor': 2,
         'Prostate Cancer': 10,
         'Thyroid Cancer': 6,
         'Bone Cancer': 7,
         'Hodgkin Lymphoma': 3,
         'Anal Cancer': 7,
         'Sex Cord Stromal Tumor': 1,
         'Salivary Gland Cancer': 1,
         nan: 4,
         'Ampullary Car

In [17]:
# there is no overlap between the cohort 1 subjects kept and the cohort 2 subjects
len(set(df_2018_1_kept["Sample"]).intersection(set(df_2018_2["Sample"])))

0

In [18]:
len(set(df_2018_1_kept["Sample"]).union(set(df_2018_2["Sample"])))

1535

In [19]:
df_2018_1_kept.shape[0] + df_2018_2.shape[0]

1535

In [20]:
# prepare columns for the data to keep

sample_list = df_2018_1_kept["Sample"].tolist() + df_2018_2["Sample"].tolist()
age_list = df_2018_1_kept["Age "].tolist() + [np.nan for _ in range(df_2018_2.shape[0])]

def group_age(x):
    if (x<=30):
        return "<30"
    elif (x <= 50):
        return "31-50"
    elif (x <= 60):
        return "50-60"
    elif (x <= 70):
        return "61-70"
    else:
        return ">71"

age_group_1 = [group_age(x) for x in df_2018_1_kept["Age "].tolist()]
age_group_list = age_group_1 + df_2018_2["Age Group"].tolist()
OS_Months_list = df_2018_1_kept["OS_Months"].tolist() + df_2018_2["OS_Months"].tolist()
OS_Event_list = df_2018_1_kept["OS_Event"].tolist() + df_2018_2["OS_Event"].tolist()

scaled_mutcnt_1 = [x/30 for x in df_2018_1_kept["MutCnt"].tolist()]
MutCnt_list = scaled_mutcnt_1 + df_2018_2["IMPACT-MutCnt"].tolist()
gender_list = df_2018_1_kept["Gender"].tolist() + [np.nan for _ in range(df_2018_2.shape[0])]

Drug_list = df_2018_1_kept["Drug Class"].tolist() + df_2018_2["Drug Class"].tolist()
Drug_list = ["CTLA-4" if x=="CTLA4" else x for x in Drug_list ]

Type_list = df_2018_1_kept["Cancer Type"].tolist() + df_2018_2["Cancer Type"].tolist()
Stage_M_list = df_2018_1_kept["Stage M"].tolist() + [np.nan for _ in range(df_2018_2.shape[0])]
Stage_list = df_2018_1_kept["Stage"].tolist() + [np.nan for _ in range(df_2018_2.shape[0])]
Reference_list = df_2018_1_kept["Reference"].tolist() + df_2018_2["Reference"].tolist()
alleles_list = df_2018_1_kept["HLA Class I Alleles"].tolist() + df_2018_2["HLA Class I Alleles"].tolist()
Homozygous_list = df_2018_1_kept["Homozygous (1=Yes; 0=No)"].tolist() + df_2018_2["Homozygous (1=Yes; 0=No)"].tolist()

In [21]:
df_2018_combined = pd.DataFrame(zip(sample_list, 
                                    age_list, 
                                    age_group_list, 
                                    OS_Months_list, 
                                    OS_Event_list, 
                                    MutCnt_list, 
                                    gender_list, 
                                    Drug_list, 
                                    Type_list, 
                                    Stage_M_list, 
                                    Stage_list, 
                                    Reference_list, 
                                    alleles_list, 
                                    Homozygous_list), 
                             columns = ["ID", "age", "age_group", "os_months", "os_event", 
                                        "mutcnt", "gender", "drug_class", "cancer_type", "stage_m", "stage", 
                                        "reference", "hla_class_i_alleles", "homozygous"])

In [22]:
df_2018_combined.shape

(1535, 14)

In [23]:
df_2018_combined[:2]

Unnamed: 0,ID,age,age_group,os_months,os_event,mutcnt,gender,drug_class,cancer_type,stage_m,stage,reference,hla_class_i_alleles,homozygous
0,CR0095,74.0,>71,67.936885,0.0,9.4,M,CTLA-4,Melanoma,M1b,Stage 4,Snyder et al. 2014,"A0201,A3101,B3502,B3906,C0702,C0401",0.0
1,CR04885,49.0,31-50,25.617525,0.0,70.066667,F,CTLA-4,Melanoma,M0,Stage 3,Snyder et al. 2014,"A0201,A3201,B0702,B1801,C0701,C0702",0.0


In [26]:
# only keep the subjects with hla i alleles among the 85 ones with pseudo sequences
# or the hlas that we can find a replacement for from the hla alleles that we have pseudo sequence for

df_pseudo = pd.read_csv(data_dir + "for_encoders/HLA_I_pseudo_40.csv", header = 0)
hla_i_considered = set(["".join(re.split('-|\*|:', x)[1:]) for x in df_pseudo.hla.tolist()])
#hla_i_considered

In [27]:
chowell_2018_hla_i_group_list = [x.split(",") for x in df_2018_combined["hla_class_i_alleles"].tolist()]
chowell_2018_hla_i_list = [y for x in chowell_2018_hla_i_group_list for y in x]

print(len(chowell_2018_hla_i_list))
print(len(set(chowell_2018_hla_i_list)))

9210
158


In [28]:
# there are 73 alleles from chowell that are not included in the 85 ones
len(set(chowell_2018_hla_i_list) - hla_i_considered)

73

In [29]:
# all 85 ones with pseudo sequences are in chowell 2018
len(hla_i_considered - set(chowell_2018_hla_i_list))

0

In [30]:
temp_hla_candidate_list = list(set(chowell_2018_hla_i_list) - hla_i_considered)
temp_hla_candidate_list.sort()
#temp_hla_candidate_list

In [31]:
hla_i_considered_list = list(hla_i_considered)

In [32]:
hla_replace_dict = defaultdict(str)

hla_replace_dict['A0102'] = 'A0101'
hla_replace_dict['A0103'] = 'A0101'

hla_replace_dict['A0202'] = 'A0201'
hla_replace_dict['A0203'] = 'A0201'
hla_replace_dict['A0207'] = 'A0201'
hla_replace_dict['A0217'] = 'A0201'
    
hla_replace_dict['A0305'] = 'A0301'

hla_replace_dict['A1102'] = 'A1101'
hla_replace_dict['A1104'] = 'A1101'

hla_replace_dict['A2407'] = 'A2402'
hla_replace_dict['A2421'] = 'A2402'
hla_replace_dict['A2426'] = 'A2402'

hla_replace_dict['A2602'] = 'A2601'
hla_replace_dict['A2608'] = 'A2601'

hla_replace_dict['A3004'] = 'A3001'
hla_replace_dict['A3010'] = 'A3001'
    
hla_replace_dict['A3402'] = 'A3401'

# 'A3601', 'A6901', 'A7401' do not find matching hla 
    
hla_replace_dict['A6602'] = 'A6601'

hla_replace_dict['A6810'] = 'A6801'



hla_replace_dict['B0704'] = 'B0702'

hla_replace_dict['B1301'] = 'B1302'
    
hla_replace_dict['B1502'] = 'B1501'
hla_replace_dict['B1505'] = 'B1501'
hla_replace_dict['B1508'] = 'B1501'
hla_replace_dict['B1509'] = 'B1501'
hla_replace_dict['B1510'] = 'B1501'
hla_replace_dict['B1512'] = 'B1501'
hla_replace_dict['B1516'] = 'B1501'
hla_replace_dict['B1525'] = 'B1501'
hla_replace_dict['B1527'] = 'B1501'

hla_replace_dict['B1803'] = 'B1801'
    
hla_replace_dict['B2702'] = 'B2705'
hla_replace_dict['B2703'] = 'B2705'
hla_replace_dict['B2704'] = 'B2705' 

hla_replace_dict['B3505'] = 'B3501'
    
hla_replace_dict['B3902'] = 'B3901'
hla_replace_dict['B3910'] = 'B3901'
hla_replace_dict['B3924'] = 'B3901'

hla_replace_dict['B4003'] = 'B4001'
hla_replace_dict['B4004'] = 'B4001'

# 'B4201', 'B4601', 'B4701',  'B5401',  'B6701', 'B7301', 'B7801', 'B8101', not found match
    
hla_replace_dict['B4405'] = 'B4402'

hla_replace_dict['B5102'] = 'B5101'
hla_replace_dict['B5107'] = 'B5101'
hla_replace_dict['B5108'] = 'B5101'
    
hla_replace_dict['B5502'] = 'B5501'
    
hla_replace_dict['B5604'] = 'B5601'

hla_replace_dict['B5702'] = 'B5701'
hla_replace_dict['B5703'] = 'B5701'

hla_replace_dict['B5802'] = 'B5801'
    

# 

hla_replace_dict['C0103'] = 'C0102'

hla_replace_dict['C0210'] = 'C0202'

hla_replace_dict['C0305'] = 'C0302'

hla_replace_dict['C0403'] = 'C0401'
hla_replace_dict['C0404'] = 'C0401'

hla_replace_dict['C0706'] = 'C0701'
hla_replace_dict['C0718'] = 'C0701'

hla_replace_dict['C0804'] = 'C0801'

hla_replace_dict['C1202'] = 'C1203'

hla_replace_dict['C1403'] = 'C1402'

hla_replace_dict['C1504'] = 'C1502'

hla_replace_dict['C1604'] = 'C1601'
 
# 'C1801' not found

In [33]:
len(hla_replace_dict) + 12

73

In [34]:
# use a function to find matching hla allele with pseudo sequence, 
# and build the dictionary again, to verify whether the previously built 
# dictionary is correct

# map each two digit HLA allele to its first match among the alleles

two_digit_dict = defaultdict(str)

all_two_digits = list(set([x[:3] for x in hla_i_considered_list]))

for a01 in all_two_digits:
    
    cur_list = [x for x in hla_i_considered_list if x[:3]==a01]
    
    cur_list.sort()
    
    two_digit_dict[a01] = cur_list[0]

In [35]:
print(len(all_two_digits))
print(len(two_digit_dict))

54
54


In [35]:
hla_replace_dict_verify = defaultdict(str)

for temp_hla in temp_hla_candidate_list:
    
    if temp_hla[:3] in set(all_two_digits):
        
        hla_replace_dict_verify[temp_hla] = two_digit_dict[temp_hla[:3]]

len(hla_replace_dict_verify)

61

In [36]:
temp_keys = list(hla_replace_dict.keys())

In [37]:
# now we have verified that the translation dictionary is correct
print(set(hla_replace_dict.keys()) == set(hla_replace_dict_verify.keys()))
sum([hla_replace_dict[x]==hla_replace_dict_verify[x] for x in temp_keys])

True


61

In [38]:
# output the list of hla alleles that either have pseudo sequences or can find a match
# to prepare for getting scores from Glazer 2022 model

hla_alleles_for_Glazer_2022 = hla_i_considered_list + temp_keys
hla_alleles_for_Glazer_2022.sort()
print(len(hla_alleles_for_Glazer_2022))
print(len(set(hla_alleles_for_Glazer_2022)))

146
146


In [39]:
df_chowell_2018_alleles = pd.DataFrame(hla_alleles_for_Glazer_2022, columns = ["hla"])
df_chowell_2018_alleles.shape

(146, 1)

In [41]:
df_chowell_2018_alleles.to_csv("../results/st6_chowell_2018_alleles_for_Glazer_2022.csv", index = False)

In [42]:
# count how many subjects from chowell 2018 combined have hla alleles all under our consideration
# or belong to one of those that we can find match for

hla_alleles_for_Glazer_2022_set = set(hla_alleles_for_Glazer_2022)

subjects_to_consider = []

for name, hlas in zip(df_2018_combined["ID"].tolist(), df_2018_combined.hla_class_i_alleles.tolist()):
    cur_hla_set = set(hlas.split(","))
    if (len(cur_hla_set.intersection(hla_alleles_for_Glazer_2022_set)) == len(cur_hla_set)):
        subjects_to_consider += [name]

In [43]:
len(subjects_to_consider)

1443

In [44]:
# subset df_2018 data to only keep those with HLAs that we have pseudo amino acid information for
# or can find a match with pseudo information for
df_2018_kept = df_2018_combined.loc[df_2018_combined['ID'].isin(subjects_to_consider)]
df_2018_kept.shape

(1443, 14)

In [45]:
Counter([a==b for a, b in zip(df_2018_kept.ID, subjects_to_consider)])

Counter({True: 1443})

In [46]:
df_2018_kept[:2]

Unnamed: 0,ID,age,age_group,os_months,os_event,mutcnt,gender,drug_class,cancer_type,stage_m,stage,reference,hla_class_i_alleles,homozygous
0,CR0095,74.0,>71,67.936885,0.0,9.4,M,CTLA-4,Melanoma,M1b,Stage 4,Snyder et al. 2014,"A0201,A3101,B3502,B3906,C0702,C0401",0.0
1,CR04885,49.0,31-50,25.617525,0.0,70.066667,F,CTLA-4,Melanoma,M0,Stage 3,Snyder et al. 2014,"A0201,A3201,B0702,B1801,C0701,C0702",0.0


In [47]:
Counter(df_2018_kept.age_group)

Counter({'>71': 217, '31-50': 530, '50-60': 375, '61-70': 285, '<30': 36})

In [48]:
Counter(df_2018_kept.gender)

Counter({'M': 216, 'F': 141, nan: 1086})

In [49]:
Counter(df_2018_kept.drug_class)

Counter({'CTLA-4': 218,
         'PD-1': 189,
         'PD-L1 + CTLA-4': 4,
         'CTLA-4 + PD-1': 3,
         'PD-L1': 1,
         'PD-1/PDL-1': 857,
         'Combo': 171})

In [50]:
Counter(df_2018_kept.cancer_type)

Counter({'Melanoma': 498,
         'Non-Small Cell Lung Cancer': 337,
         'Bladder Cancer': 84,
         'Glioma': 76,
         'Renal Cell Carcinoma': 123,
         'Head and Neck Cancer': 56,
         'Esophagogastric Cancer': 29,
         'Soft Tissue Sarcoma': 14,
         'Gastrointestinal Neuroendocrine Tumor': 2,
         'Skin Cancer; Non-Melanoma': 13,
         'Cancer of Unknown Primary': 25,
         'Colorectal Cancer': 40,
         'Hepatobiliary Cancer': 15,
         'Small Cell Lung Cancer': 13,
         'Breast Cancer': 26,
         'Adrenocortical Carcinoma': 12,
         'Mesothelioma': 7,
         'Non-Hodgkin Lymphoma': 14,
         'Pancreatic Cancer': 20,
         'Gastrointestinal Stromal Tumor': 2,
         'Prostate Cancer': 10,
         'Thyroid Cancer': 6,
         'Bone Cancer': 6,
         'Hodgkin Lymphoma': 3,
         'Anal Cancer': 6,
         'Sex Cord Stromal Tumor': 1,
         'Salivary Gland Cancer': 1,
         'Ampullary Carcinoma': 1,
     

In [51]:
Counter(df_2018_kept.stage_m)

Counter({'M1b': 37, 'M0': 12, 'M1c': 266, 'M1a': 29, nan: 1099})

In [52]:
Counter(df_2018_kept.stage)

Counter({'Stage 4': 332, 'Stage 3': 12, nan: 1099})

In [53]:
Counter(df_2018_kept.reference)

Counter({'Snyder et al. 2014': 62,
         'Van Allen et al. 2015': 98,
         'Riaz et al. 2017': 68,
         'Hugo et al. 2016': 37,
         'Rizvi et al. 2015': 34,
         'Rizvi_CUMC': 58,
         'MSK-IMPACT': 1086})

In [54]:
Counter(df_2018_kept.homozygous)

Counter({0.0: 1106, 1.0: 337})

In [55]:
df_2018_kept.isna().sum()

ID                        0
age                    1086
age_group                 0
os_months                 0
os_event                  0
mutcnt                   58
gender                 1086
drug_class                0
cancer_type               2
stage_m                1099
stage                  1099
reference                 0
hla_class_i_alleles       0
homozygous                0
dtype: int64

In [56]:
# translate hla alleles to those we have pseudo sequence for

kept_trans_alleles = []

for x in df_2018_kept["hla_class_i_alleles"].tolist():
    
    cur_alleles = x.split(",")
    cur_trans_alleles = [hla if hla in hla_i_considered else hla_replace_dict[hla] for hla in cur_alleles]
    kept_trans_alleles += [cur_trans_alleles]

In [57]:
len(set([y for x in kept_trans_alleles for y in x]))

85

In [58]:
set([y for x in kept_trans_alleles for y in x]) == hla_i_considered

True

In [59]:
kept_trans_alleles_string = [",".join(x) for x in kept_trans_alleles]
kept_trans_alleles_string[3:6]

['A3301,A2601,B3801,B3501,C1203,C0401',
 'A0301,A0201,B4101,B5001,C1701,C0602',
 'A0205,A6801,B1402,B1801,C0701,C0802']

In [60]:
df_2018_kept["hla_class_i_alleles"].tolist()[3:6]

['A3301,A2601,B3801,B3501,C1203,C0401',
 'A0301,A0201,B4101,B5001,C1701,C0602',
 'A0205,A6801,B1402,B1801,C0701,C0802']

In [61]:
df_2018_kept['hla_class_i_alleles_replace'] = kept_trans_alleles_string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018_kept['hla_class_i_alleles_replace'] = kept_trans_alleles_string


In [62]:
list(zip(df_2018_kept['hla_class_i_alleles'], df_2018_kept['hla_class_i_alleles_replace']))[:3]

[('A0201,A3101,B3502,B3906,C0702,C0401',
  'A0201,A3101,B3502,B3906,C0702,C0401'),
 ('A0201,A3201,B0702,B1801,C0701,C0702',
  'A0201,A3201,B0702,B1801,C0701,C0702'),
 ('A2402,A2601,B1801,B4403,C1601,C1203',
  'A2402,A2601,B1801,B4403,C1601,C1203')]

In [64]:
df_2018_kept.to_csv("../results/st6_chowell_2018_kept_hla_match_replace.csv", index = False)