Combine the DQA & DQB alleles, and the DPA & DPB alleles, into paired allele names that follow the HLA-II allele format used in the DeWitt_2018 data, which is also the format used in the file:

    ../data/for_encoders/HLA_II_pseudo_45.csv

In addition, extract the list of all unique HLA-II alleles (including DRB1, DPAB and DQAB). 



In [1]:
import numpy as np
import pandas as pd

from collections import defaultdict
from collections import Counter

import re

In [2]:
# Directory for Liu 2019 data files; the unique-allele table is written here at the end.
data_dir = "../data/Liu_2019"

In [3]:
# Read the st18 output table and list its columns.
st18_kept_csv = "../results/st18_liu_2019_kept_hla_ii_match_replace.csv"
df_liu_2019_kept = pd.read_csv(st18_kept_csv, header=0)
df_liu_2019_kept.columns

Index(['Unnamed: 0', 'total_muts', 'nonsyn_muts', 'clonal_muts',
       'subclonal_muts', 'heterogeneity', 'total_neoantigens', 'CNA_prop',
       'gender (Male=1, Female=0)', 'biopsy site', 'monthsBiopsyPreTx', 'BR',
       'PFS', 'OS', 'TimeToBR', 'cyclesOnTherapy', 'txOngoing', 'Tx',
       'Mstage (IIIC=0, M1a=1, M1b=2, M1c=3)', 'Tx_Start_ECOG', 'Tx_Start_LDH',
       'LDH_Elevated', 'Brain_Met', 'Cut_SubQ_Met', 'LN_Met', 'Lung_Met',
       'Liver_Visc_Met', 'Bone_Met', 'progressed', 'dead', 'Primary_Type',
       'Histology', 'IOTherapy', 'steroidsGT10mgDaily', 'priorMAPKTx',
       'priorCTLA4', 'postCTLA4', 'postMAPKTx', 'postCombinedCTLA_PD1',
       'numPriorTherapies', 'biopsy site_categ',
       'biopsyContext (1=Pre-Ipi; 2=On-Ipi; 3=Pre-PD1; 4=On-PD1)',
       'daysBiopsyToPD1', 'daysBiopsyAfterIpiStart', 'purity', 'ploidy',
       'hla_class_ii_alleles', 'homozygous', 'hla_class_ii_alleles_replace'],
      dtype='object')

In [4]:
# Show the raw comma-separated allele string of the first row.
df_liu_2019_kept["hla_class_ii_alleles_replace"].iloc[0]

'DRB1*04:01,DRB1*15:01,DQA1*01:02,DQA1*03:01,DQB1*03:02,DQB1*06:02,DPA1*01:03,DPA1*01:03,DPB1*04:01,DPB1*04:01'

In [None]:
# Examples of the target allele name format:
#   HLA-DPAB*01:03_02:01
#   HLA-DQAB*01:01_02:01
#   HLA-DRB1*10:01

In [5]:
def _pair_chains(prefix, alpha_alleles, beta_alleles):
    """Return prefix + "<alpha>_<beta>" for every alpha/beta combination (alpha-major order)."""
    return [
        prefix + alpha.split("*")[-1] + "_" + beta.split("*")[-1]
        for alpha in alpha_alleles
        for beta in beta_alleles
    ]


def generate_alleles(x):
    """Reformat one comma-separated HLA-II allele string into paired allele names.

    Parameters
    ----------
    x : str
        Comma-separated alleles read by position: items 0-1 are used as the
        DRB alleles, 2-3 as DQA, 4-5 as DQB, 6-7 as DPA and 8-9 as DPB,
        e.g. "DRB1*03:01,DRB1*11:01,DQA1*05:01,DQA1*05:05,DQB1*02:01,...".
        The locus names are not checked; any entries after the tenth are ignored.
        Whitespace around each entry is stripped.

    Returns
    -------
    list of str
        Ten names in this order: the two DRB entries prefixed with "HLA-",
        the four DQA x DQB combinations as "HLA-DQAB*<A>_<B>", and the four
        DPA x DPB combinations as "HLA-DPAB*<A>_<B>". Duplicates are kept.

    Raises
    ------
    ValueError
        If `x` contains fewer than ten comma-separated entries.
    """
    # Strip so that ", "-separated input does not leak spaces into the names.
    items = [item.strip() for item in x.split(",")]

    if len(items) < 10:
        raise ValueError(
            "expected at least 10 comma-separated HLA-II alleles, got %d: %r"
            % (len(items), x)
        )

    DRB_pairs = ["HLA-" + allele for allele in items[:2]]
    DQAB_pairs = _pair_chains("HLA-DQAB*", items[2:4], items[4:6])
    DPAB_pairs = _pair_chains("HLA-DPAB*", items[6:8], items[8:10])

    return DRB_pairs + DQAB_pairs + DPAB_pairs

In [6]:
# Spot-check the reformatting on the row at position 20.
row_20_alleles = df_liu_2019_kept["hla_class_ii_alleles_replace"].iloc[20]
generate_alleles(row_20_alleles)

['HLA-DRB1*03:01',
 'HLA-DRB1*11:01',
 'HLA-DQAB*05:01_02:01',
 'HLA-DQAB*05:01_03:01',
 'HLA-DQAB*05:05_02:01',
 'HLA-DQAB*05:05_03:01',
 'HLA-DPAB*01:03_01:01',
 'HLA-DPAB*01:03_02:01',
 'HLA-DPAB*02:01_01:01',
 'HLA-DPAB*02:01_02:01']

In [7]:
# Raw input string for the same row, for comparison with the reformatted names.
df_liu_2019_kept["hla_class_ii_alleles_replace"].iloc[20]

'DRB1*03:01,DRB1*11:01,DQA1*05:01,DQA1*05:05,DQB1*02:01,DQB1*03:01,DPA1*01:03,DPA1*02:01,DPB1*01:01,DPB1*02:01'

In [8]:
# One list of reformatted allele names per row of the table.
raw_allele_strings = df_liu_2019_kept["hla_class_ii_alleles_replace"].tolist()
hla_class_ii_replace_reformat_list = list(map(generate_alleles, raw_allele_strings))

# The same names re-joined into one comma-separated string per row.
hla_class_ii_replace_reformat_string = list(
    map(",".join, hla_class_ii_replace_reformat_list)
)

len(hla_class_ii_replace_reformat_string)

120

In [9]:
# Attach the per-row comma-joined reformatted allele names as a new column.
df_liu_2019_kept['hla_class_ii_replace_reformat'] = hla_class_ii_replace_reformat_string

In [10]:
# Check the table's dimensions after adding the reformatted column.
df_liu_2019_kept.shape

(120, 50)

In [11]:
# Write the table, including the new reformatted column, without the row index.
st19_reformat_csv = "../results/st19_liu_2019_kept_hla_ii_replace_reformat.csv"
df_liu_2019_kept.to_csv(st19_reformat_csv, index=False)

Extract unique HLA-II alleles in the format of DeWitt 2018

In [12]:
# Flatten the per-row allele lists into a single list, keeping duplicates.
expand_list = []
for row_alleles in hla_class_ii_replace_reformat_list:
    expand_list.extend(row_alleles)
len(expand_list)

1200

In [13]:
# De-duplicate the allele names and keep them in sorted order.
unique_ii_allele_list = sorted(set(expand_list))
len(unique_ii_allele_list)

141

Verify that every unique HLA-II allele found here is among the DQA&DQB, DPA&DPB and DRB combinations derived from the DeWitt 2018 data.

The answer is yes: the set difference computed below is empty.

In [14]:
# Load the reference table; its `hla` column is compared against our alleles below.
pseudo_csv = "../data/for_encoders/HLA_II_pseudo_45.csv"
df_pseudo = pd.read_csv(pseudo_csv, header=0)

In [15]:
# Alleles present here but absent from the reference `hla` column (empty set means all are covered).
set(unique_ii_allele_list).difference(df_pseudo["hla"])

set()

In [16]:
# Single-column table of the unique allele names; preview the first six rows.
df_hla_ii_liu_2019_format = pd.DataFrame({"hla": unique_ii_allele_list})
df_hla_ii_liu_2019_format.head(6)

Unnamed: 0,hla
0,HLA-DPAB*01:03_01:01
1,HLA-DPAB*01:03_02:01
2,HLA-DPAB*01:03_03:01
3,HLA-DPAB*01:03_04:01
4,HLA-DPAB*01:03_04:02
5,HLA-DPAB*01:03_13:01


In [17]:
# Check the dimensions of the unique-allele table before saving it.
df_hla_ii_liu_2019_format.shape

(141, 1)

In [36]:
# Save the unique-allele table under data_dir, without the row index.
unique_alleles_csv = f"{data_dir}/liu_2019_hla_ii_format.csv"
df_hla_ii_liu_2019_format.to_csv(unique_alleles_csv, index=False)