In [1]:
import io
from IPython.nbformat import current
def execute_notebook(nbfile):
    """Run the code cells of another notebook inside the current IPython session.

    The file at ``nbfile`` is parsed as a 'json' notebook with
    ``IPython.nbformat.current``; every cell of its first worksheet whose
    ``cell_type`` is 'code' is executed, in order, through ``run_cell``.
    Non-code cells are ignored.
    """
    with io.open(nbfile) as handle:
        notebook = current.read(handle, 'json')
    shell = get_ipython()
    # Only the first worksheet is considered.
    code_cells = (c for c in notebook.worksheets[0].cells if c.cell_type == 'code')
    for code_cell in code_cells:
        shell.run_cell(code_cell.input)
execute_notebook("/cellar/users/ramarty/Projects/hla_ii/bin/imports.ipynb")


- use IPython.nbformat for read/write/validate public API
- use IPython.nbformat.vX directly to composing notebooks of a particular version

  """)


Populating the interactive namespace from numpy and matplotlib


### Collecting results for alleles

Setup for script

In [2]:
# Read the netMHCIIpan allele list (one name per line), dropping names that start with 'H-'.
# The file is opened in a `with` block so the handle is closed as soon as the list is built.
with open('/cellar/users/ramarty/Data/hla_ii/presentation/other/netMHCIIpan_alleles.txt') as allele_file:
    alleles = [name for name in (line.strip() for line in allele_file) if name[:2] != 'H-']
len(alleles)

5620

In [3]:
def collect_allele_BR(category):
    """Build and save the mutation-by-allele matrix of minimum ranks for one category.

    Mutation IDs are read from ``residues/<category>.txt`` (one per line). For every
    allele in the notebook-level ``alleles`` list, ``affinities/<category>/<allele>.csv``
    is read (tab-separated, first line skipped) and, for each mutation, the minimum
    ``Rank`` among rows whose ``ID`` equals the mutation is stored (NaN when no row
    matches). The matrix is written to ``allele_matrices/<category>.csv``.

    Parameters
    ----------
    category : str
        Name used to fill in the residue, affinity and output paths.
    """
    residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
    with open(residue_path) as residue_file:
        mutations = [line.strip() for line in residue_file]
    df = pd.DataFrame({'mutation': mutations})

    for i, allele in enumerate(alleles):
        if i % 100 == 0:
            # Progress marker every 100 alleles.
            print(i)
        aff = pd.read_csv('/cellar/users/ramarty/Data/hla_ii/presentation/affinities/{0}/{1}.csv'.format(category, allele), sep='\t', skiprows=1)
        # One pass over the table: minimum Rank per ID, then aligned to the mutation
        # order (missing IDs become NaN) instead of re-filtering the frame per mutation.
        best_rank = aff.groupby('ID', sort=False)['Rank'].min()
        df[allele] = best_rank.reindex(mutations).values
    df.to_csv('/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.csv'.format(category))

Running on the cluster

In [5]:
def create_cluster_script_alleles(categories, conditions, script_path=None):
    """Write the csh array-job script that runs creating_allele_matrices.py.

    The script declares a task array of ``len(categories)`` tasks; task *i* picks
    the *i*-th entry of ``categories`` and of ``conditions`` via ``$SGE_TASK_ID``
    and passes both to the Python script.

    Parameters
    ----------
    categories : list of str
        One category per task.
    conditions : list of str
        One condition per task; must be as long as ``categories``.
    script_path : str, optional
        Where to write the script. Defaults to the project's ``allele_matrices.sh``.

    Raises
    ------
    ValueError
        If the two lists differ in length (a task would index past the end of
        the shorter list).
    """
    if len(conditions) != len(categories):
        raise ValueError("categories and conditions must have the same length "
                         "({0} != {1})".format(len(categories), len(conditions)))
    if script_path is None:
        script_path = '/cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/allele_matrices.sh'

    lines = [
        "#! /bin/csh",
        "#$ -V",
        "#$ -S /bin/csh",
        "#$ -o /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -e /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -cwd",
        "#$ -t 1-{0}".format(len(categories)),
        "#$ -l h_vmem=2G",
        "#$ -tc 50",
        "#$ -l long",
        "set categories=({0})".format(" ".join(categories)),
        "set conditions=({0})".format(" ".join(conditions)),
        "",
        "set category=$categories[$SGE_TASK_ID]",
        "set condition=$conditions[$SGE_TASK_ID]",
        "",
        "date",
        "hostname",
        "",
        "python /cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/creating_allele_matrices.py $category $condition",
        "",
        "date",
    ]
    with open(script_path, 'w') as out_file:
        out_file.write("\n".join(lines) + "\n")

In [6]:
# Generate the allele-matrix job script with a single task: category 'cgc_new', condition 'mut'.
categories = ['cgc_new']
conditions = ['mut']
create_cluster_script_alleles(categories, conditions)

### Merging results into patients

Setup for script

In [31]:
def PHBR(x):
    """Return the harmonic mean of the values in ``x``.

    ``x`` may be any iterable of numbers (list, tuple, NumPy array, or a pandas
    row as produced by ``DataFrame.apply(..., axis=1)``). Values are read by
    iteration rather than by ``x[i]``, so a label-indexed Series - including one
    with repeated column labels - is handled without relying on positional
    integer lookup.

    Raises
    ------
    ZeroDivisionError
        If ``x`` is empty or contains a zero.
    """
    values = [float(v) for v in x]
    reciprocal_sum = sum(1 / v for v in values)
    return len(values) / reciprocal_sum

In [5]:
def create_patient_matrix(category):
    """Combine per-allele ranks into one PHBR score per patient and save the matrix.

    Reads ``allele_matrices/<category>.csv``; for each patient in the notebook-level
    ``patient_dictionary`` it selects the columns named by that patient's entry and
    applies ``PHBR`` row-wise. Patients whose score cannot be computed are reported
    (ID plus the error) and left out. The remaining patient columns, indexed by
    mutation, are written to ``patient_matrices/<category>.csv``.

    Parameters
    ----------
    category : str
        Name used to fill in the input and output paths.
    """
    df = pd.read_csv('/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.csv'.format(category), index_col=0)
    patients_used = []
    # need to update patients used - maybe we are using less because they don't have types with affinity calls???
    for patient, patient_alleles in patient_dictionary.items():
        try:
            df[patient] = df[patient_alleles].apply(PHBR, axis=1)
        except Exception as err:
            # Keep the best-effort behaviour, but show why the patient was skipped
            # and let KeyboardInterrupt / SystemExit propagate.
            print('{0}: {1!r}'.format(patient, err))
        else:
            patients_used.append(patient)
    df.index = df['mutation']
    df[patients_used].to_csv('/cellar/users/ramarty/Data/hla_ii/presentation/patient_matrices/{0}.csv'.format(category))

In [17]:
# Load the TCGA class II HLA types. Opened in binary mode inside `with` so the handle is closed.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('/cellar/users/ramarty/Data/hla_ii/hla_types/TCGA.HLA_classII.p', 'rb') as pickle_file:
    patient_dictionary = pickle.load(pickle_file)

In [18]:
patient_dictionary.keys()[:3]

['TCGA-D3-A3CE', 'TCGA-D3-A3CB', 'TCGA-DH-A7UV']

In [19]:
patient_dictionary['TCGA-D3-A3CE']

['DRB1_0101',
 'DRB1_1501',
 'HLA-DPA10103_DPB10401',
 'HLA-DPA10103_DPB10301',
 'HLA-DPA10103_DPB10401',
 'HLA-DPA10103_DPB10301',
 'HLA-DQA10102_DQB10602',
 'HLA-DQA10102_DQB10501',
 'HLA-DQA10101_DQB10602',
 'HLA-DQA10101_DQB10501']

Running on the cluster

In [19]:
def create_cluster_script_patients_DR(categories, genes, script_path=None):
    """Write the csh array-job script that runs creating_patient_matrices.DR.py.

    The script declares a task array of ``len(categories)`` tasks; task *i* picks
    the *i*-th entry of ``categories`` and of ``genes`` via ``$SGE_TASK_ID`` and
    passes both to the Python script.

    Parameters
    ----------
    categories : list of str
        One category per task.
    genes : list of str
        One gene label per task; must be as long as ``categories``.
    script_path : str, optional
        Where to write the script. Defaults to the project's
        ``patient_matrices.genes.sh``.

    Raises
    ------
    ValueError
        If the two lists differ in length.
    """
    if len(genes) != len(categories):
        raise ValueError("categories and genes must have the same length "
                         "({0} != {1})".format(len(categories), len(genes)))
    if script_path is None:
        script_path = '/cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/patient_matrices.genes.sh'

    lines = [
        "#! /bin/csh",
        "#$ -V",
        "#$ -S /bin/csh",
        "#$ -o /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -e /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -cwd",
        "#$ -t 1-{0}".format(len(categories)),
        "#$ -l h_vmem=2G",
        "#$ -tc 50",
        "#$ -l long",
        "set categories=({0})".format(" ".join(categories)),
        "set genes=({0})".format(" ".join(genes)),
        "",
        "set category=$categories[$SGE_TASK_ID]",
        "set gene=$genes[$SGE_TASK_ID]",
        "",
        "date",
        "hostname",
        "",
        "python /cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/creating_patient_matrices.DR.py $category $gene",
        "",
        "date",
    ]
    with open(script_path, 'w') as out_file:
        out_file.write("\n".join(lines) + "\n")

In [20]:
# Nine tasks: each of the three categories is paired with DR, then DP, then DQ.
categories = ['tsgenes', 'oncogenes','indels']*3
genes = ['DR', 'DR', 'DR', 'DP', 'DP', 'DP', 'DQ', 'DQ', 'DQ']
# Alternative configuration (disabled): the 'random' category for each gene.
#categories = ['random']*3
#genes = ['DR', 'DP', 'DQ']
# Show the pairing before writing the job script.
print categories, genes
create_cluster_script_patients_DR(categories, genes)

['tsgenes', 'oncogenes', 'indels', 'tsgenes', 'oncogenes', 'indels', 'tsgenes', 'oncogenes', 'indels'] ['DR', 'DR', 'DR', 'DP', 'DP', 'DP', 'DQ', 'DQ', 'DQ']


In [8]:
def create_cluster_script_patients_all(categories, population, conditions, subsets, script_path=None):
    """Write the csh array-job script that runs creating_patient_matrices.classII.py.

    The script declares a task array of ``len(categories)`` tasks; task *i* picks
    the *i*-th entry of ``categories``, ``conditions`` and ``subsets`` via
    ``$SGE_TASK_ID``. ``population`` is the same for every task and is written
    literally into the command line.

    Parameters
    ----------
    categories : list of str
        One category per task.
    population : str
        Population name passed unchanged to the Python script.
    conditions : list of str
        One condition per task; must be as long as ``categories``.
    subsets : list of str
        One subset per task; must be as long as ``categories``.
    script_path : str, optional
        Where to write the script. Defaults to the project's
        ``patient_matrices.classII.sh``.

    Raises
    ------
    ValueError
        If ``conditions`` or ``subsets`` differs in length from ``categories``.
    """
    for label, values in (('conditions', conditions), ('subsets', subsets)):
        if len(values) != len(categories):
            raise ValueError("categories and {0} must have the same length "
                             "({1} != {2})".format(label, len(categories), len(values)))
    if script_path is None:
        script_path = '/cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/patient_matrices.classII.sh'

    lines = [
        "#! /bin/csh",
        "#$ -V",
        "#$ -S /bin/csh",
        "#$ -o /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -e /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -cwd",
        "#$ -t 1-{0}".format(len(categories)),
        "#$ -l h_vmem=2G",
        "#$ -tc 50",
        "#$ -l long",
        "set categories=({0})".format(" ".join(categories)),
        "set conditions=({0})".format(" ".join(conditions)),
        "set subsets=({0})".format(" ".join(subsets)),
        "",
        "set category=$categories[$SGE_TASK_ID]",
        "set condition=$conditions[$SGE_TASK_ID]",
        "set subset=$subsets[$SGE_TASK_ID]",
        "",
        "date",
        "hostname",
        "",
        "python /cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/creating_patient_matrices.classII.py $category {0} $subset $condition".format(population),
        "",
        "date",
    ]
    with open(script_path, 'w') as out_file:
        out_file.write("\n".join(lines) + "\n")

In [9]:
# Generate the patient-matrix job script for one TCGA task: 'cgc_new' / 'mut' / 'inclusive'.
categories = ['cgc_new']
conditions = ['mut']
subsets = ['inclusive']
create_cluster_script_patients_all(categories, 'TCGA', conditions, subsets)

In [2]:
# Generate the patient-matrix job script for three TCGA tasks, all 'mut' / 'inclusive'.
categories = ['tsgenes', 'oncogenes', 'indels']
conditions = ['mut', 'mut', 'mut']
subsets = ['inclusive']*3
create_cluster_script_patients_all(categories, 'TCGA', conditions, subsets)

In [10]:
# Generate the patient-matrix job script for the 'germline' and 'passenger' categories (TCGA).
categories = ['germline', 'passenger']
conditions = ['mut', 'mut']
subsets = ['inclusive']*2
create_cluster_script_patients_all(categories, 'TCGA', conditions, subsets)

In [6]:
# Generate the patient-matrix job script for the 'alternate' population.
categories = ['tsgenes', 'oncogenes', 'indels']
conditions = ['mut', 'mut', 'mut']
# create_cluster_script_patients_all requires a `subsets` list; without it the call
# raises TypeError. 'inclusive' is assumed because it is the only value used by the
# other calls in this notebook - TODO confirm it is right for 'alternate'.
subsets = ['inclusive']*3
create_cluster_script_patients_all(categories, 'alternate', conditions, subsets)

Also for MHC-I

In [8]:
def create_cluster_script_patients_all(categories, population, conditions, subsets, script_path=None):
    """Write the csh array-job script that runs creating_patient_matrices.classI.py.

    NOTE: this definition has the same name as the class II script writer defined
    earlier in the notebook and replaces it once this cell has run.

    The script declares a task array of ``len(categories)`` tasks; task *i* picks
    the *i*-th entry of ``categories``, ``conditions`` and ``subsets`` via
    ``$SGE_TASK_ID``. ``population`` is the same for every task and is written
    literally into the command line.

    Parameters
    ----------
    categories : list of str
        One category per task.
    population : str
        Population name passed unchanged to the Python script.
    conditions : list of str
        One condition per task; must be as long as ``categories``.
    subsets : list of str
        One subset per task; must be as long as ``categories``.
    script_path : str, optional
        Where to write the script. Defaults to the project's
        ``patient_matrices.classI.sh``.

    Raises
    ------
    ValueError
        If ``conditions`` or ``subsets`` differs in length from ``categories``.
    """
    for label, values in (('conditions', conditions), ('subsets', subsets)):
        if len(values) != len(categories):
            raise ValueError("categories and {0} must have the same length "
                             "({1} != {2})".format(label, len(categories), len(values)))
    if script_path is None:
        script_path = '/cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/patient_matrices.classI.sh'

    lines = [
        "#! /bin/csh",
        "#$ -V",
        "#$ -S /bin/csh",
        "#$ -o /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -e /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -cwd",
        "#$ -t 1-{0}".format(len(categories)),
        "#$ -l h_vmem=2G",
        "#$ -tc 50",
        "#$ -l long",
        "set categories=({0})".format(" ".join(categories)),
        "set conditions=({0})".format(" ".join(conditions)),
        "set subsets=({0})".format(" ".join(subsets)),
        "",
        "set category=$categories[$SGE_TASK_ID]",
        "set condition=$conditions[$SGE_TASK_ID]",
        "set subset=$subsets[$SGE_TASK_ID]",
        "",
        "date",
        "hostname",
        "",
        "python /cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/creating_patient_matrices.classI.py $category {0} $subset $condition".format(population),
        "",
        "date",
    ]
    with open(script_path, 'w') as out_file:
        out_file.write("\n".join(lines) + "\n")

In [9]:
# Generate the patient-matrix job script for one TCGA task: 'non_cancer' / 'mut' / 'inclusive'.
categories = ['non_cancer']
conditions = ['mut']
subsets = ['inclusive']
create_cluster_script_patients_all(categories, 'TCGA', conditions, subsets)

In [12]:
# Generate the patient-matrix job script for the 'germline' and 'passenger' categories (TCGA).
categories = ['germline', 'passenger']
conditions = ['mut', 'mut']
subsets = ['inclusive']*2
create_cluster_script_patients_all(categories, 'TCGA', conditions, subsets)

#### Merging results for MHC-I affinities of alternate populations

In [44]:
category='indels'

In [45]:
# Load the SRA class I HLA types. Opened in binary mode inside `with` so the handle is closed.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('/cellar/users/ramarty/Data/hla_ii/hla_types/SRA.HLA_classI.p', 'rb') as pickle_file:
    patient_dictionary = pickle.load(pickle_file)

In [46]:
patient_dictionary[patient_dictionary.keys()[0]]

{'A': ['A_0201', 'A_0101'],
 'B': ['B_0702', 'B_3701'],
 'C': ['C_0702', 'C_0602']}

In [47]:
df = pd.read_csv('/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.csv'.format(category), index_col=0)

In [48]:
df.head()

Unnamed: 0,mutation,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0104,DRB1_0105,DRB1_0106,DRB1_0107,DRB1_0108,DRB1_0109,...,HLA-DQA10602-DQB10635,HLA-DQA10602-DQB10636,HLA-DQA10602-DQB10637,HLA-DQA10602-DQB10638,HLA-DQA10602-DQB10639,HLA-DQA10602-DQB10640,HLA-DQA10602-DQB10641,HLA-DQA10602-DQB10642,HLA-DQA10602-DQB10643,HLA-DQA10602-DQB10644
0,EGFR_ifd_55174772,47.0,38.0,47.0,55.0,47.0,37.0,47.0,47.0,40.0,...,39.0,75.0,65.0,75.0,75.0,70.0,70.0,75.0,39.0,70.0
1,NOTCH1_ifd_136518618,70.0,80.0,75.0,80.0,70.0,80.0,70.0,70.0,70.0,...,38.0,46.0,46.0,46.0,46.0,55.0,55.0,47.0,38.0,55.0
2,PIK3R1_ifd_68295304,19.0,18.0,12.0,12.0,19.0,13.0,19.0,19.0,16.0,...,65.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,65.0,60.0
3,CTNNB1_ifd_41224643,70.0,75.0,42.0,85.0,70.0,60.0,70.0,70.0,43.0,...,3.0,39.0,9.0,39.0,39.0,12.0,12.0,23.0,3.0,12.0
4,PIK3R1_ifd_68293795,80.0,85.0,90.0,85.0,80.0,90.0,80.0,80.0,85.0,...,70.0,37.0,47.0,37.0,37.0,60.0,60.0,43.0,70.0,60.0


In [49]:
# Load the class I 'oncogenes' matrix. The path has no placeholder, so the former
# `.format(category)` call had no effect and was dropped; this cell always reads
# 'oncogenes' regardless of `category`.
df = pd.read_csv('/cellar/users/ramarty/Data/hla/residue/matrices/mut/class_i/oncogenes.all.max.txt', index_col=0)
# Rename columns to '<char 4>_<chars 5-6><chars 8-9>' of the original header
# (assumes headers shaped like 'HLA-A02:01' - TODO confirm).
# NOTE(review): this also rebinds the notebook-level `alleles` list read by collect_allele_BR.
df.columns = alleles = ['{0}_{1}{2}'.format(x[4], x[5:7], x[8:10]) for x in list(df.columns)]
df['mutation'] = df.index

In [50]:
df.head()

Unnamed: 0,A_0101,A_0102,A_0103,A_0106,A_0107,A_0108,A_0109,A_0110,A_0112,A_0113,...,G_0101,G_0102,G_0103,G_0104,G_0106,G_0107,G_0108,G_0109,E_0101,mutation
BRAF_V600E,12.0,7.5,14.0,12.0,14.0,9.5,12.0,8.5,13.0,1.6,...,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,17.0,BRAF_V600E
IDH1_R132H,4.5,3.0,5.0,3.0,3.5,3.5,4.5,2.5,1.6,12.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,14.0,IDH1_R132H
PIK3CA_E545K,7.0,6.0,7.5,6.5,6.0,5.5,7.0,6.0,4.0,1.1,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,6.5,2.5,PIK3CA_E545K
PIK3CA_H1047R,5.5,6.0,5.5,6.5,5.0,4.0,5.5,5.0,6.0,12.0,...,6.5,6.5,6.5,6.5,6.5,6.5,6.5,7.0,9.5,PIK3CA_H1047R
KRAS_G12D,17.0,32.0,15.0,19.0,17.0,16.0,17.0,14.0,13.0,6.5,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,15.0,KRAS_G12D


In [51]:
patient_dictionary[patient_dictionary.keys()[0]]

{'A': ['A_0201', 'A_0101'],
 'B': ['B_0702', 'B_3701'],
 'C': ['C_0702', 'C_0602']}

In [52]:
# Load the class I matrix for `category` and compute one PHBR score per patient.
df = pd.read_csv('/cellar/users/ramarty/Data/hla/residue/matrices/mut/class_i/{0}.all.max.txt'.format(category), index_col=0)
# Rename columns to '<char 4>_<chars 5-6><chars 8-9>' of the original header
# (assumes headers shaped like 'HLA-A02:01' - TODO confirm).
# NOTE(review): this also rebinds the notebook-level `alleles` list read by collect_allele_BR.
df.columns = alleles = ['{0}_{1}{2}'.format(x[4], x[5:7], x[8:10]) for x in list(df.columns)]
df['mutation'] = df.index

patients_used = []
for patient in patient_dictionary.keys():
    patient_alleles = []
    try:
        for gene in ['A', 'B', 'C']:
            patient_alleles.extend([x.strip() for x in patient_dictionary[patient][gene]])

        # Only patients with all six alleles (two per gene) are scored; others are skipped silently.
        if len(patient_alleles) == 6:
            df[patient] = df[patient_alleles].apply(PHBR, axis=1)
            patients_used.append(patient)
    except Exception as err:
        # Report why the patient failed (e.g. an allele missing from the matrix)
        # instead of only its ID; KeyboardInterrupt / SystemExit still propagate.
        print('{0}: {1!r}'.format(patient, err))
df.index = df['mutation']

In [53]:
df[patients_used].to_csv('/cellar/users/ramarty/Data/hla_ii/presentation/patient_matrices/{0}.all.SRA.classI.csv'.format(category))

### Merging MHC-I and MHC-II into a single predictor

In [6]:
# Load the TCGA class II HLA types. Opened in binary mode inside `with` so the handle is closed.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('/cellar/users/ramarty/Data/hla_ii/hla_types/TCGA.HLA_classII.p', 'rb') as pickle_file:
    patient_dictionary = pickle.load(pickle_file)

In [14]:
category = 'oncogenes'
df = pd.read_csv('/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.csv'.format(category), index_col=0)

In [21]:
# Trial run on the first ten patients: score each one that has all twelve class II alleles.
patients_used = []
# list(...) makes the slice work whether keys() returns a list or a view.
for patient in list(patient_dictionary.keys())[:10]:
    patient_alleles = []
    for gene in ['DR', 'DP', 'DQ']:
        patient_alleles.extend(patient_dictionary[patient][gene])

    if len(patient_alleles) == 12:
        df[patient] = df[patient_alleles].apply(PHBR, axis=1)
        patients_used.append(patient)

In [16]:
df.head()

Unnamed: 0,mutation,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0104,DRB1_0105,DRB1_0106,DRB1_0107,DRB1_0108,DRB1_0109,...,HLA-DQA10602-DQB10635,HLA-DQA10602-DQB10636,HLA-DQA10602-DQB10637,HLA-DQA10602-DQB10638,HLA-DQA10602-DQB10639,HLA-DQA10602-DQB10640,HLA-DQA10602-DQB10641,HLA-DQA10602-DQB10642,HLA-DQA10602-DQB10643,HLA-DQA10602-DQB10644
0,BRAF_V600E,55.0,60.0,55.0,60.0,55.0,60.0,55.0,55.0,55.0,...,19.0,25.0,25.0,25.0,25.0,23.0,23.0,27.0,19.0,23.0
1,IDH1_R132H,36.0,33.0,23.0,31.0,36.0,27.0,36.0,36.0,28.0,...,25.0,55.0,46.0,55.0,55.0,43.0,43.0,48.0,25.0,43.0
2,PIK3CA_E545K,60.0,55.0,65.0,55.0,60.0,65.0,60.0,60.0,65.0,...,65.0,65.0,65.0,65.0,65.0,70.0,70.0,75.0,65.0,70.0
3,PIK3CA_H1047R,12.0,15.0,30.0,15.0,12.0,19.0,12.0,12.0,15.0,...,60.0,55.0,65.0,55.0,55.0,60.0,60.0,55.0,60.0,60.0
4,KRAS_G12D,24.0,23.0,12.0,24.0,24.0,14.0,24.0,24.0,18.0,...,6.0,13.0,7.5,13.0,13.0,9.0,9.0,10.0,6.0,9.0


### Debugging the creation of patient matrices for the alternate population

In [10]:
def PHBR(x):
    """Return the harmonic mean of the values in ``x``.

    ``x`` may be any iterable of numbers (list, tuple, NumPy array, or a pandas
    row as produced by ``DataFrame.apply(..., axis=1)``). Values are read by
    iteration rather than by ``x[i]``, so a label-indexed Series - including one
    with repeated column labels - is handled without relying on positional
    integer lookup.

    Raises
    ------
    ZeroDivisionError
        If ``x`` is empty or contains a zero.
    """
    values = [float(v) for v in x]
    reciprocal_sum = sum(1 / v for v in values)
    return len(values) / reciprocal_sum

In [11]:
category='oncogenes'
population='alternate'
condition='mut'

In [12]:
# Pick the HLA-type pickle for the chosen population, then load it with the handle
# closed afterwards (binary mode, `with` block).
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
if population == 'TCGA':
    hla_types_path = '/cellar/users/ramarty/Data/hla_ii/hla_types/TCGA.HLA_classII.p'
else:
    hla_types_path = '/cellar/users/ramarty/Data/hla_ii/hla_types/Alternate.HLA_classII.p'
with open(hla_types_path, 'rb') as pickle_file:
    patient_dictionary = pickle.load(pickle_file)

# 'mut' reads <category>.csv; any other condition reads <category>.wt.csv.
if condition == 'mut':
    df = pd.read_csv('/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.csv'.format(category), index_col=0)
else:
    df = pd.read_csv('/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.wt.csv'.format(category), index_col=0)

In [28]:
patient = 'SRR486488'

In [29]:
# Gather the selected patient's DR, DP and DQ alleles, then try to score them.
patient_alleles = []
for gene in ['DR', 'DP', 'DQ']:
    patient_alleles.extend(patient_dictionary[patient][gene])
try:
    print(len(patient_alleles))
    if len(patient_alleles) == 12:
        df[patient] = df[patient_alleles].apply(PHBR, axis=1)
        patients_used.append(patient)
except Exception as err:
    # Print the failure reason along with the patient ID so the cause is visible
    # here rather than only after re-running the expression without the try block.
    print('{0}: {1!r}'.format(patient, err))

12
SRR486488


In [31]:
df[patient_alleles].apply(PHBR, axis=1)

KeyError: "['HLA-DPA10103-DPB116901' 'HLA-DPA10103-DPB116901'] not in index"

In [27]:
# Repeat the scoring for the first ten patients, reporting any that fail.
patients_used = []
# list(...) makes the slice work whether keys() returns a list or a view.
for patient in list(patient_dictionary.keys())[:10]:
    patient_alleles = []
    for gene in ['DR', 'DP', 'DQ']:
        patient_alleles.extend(patient_dictionary[patient][gene])
    try:
        print(len(patient_alleles))
        if len(patient_alleles) == 12:
            df[patient] = df[patient_alleles].apply(PHBR, axis=1)
            patients_used.append(patient)
    except Exception as err:
        # Show the patient ID together with the error that caused the skip.
        print('{0}: {1!r}'.format(patient, err))

12
12
12
12
SRR486488
12
12
12
12
12
12
