In [1]:
import io
from IPython.nbformat import current
def execute_notebook(nbfile):
    """Run every code cell of the notebook stored at ``nbfile`` in the current IPython shell.

    The notebook is parsed as JSON with ``IPython.nbformat.current`` and only the
    cells of its first worksheet are considered. Non-code cells are skipped.

    NOTE(review): the output captured below this cell is a deprecation notice that
    points to ``IPython.nbformat`` as the public read/write API - consider migrating
    once the target IPython version is confirmed.
    """
    with io.open(nbfile) as handle:
        notebook = current.read(handle, 'json')
    shell = get_ipython()
    first_worksheet = notebook.worksheets[0]
    code_cells = (entry for entry in first_worksheet.cells if entry.cell_type == 'code')
    for code_cell in code_cells:
        shell.run_cell(code_cell.input)
# Run the code cells of the shared imports notebook inside this kernel.
# Presumably this is where names used later without a local import (e.g. `pd`, `pickle`) come from - confirm.
execute_notebook("/cellar/users/ramarty/Projects/hla_ii/bin/imports.ipynb")

Populating the interactive namespace from numpy and matplotlib



- use IPython.nbformat for read/write/validate public API
- use IPython.nbformat.vX directly to composing notebooks of a particular version

  """)


### Collecting results for alleles

Setup for script

In [2]:
# Read the netMHCIIpan allele names (one per line), dropping every entry whose
# stripped name starts with 'H-'. The file is opened in a `with` block so the
# handle is closed as soon as the list is built.
with open('/cellar/users/ramarty/Data/hla_ii/presentation/other/netMHCIIpan_alleles.txt') as allele_file:
    alleles = [line.strip() for line in allele_file if line.strip()[:2] != 'H-']
len(alleles)

5620

In [3]:
def collect_allele_BR(category):
    """Build the mutation x allele matrix of best (minimum) ranks for ``category``.

    Reads the mutation identifiers from ``residues/<category>.txt`` (one per line),
    then, for every name in the module-level ``alleles`` list, reads the tab-separated
    affinity table ``affinities/<category>/<allele>.csv`` (first line skipped) and
    takes the minimum ``Rank`` among the rows whose ``ID`` equals each mutation.
    Mutations with no matching row get NaN. The result, with a ``mutation`` column
    followed by one column per allele, is written to
    ``allele_matrices/<category>.csv``.

    Parameters
    ----------
    category : str
        Name used to fill in the residue, affinity and output paths.
    """
    residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
    # Close the residue file deterministically instead of relying on garbage collection.
    with open(residue_path) as residue_file:
        mutations = [line.strip() for line in residue_file]
    df = pd.DataFrame({'mutation': mutations})

    for i, allele in enumerate(alleles):
        if i % 100 == 0:
            # Progress marker every 100 alleles (single-argument form prints the same on Python 2 and 3).
            print(i)
        aff = pd.read_csv('/cellar/users/ramarty/Data/hla_ii/presentation/affinities/{0}/{1}.csv'.format(category, allele), sep='\t', skiprows=1)
        # One grouped pass over the table replaces a full boolean-mask scan per mutation.
        # reindex() aligns the minima to the order of `mutations` and yields NaN for
        # mutations absent from the table, matching min() over an empty selection.
        best_rank = aff.groupby('ID')['Rank'].min()
        df[allele] = best_rank.reindex(mutations).values
    df.to_csv('/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.csv'.format(category))

Running on the cluster

In [4]:
def create_cluster_script_alleles(categories):
    """Write the csh array-job script that runs ``creating_allele_matrices.py`` per category.

    The script requests one task per entry of ``categories`` (``#$ -t 1-N``) and each
    task picks its category with ``$categories[$SGE_TASK_ID]``. The script is written
    to a fixed path, replacing any previous content.

    Parameters
    ----------
    categories : list of str
        Category names, joined with spaces into the csh ``categories`` array.
    """
    script_path = '/cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/allele_matrices.sh'

    with open(script_path, 'w') as out_file:
        script_lines = [
            # Scheduler directives.
            "#! /bin/csh",
            "#$ -V",
            "#$ -S /bin/csh",
            "#$ -o /cellar/users/ramarty/Data/hla_ii/sge-system_files",
            "#$ -e /cellar/users/ramarty/Data/hla_ii/sge-system_files",
            "#$ -cwd",
            "#$ -t 1-{0}".format(len(categories)),
            "#$ -l h_vmem=2G",
            "#$ -tc 50",
            "#$ -l long",
            # Per-task category selection.
            "set categories=({0})".format(" ".join(categories)),
            "",
            "set category=$categories[$SGE_TASK_ID]",
            "",
            # Payload, bracketed by timestamps.
            "date",
            "hostname",
            "",
            "python /cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/creating_allele_matrices.py $category",
            "",
            "date",
        ]
        out_file.write("\n".join(script_lines) + "\n")

In [5]:
# Generate the allele-matrix array-job script for the 'germline' category only.
categories = ['germline']
create_cluster_script_alleles(categories)

### Merging results into patients

Setup for script

In [4]:
def PHBR(x):
    """Return the harmonic mean of the values in ``x``.

    Computes ``len(x) / sum(1 / v for v in x)``. In this notebook it is applied
    row-wise (``DataFrame.apply(PHBR, axis=1)``) to the rank columns of one
    patient's alleles.

    Parameters
    ----------
    x : sequence of numbers (list, numpy array or pandas Series)
        Values to combine. They are iterated directly rather than looked up with
        ``x[i]``, so a Series works whatever its index labels are.

    Returns
    -------
    float

    Raises
    ------
    ZeroDivisionError
        If ``x`` is empty or contains a zero.
    """
    number_of_alleles = len(x)
    inverse_sum = sum(1 / float(value) for value in x)
    return number_of_alleles / float(inverse_sum)

In [5]:
def create_patient_matrix(category):
    """Combine per-allele ranks into one PHBR column per patient and save the result.

    Reads ``allele_matrices/<category>.csv``, and for every patient in the
    module-level ``patient_dictionary`` applies ``PHBR`` row-wise over the columns
    named by that patient's alleles. Patients whose score cannot be computed are
    reported and left out. The frame is then indexed by ``mutation`` and the
    columns of the successfully processed patients are written to
    ``patient_matrices/<category>.csv``.

    Parameters
    ----------
    category : str
        Name used to fill in the input and output paths.
    """
    df = pd.read_csv('/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.csv'.format(category), index_col=0)
    patients_used = []
    for patient in patient_dictionary.keys():
        patient_alleles = patient_dictionary[patient]
        try:
            df[patient] = df[patient_alleles].apply(PHBR, axis=1)
            patients_used.append(patient)
        except Exception as err:
            # Still best-effort per patient, but no longer a bare `except:` - that
            # also swallowed KeyboardInterrupt and hid the reason for the failure.
            print("{0}: {1!r}".format(patient, err))
    df.index = df['mutation']
    df[patients_used].to_csv('/cellar/users/ramarty/Data/hla_ii/presentation/patient_matrices/{0}.csv'.format(category))

In [17]:
# Load the patient -> HLA class II alleles mapping.
# The file is opened in binary mode inside `with` so the handle is closed promptly.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('/cellar/users/ramarty/Data/hla_ii/hla_types/TCGA.HLA_classII.p', 'rb') as pickle_file:
    patient_dictionary = pickle.load(pickle_file)

In [18]:
# Peek at the first three patient identifiers.
# list() gives the same result on Python 2 and also works on Python 3, where keys() is not subscriptable.
list(patient_dictionary)[:3]

['TCGA-D3-A3CE', 'TCGA-D3-A3CB', 'TCGA-DH-A7UV']

In [19]:
# Show the allele names stored for one example patient.
patient_dictionary['TCGA-D3-A3CE']

['DRB1_0101',
 'DRB1_1501',
 'HLA-DPA10103_DPB10401',
 'HLA-DPA10103_DPB10301',
 'HLA-DPA10103_DPB10401',
 'HLA-DPA10103_DPB10301',
 'HLA-DQA10102_DQB10602',
 'HLA-DQA10102_DQB10501',
 'HLA-DQA10101_DQB10602',
 'HLA-DQA10101_DQB10501']

Running on the cluster

In [19]:
def create_cluster_script_patients_DR(categories, genes):
    """Write the csh array-job script that runs ``creating_patient_matrices.DR.py``.

    The script requests one task per entry of ``categories`` (``#$ -t 1-N``). Task
    ``i`` runs the Python script with ``categories[i]`` and ``genes[i]``, so the two
    lists are consumed as parallel arrays. The script is written to a fixed path,
    replacing any previous content.

    Parameters
    ----------
    categories : list of str
        Category names, joined with spaces into the csh ``categories`` array.
    genes : list of str
        Gene names, joined with spaces into the csh ``genes`` array; must supply an
        entry for every task.

    Raises
    ------
    ValueError
        If ``genes`` has fewer entries than ``categories``. The task range is taken
        from ``categories``, so the later tasks would index past the end of the csh
        ``genes`` array. Checked before the script file is opened, so an existing
        script is not truncated.
    """
    if len(genes) < len(categories):
        raise ValueError(
            "genes has {0} entries but categories has {1}; every task needs a gene".format(
                len(genes), len(categories)))

    new_script_file = '/cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/patient_matrices.genes.sh'

    script_lines = [
        # Scheduler directives.
        "#! /bin/csh",
        "#$ -V",
        "#$ -S /bin/csh",
        "#$ -o /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -e /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -cwd",
        "#$ -t 1-{0}".format(len(categories)),
        "#$ -l h_vmem=2G",
        "#$ -tc 50",
        "#$ -l long",
        # Parallel arrays indexed by the task id.
        "set categories=({0})".format(" ".join(categories)),
        "set genes=({0})".format(" ".join(genes)),
        "",
        "set category=$categories[$SGE_TASK_ID]",
        "set gene=$genes[$SGE_TASK_ID]",
        "",
        # Payload, bracketed by timestamps.
        "date",
        "hostname",
        "",
        "python /cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/creating_patient_matrices.DR.py $category $gene",
        "",
        "date",
    ]

    with open(new_script_file, 'w') as out_file:
        out_file.write("\n".join(script_lines) + "\n")

In [20]:
# Task i of the array job receives categories[i] and genes[i], so the two lists are
# laid out to pair each of the three categories with each of the three genes.
categories = 3 * ['tsgenes', 'oncogenes', 'indels']
genes = ['DR'] * 3 + ['DP'] * 3 + ['DQ'] * 3
# Alternative run kept for reference:
#categories = ['random']*3
#genes = ['DR', 'DP', 'DQ']
print("{0} {1}".format(categories, genes))
create_cluster_script_patients_DR(categories, genes)

['tsgenes', 'oncogenes', 'indels', 'tsgenes', 'oncogenes', 'indels', 'tsgenes', 'oncogenes', 'indels'] ['DR', 'DR', 'DR', 'DP', 'DP', 'DP', 'DQ', 'DQ', 'DQ']


In [21]:
def create_cluster_script_patients_all(categories, population):
    """Write the csh array-job script that runs ``creating_patient_matrices.all.py``.

    The script requests one task per entry of ``categories`` (``#$ -t 1-N``); each
    task runs the Python script with its own category and the same ``population``
    argument. The script is written to a fixed path, replacing any previous content.

    Parameters
    ----------
    categories : list of str
        Category names, joined with spaces into the csh ``categories`` array.
    population : str
        Value appended verbatim as the second command-line argument.
    """
    script_path = '/cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/patient_matrices.all.sh'

    with open(script_path, 'w') as out_file:
        directives = (
            "#! /bin/csh",
            "#$ -V",
            "#$ -S /bin/csh",
            "#$ -o /cellar/users/ramarty/Data/hla_ii/sge-system_files",
            "#$ -e /cellar/users/ramarty/Data/hla_ii/sge-system_files",
            "#$ -cwd",
            "#$ -t 1-{0}".format(len(categories)),
            "#$ -l h_vmem=2G",
            "#$ -tc 50",
            "#$ -l long",
        )
        commands = (
            "set categories=({0})".format(" ".join(categories)),
            "",
            "set category=$categories[$SGE_TASK_ID]",
            "",
            "date",
            "hostname",
            "",
            "python /cellar/users/ramarty/Projects/hla_ii/data_gathering/affinities/creating_patient_matrices.all.py $category {0}".format(population),
            "",
            "date",
        )
        # Every line, blank ones included, is terminated by exactly one newline.
        out_file.writelines(text + "\n" for text in directives + commands)

In [15]:
# Generate the all-genes patient-matrix script for the 'random' category with population 'TCGA'.
# The commented-out line is an alternative category list (presumably from an earlier run - confirm before reuse).
#categories = ['viral', 'bacterial']
categories = ['random']
create_cluster_script_patients_all(categories, 'TCGA')

In [22]:
# Generate the all-genes patient-matrix script for 'germline' with population 'alternate'.
# create_cluster_script_patients_all always writes to the same script path in 'w' mode,
# so this replaces whatever script a previous call produced.
categories = ['germline']
create_cluster_script_patients_all(categories, 'alternate')

### Merging MHC-I and MHC-II into a single predictor

In [6]:
# Load the patient -> HLA class II alleles mapping.
# The file is opened in binary mode inside `with` so the handle is closed promptly.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('/cellar/users/ramarty/Data/hla_ii/hla_types/TCGA.HLA_classII.p', 'rb') as pickle_file:
    patient_dictionary = pickle.load(pickle_file)

In [14]:
# Load the allele matrix of the 'oncogenes' category; the first CSV column becomes the index.
category = 'oncogenes'
allele_matrix_path = '/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.csv'.format(category)
df = pd.read_csv(allele_matrix_path, index_col=0)

In [21]:
# For the first 10 patients, concatenate the DR, DP and DQ allele lists and, when
# exactly 12 alleles are found, add a per-patient PHBR column to `df`.
# NOTE(review): this cell looks up patient_dictionary[patient][gene], i.e. it expects a
# per-patient mapping keyed by 'DR'/'DP'/'DQ'. The output shown earlier in this notebook
# for patient_dictionary['TCGA-D3-A3CE'] is a flat list of 10 names - confirm which
# pickle layout this cell was run against.
# Python 2 only: keys() is sliced directly.
patients_used = []
for patient in patient_dictionary.keys()[:10]:
    patient_alleles = []
    for gene in ['DR', 'DP', 'DQ']:
        patient_alleles.extend(patient_dictionary[patient][gene])

    # Patients without exactly 12 alleles are skipped silently.
    if len(patient_alleles) == 12:
        #df[patient_alleles].apply(PHBR, axis=1)
        df[patient] = df[patient_alleles].apply(PHBR, axis=1)
        patients_used.append(patient)

In [16]:
# Preview the first rows of the allele matrix.
df.head()

Unnamed: 0,mutation,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0104,DRB1_0105,DRB1_0106,DRB1_0107,DRB1_0108,DRB1_0109,...,HLA-DQA10602-DQB10635,HLA-DQA10602-DQB10636,HLA-DQA10602-DQB10637,HLA-DQA10602-DQB10638,HLA-DQA10602-DQB10639,HLA-DQA10602-DQB10640,HLA-DQA10602-DQB10641,HLA-DQA10602-DQB10642,HLA-DQA10602-DQB10643,HLA-DQA10602-DQB10644
0,BRAF_V600E,55.0,60.0,55.0,60.0,55.0,60.0,55.0,55.0,55.0,...,19.0,25.0,25.0,25.0,25.0,23.0,23.0,27.0,19.0,23.0
1,IDH1_R132H,36.0,33.0,23.0,31.0,36.0,27.0,36.0,36.0,28.0,...,25.0,55.0,46.0,55.0,55.0,43.0,43.0,48.0,25.0,43.0
2,PIK3CA_E545K,60.0,55.0,65.0,55.0,60.0,65.0,60.0,60.0,65.0,...,65.0,65.0,65.0,65.0,65.0,70.0,70.0,75.0,65.0,70.0
3,PIK3CA_H1047R,12.0,15.0,30.0,15.0,12.0,19.0,12.0,12.0,15.0,...,60.0,55.0,65.0,55.0,55.0,60.0,60.0,55.0,60.0,60.0
4,KRAS_G12D,24.0,23.0,12.0,24.0,24.0,14.0,24.0,24.0,18.0,...,6.0,13.0,7.5,13.0,13.0,9.0,9.0,10.0,6.0,9.0


Debugging

In [20]:
def PHBR(x):
    """Return the harmonic mean of the values in ``x``.

    Computes ``len(x) / sum(1 / v for v in x)``. This repeats the ``PHBR`` defined
    in the "Merging results into patients" section; whichever cell ran last is the
    definition in effect.

    Parameters
    ----------
    x : sequence of numbers (list, numpy array or pandas Series)
        Values to combine. They are iterated directly rather than looked up with
        ``x[i]``, so a Series works whatever its index labels are.

    Returns
    -------
    float

    Raises
    ------
    ZeroDivisionError
        If ``x`` is empty or contains a zero.
    """
    number_of_alleles = len(x)
    inverse_sum = sum(1 / float(value) for value in x)
    return number_of_alleles / float(inverse_sum)

In [14]:
# Debugging setup: work on the 'oncogenes' category and reload the patient allele mapping.
category = 'oncogenes'
# Binary mode inside `with` closes the handle promptly.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('/cellar/users/ramarty/Data/hla_ii/hla_types/TCGA.HLA_classII.p', 'rb') as pickle_file:
    patient_dictionary = pickle.load(pickle_file)

In [15]:
# Debugging run of the patient merge on the first 10 patients only.
df = pd.read_csv('/cellar/users/ramarty/Data/hla_ii/presentation/allele_matrices/{0}.csv'.format(category), index_col=0)
patients_used = []
# list() slices the keys on both Python 2 and Python 3.
for patient in list(patient_dictionary)[:10]:
    patient_alleles = patient_dictionary[patient]
    try:
        df[patient] = df[patient_alleles].apply(PHBR, axis=1)
        patients_used.append(patient)
    except Exception as err:
        # Print the reason as well as the patient. The previous bare `except:` printed
        # only the id and also swallowed KeyboardInterrupt.
        # NOTE(review): the captured output lists all ten patients as failures. The matrix
        # header shown in this notebook has 'HLA-DQA1...-DQB1...' with a hyphen, while the
        # dictionary entries use an underscore - likely a column-name mismatch; confirm.
        print("{0}: {1!r}".format(patient, err))
df.index = df['mutation']

TCGA-D3-A3CE
TCGA-D3-A3CB
TCGA-DH-A7UV
TCGA-C5-A1BM
TCGA-C5-A1BL
TCGA-41-3915
TCGA-85-6560
TCGA-2V-A95S
TCGA-VQ-A94R
TCGA-DC-4745


In [18]:
# `patient` is whatever value the most recently executed `for patient in ...` loop left
# behind, so this cell depends on which loop cell ran before it.
patient_alleles = patient_dictionary[patient]

In [19]:
# Display the allele names of the patient selected above.
patient_alleles

['DRB1_0801',
 'DRB1_0801',
 'HLA-DPA10201_DPB10401',
 'HLA-DPA10201_DPB10501',
 'HLA-DPA10103_DPB10401',
 'HLA-DPA10103_DPB10501',
 'HLA-DQA10401_DQB10402',
 'HLA-DQA10401_DQB10402',
 'HLA-DQA10401_DQB10402',
 'HLA-DQA10401_DQB10402']