In [2]:
import io
from IPython.nbformat import current
def execute_notebook(nbfile):
    """Run the code cells of another notebook inside the current IPython session.

    The notebook at ``nbfile`` is parsed as JSON, and every cell of its first
    worksheet whose type is 'code' is executed, in order, via ``run_cell``.
    Cells of any other type are skipped.
    """
    with io.open(nbfile) as handle:
        notebook = current.read(handle, 'json')
    shell = get_ipython()
    # Lazily filter so each cell is checked right before it is executed.
    code_cells = (entry for entry in notebook.worksheets[0].cells
                  if entry.cell_type == 'code')
    for code_cell in code_cells:
        shell.run_cell(code_cell.input)
# Execute the code cells of the two shared notebooks in this kernel.
# The paths are relative, so they resolve against the current working directory.
execute_notebook("../../../bin/imports.ipynb")
execute_notebook("../../../bin/plotting.ipynb")


- use IPython.nbformat for read/write/validate public API
- use IPython.nbformat.vX directly to composing notebooks of a particular version

  """)


Populating the interactive namespace from numpy and matplotlib


### Getting a clean run of all of the results

In [3]:
def create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs,
                          script_dir='/cellar/users/ramarty/Projects/hla_ii/data_analysis/patient_selection/cluster'):
    """Write an SGE array-job csh script that runs OR_clean.R once per task.

    Each of the seven list arguments holds one string per array task. Inside the
    generated script, task ``$SGE_TASK_ID`` (1-based) picks the matching entry of
    every list and passes them to ``OR_clean.R`` in the order
    ``$pan $class $name $threshold $tissue_f $mut_f $aff_f``.
    The task range (``#$ -t 1-N``) is taken from ``len(thresholds)``.

    Parameters
    ----------
    script_label : str
        Inserted into the output file name ``OR.<script_label>.sh``.
    pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs : list of str
        Per-task values; each list is space-joined into a csh word list.
    script_dir : str, optional
        Directory the script is written to. The default is the directory that
        was previously hard-coded, so existing calls behave as before.

    Raises
    ------
    ValueError
        If any list has fewer entries than ``thresholds``: the later tasks would
        index past the end of that csh word list and fail on the cluster.

    Warns
    -----
    UserWarning
        If any list has more entries than ``thresholds``: the surplus entries are
        never reached by any task, which usually means the lists are misaligned.
    """
    import warnings

    n_tasks = len(thresholds)

    # (csh list name, per-task csh variable, values), in the order they are written.
    task_lists = [
        ('thresholds', 'threshold', thresholds),
        ('classes', 'class', classes),
        ('tissue_fs', 'tissue_f', tissue_fs),
        ('mut_fs', 'mut_f', mut_fs),
        ('aff_fs', 'aff_f', aff_fs),
        ('names', 'name', names),
        ('pans', 'pan', pans),
    ]

    # Validate before touching the file system so a bad call leaves no partial script.
    too_short = ['{0} ({1})'.format(plural, len(values))
                 for plural, _, values in task_lists if len(values) < n_tasks]
    if too_short:
        raise ValueError(
            'expected {0} entries per list (len(thresholds)); too few in: {1}'.format(
                n_tasks, ', '.join(too_short)))

    too_long = ['{0} ({1})'.format(plural, len(values))
                for plural, _, values in task_lists if len(values) > n_tasks]
    if too_long:
        warnings.warn(
            'only the first {0} entries are used (len(thresholds)); extra entries in: {1}'.format(
                n_tasks, ', '.join(too_long)))

    script_lines = [
        "#! /bin/csh",
        "#$ -V",
        "#$ -S /bin/csh",
        "#$ -o /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -e /cellar/users/ramarty/Data/hla_ii/sge-system_files",
        "#$ -cwd",
        "#$ -t 1-{0}".format(n_tasks),
        "#$ -l h_vmem=6G",
        "#$ -tc 50",
        "#$ -l long",
    ]
    # Full word lists, one csh array per argument.
    script_lines += ["set {0}=({1})".format(plural, " ".join(values))
                     for plural, _, values in task_lists]
    script_lines.append("")
    # Per-task selection from each array.
    script_lines += ["set {0}=${1}[$SGE_TASK_ID]".format(singular, plural)
                     for plural, singular, _ in task_lists]
    script_lines += [
        "",
        "date",
        "hostname",
        "",
        "Rscript --vanilla /cellar/users/ramarty/Projects/hla_ii/data_analysis/patient_selection/OR_clean.R "
        "$pan $class $name $threshold $tissue_f $mut_f $aff_f",
        "",
        "date",
    ]

    new_script_file = '{0}/OR.{1}.sh'.format(script_dir, script_label)
    with open(new_script_file, 'w') as out_file:
        out_file.write("\n".join(script_lines) + "\n")

Actual

In [5]:
# 'cancer' script: four array tasks -- class I and class II with pan='1',
# then the same two classes with pan='0'.
subset = 'inclusive'
script_label = 'cancer'
n_tasks = 4

class_i_aff = 'patient_affinities.cancer.TCGA.inclusive.mut.ClassI.csv'
class_ii_aff = 'patient_affinities.cancer.TCGA.inclusive.mut.ClassII.csv'

pans = ['1'] * 2 + ['0'] * 2
classes = ['class_i', 'class_ii', 'class_i', 'class_ii']
names = ['cancer'] * n_tasks
thresholds = ['10'] * n_tasks
tissue_fs = ['patient_tissues.{0}.csv'.format(subset)] * n_tasks
mut_fs = ['patient_mutations.cancer.TCGA.inclusive.mut.csv'] * n_tasks
aff_fs = [class_i_aff, class_ii_aff, class_i_aff, class_ii_aff]

create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

Randomized

In [6]:
# 100 randomizations per (pan, class) combination -> 400 array tasks, laid out as
# pan='1'/class I, pan='1'/class II, pan='0'/class I, pan='0'/class II (100 tasks each).
n_random = 100
n_tasks = 4 * n_random

# One pan flag per task. The earlier ['1', '1']*200 + ['0', '0']*200 produced 800
# entries, so tasks 1-400 all read '1' and the pan='0' half was never submitted.
pans = ['1'] * (2 * n_random) + ['0'] * (2 * n_random)
classes = ['random'] * n_tasks
names = (['class_i_{0}'.format(x) for x in range(n_random)] +
         ['class_ii_{0}'.format(x) for x in range(n_random)]) * 2
thresholds = ['2'] * n_tasks
tissue_fs = ['patient_tissues.csv'] * n_tasks
mut_fs = ['patient_mutations.cancer.TCGA.inclusive.mut.csv'] * n_tasks
aff_fs = (['patient_affinities.cancer.TCGA.inclusive.mut.ClassI.csv'] * n_random +
          ['patient_affinities.cancer.TCGA.inclusive.mut.ClassII.csv'] * n_random) * 2

# Every list must line up task-for-task with the `-t 1-N` range of the script.
for per_task in (pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs):
    assert len(per_task) == n_tasks, 'per-task lists must all have n_tasks entries'

script_label = 'randomizations'
create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

Passenger

In [9]:
# 'passenger' script: class I / class II tasks with pan='1', then again with pan='0'.
subset = 'conservative'
script_label = 'passenger.{0}'.format(subset)
n_tasks = 4

# Shared file-name stems; only the trailing class suffix differs between tasks.
mut_stem = 'patient_mutations.passenger.TCGA.{0}.mut.'.format(subset)
aff_stem = 'patient_affinities.passenger.TCGA.{0}.mut.'.format(subset)

pans = ['1'] * 2 + ['0'] * 2
classes = ['class_i', 'class_ii', 'class_i', 'class_ii']
names = [script_label] * n_tasks
thresholds = ['2'] * n_tasks
tissue_fs = ['patient_tissues.{0}.csv'.format(subset)] * n_tasks
mut_fs = [mut_stem + 'ClassI.csv', mut_stem + 'ClassII.csv'] * 2
aff_fs = [aff_stem + 'ClassI.csv', aff_stem + 'ClassII.csv'] * 2

create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

Germline

In [19]:
# 'germline' script: class I / class II tasks with pan='1', then again with pan='0'.
subset = 'conservative'
script_label = 'germline.{0}'.format(subset)
n_tasks = 4

# Shared file-name stems; only the trailing class suffix differs between tasks.
mut_stem = 'patient_mutations.germline.TCGA.{0}.mut.'.format(subset)
aff_stem = 'patient_affinities.germline.TCGA.{0}.mut.'.format(subset)

pans = ['1'] * 2 + ['0'] * 2
classes = ['class_i', 'class_ii', 'class_i', 'class_ii']
names = [script_label] * n_tasks
thresholds = ['2'] * n_tasks
tissue_fs = ['patient_tissues.{0}.csv'.format(subset)] * n_tasks
mut_fs = [mut_stem + 'ClassI.csv', mut_stem + 'ClassII.csv'] * 2
aff_fs = [aff_stem + 'ClassI.csv', aff_stem + 'ClassII.csv'] * 2

create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

High Allelic Fraction

In [None]:
# see allelic_fraction notebook

Conservative

In [11]:
# 'conservative' script: class I / class II tasks with pan='1', then again with pan='0'.
script_label = 'conservative'
n_tasks = 4
aff_stem = 'patient_affinities.cancer.TCGA.conservative.mut.'

pans = ['1'] * 2 + ['0'] * 2
classes = ['class_i', 'class_ii', 'class_i', 'class_ii']
names = [script_label] * n_tasks
thresholds = ['5'] * n_tasks
tissue_fs = ['patient_tissues.csv'] * n_tasks
mut_fs = ['patient_mutations.cancer.TCGA.conservative.mut.csv'] * n_tasks
aff_fs = [aff_stem + 'ClassI.csv', aff_stem + 'ClassII.csv'] * 2

create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

Non-cancer

In [7]:
# 'non_cancer' script: class II only, one task with pan='1' and one with pan='0'.
subset = 'conservative'
script_label = 'non_cancer.{0}'.format(subset)
n_tasks = 2

pans = ['1', '0']
classes = ['class_ii'] * n_tasks
names = [script_label] * n_tasks
thresholds = ['2'] * n_tasks
tissue_fs = ['patient_tissues.{0}.csv'.format(subset)] * n_tasks
mut_fs = ['patient_mutations.non_cancer.TCGA.{0}.mut.csv'.format(subset)] * n_tasks
aff_fs = ['patient_affinities.non_cancer.TCGA.{0}.mut.ClassII.csv'.format(subset)] * n_tasks

create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

In [19]:
# 'known_non_cancer' script: class II only, one task with pan='1' and one with pan='0'.
# `subset` is set here so the cell no longer depends on whichever cell ran before it;
# 'conservative' is the value assigned by the preceding non_cancer cell.
subset = 'conservative'

pans = ['1', '0']
classes = ['class_ii']*2
names = ['known_non_cancer.{0}'.format(subset)]*2
thresholds = ['2']*2
tissue_fs = ['patient_tissues.{0}.csv'.format(subset)]*2
mut_fs = ['patient_mutations.known_non_cancer.TCGA.{0}.mut.csv'.format(subset)]*2
aff_fs = ['patient_affinities.known_non_cancer.TCGA.{0}.mut.ClassII.csv'.format(subset)]*2

script_label='known_non_cancer.{0}'.format(subset)
create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

CGC genes - this currently covers only the new ones, which may not be representative because they are likely mutated at lower frequency.

In [7]:
# 'cgc_new' script: class II only, one task with pan='1' and one with pan='0'.
script_label = 'cgc_new'
n_tasks = 2

pans = ['1', '0']
classes = ['class_ii'] * n_tasks
names = [script_label] * n_tasks
thresholds = ['5'] * n_tasks
tissue_fs = ['patient_tissues.conservative.csv'] * n_tasks
mut_fs = ['patient_mutations.{0}.TCGA.conservative.mut.ClassII.csv'.format(script_label)] * n_tasks
aff_fs = ['patient_affinities.{0}.TCGA.conservative.mut.ClassII.csv'.format(script_label)] * n_tasks

create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

In [12]:
# 'cgc' script: class II only, one task with pan='1' and one with pan='0'.
script_label = 'cgc'
n_tasks = 2

pans = ['1', '0']
classes = ['class_ii'] * n_tasks
names = [script_label] * n_tasks
thresholds = ['2'] * n_tasks
tissue_fs = ['patient_tissues.conservative.csv'] * n_tasks
mut_fs = ['patient_mutations.{0}.TCGA.conservative.mut.ClassII.csv'.format(script_label)] * n_tasks
aff_fs = ['patient_affinities.{0}.TCGA.conservative.mut.ClassII.csv'.format(script_label)] * n_tasks

create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

APC infiltration - by pan

In [3]:
# One script for the 'high' group and one for the 'low' group, written in that order.
# Each script has four tasks: class I / class II with pan='1', then with pan='0'.
subset = 'inclusive'
n_tasks = 4

for level in ['high', 'low']:
    group = '{0}_apc_expr'.format(level)
    script_label = '{0}.{1}'.format(group, subset)

    pans = ['1'] * 2 + ['0'] * 2
    classes = ['class_i', 'class_ii', 'class_i', 'class_ii']
    names = [script_label] * n_tasks
    thresholds = ['2'] * n_tasks
    tissue_fs = ['patient_tissues.{0}.csv'.format(subset)] * n_tasks
    mut_fs = ['patient_mutations.{0}.TCGA.{1}.mut.csv'.format(group, subset)] * n_tasks
    aff_fs = ['patient_affinities.{0}.TCGA.{1}.mut.ClassI.csv'.format(group, subset),
              'patient_affinities.{0}.TCGA.{1}.mut.ClassII.csv'.format(group, subset)] * 2

    create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

APC infiltration - by tissue

In [6]:
# One script for the 'high' group and one for the 'low' group, written in that order.
# Each script has four tasks: class I / class II with pan='1', then with pan='0'.
subset = 'conservative'
n_tasks = 4

for level in ['high', 'low']:
    group = '{0}_apc_expr_by_tissue'.format(level)
    script_label = '{0}.{1}'.format(group, subset)

    pans = ['1'] * 2 + ['0'] * 2
    classes = ['class_i', 'class_ii', 'class_i', 'class_ii']
    names = [script_label] * n_tasks
    thresholds = ['2'] * n_tasks
    tissue_fs = ['patient_tissues.{0}.csv'.format(subset)] * n_tasks
    mut_fs = ['patient_mutations.{0}.TCGA.{1}.mut.csv'.format(group, subset)] * n_tasks
    aff_fs = ['patient_affinities.{0}.TCGA.{1}.mut.ClassI.csv'.format(group, subset),
              'patient_affinities.{0}.TCGA.{1}.mut.ClassII.csv'.format(group, subset)] * 2

    create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

Cytolytic score - by tissue

In [9]:
# One script for the 'high' group and one for the 'low' group, written in that order.
# Each script has four tasks: class I / class II with pan='1', then with pan='0'.
subset = 'inclusive'
n_tasks = 4

for level in ['high', 'low']:
    group = '{0}_cytolytic_by_tissue'.format(level)
    script_label = '{0}.{1}'.format(group, subset)

    pans = ['1'] * 2 + ['0'] * 2
    classes = ['class_i', 'class_ii', 'class_i', 'class_ii']
    names = [script_label] * n_tasks
    thresholds = ['2'] * n_tasks
    tissue_fs = ['patient_tissues.{0}.csv'.format(subset)] * n_tasks
    mut_fs = ['patient_mutations.{0}.TCGA.{1}.mut.csv'.format(group, subset)] * n_tasks
    aff_fs = ['patient_affinities.{0}.TCGA.{1}.mut.ClassI.csv'.format(group, subset),
              'patient_affinities.{0}.TCGA.{1}.mut.ClassII.csv'.format(group, subset)] * 2

    create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

Cell score by tissue

In [13]:
# For every subset and cell type, write a 'high' script and then a 'low' script.
# Each script has four tasks: class I / class II with pan='1', then with pan='0'.
n_tasks = 4

for subset in ['conservative', 'inclusive']:
    for cell in ['CD4', 'CD8']:
        for level in ['high', 'low']:
            group = '{0}_{1}_by_tissue'.format(level, cell)
            script_label = '{0}.{1}'.format(group, subset)

            pans = ['1'] * 2 + ['0'] * 2
            classes = ['class_i', 'class_ii', 'class_i', 'class_ii']
            names = [script_label] * n_tasks
            thresholds = ['2'] * n_tasks
            tissue_fs = ['patient_tissues.{0}.csv'.format(subset)] * n_tasks
            mut_fs = ['patient_mutations.{0}.TCGA.{1}.mut.csv'.format(group, subset)] * n_tasks
            aff_fs = ['patient_affinities.{0}.TCGA.{1}.mut.ClassI.csv'.format(group, subset),
                      'patient_affinities.{0}.TCGA.{1}.mut.ClassII.csv'.format(group, subset)] * 2

            create_cluster_script(script_label, pans, classes, names, thresholds, tissue_fs, mut_fs, aff_fs)

## Decoding results

Pan

In [3]:
# Load and display the space-separated pan / class_ii result table for 'cancer' at threshold 2.
pan_class_ii_file = '/cellar/users/ramarty/Data/hla_ii/generated_data/OR_clean/pan/class_ii/cancer.thresh_2.txt'
pd.read_csv(pan_class_ii_file, sep=' ')

Unnamed: 0,OR,conf_OR_low,conf_OR_high,P
mutation,1.204147,1.177512,1.231384,1.401241e-59


In [4]:
# Load and display the space-separated pan / class_i result table for 'cancer' at threshold 2.
pan_class_i_file = '/cellar/users/ramarty/Data/hla_ii/generated_data/OR_clean/pan/class_i/cancer.thresh_2.txt'
pd.read_csv(pan_class_i_file, sep=' ')

Unnamed: 0,OR,conf_OR_low,conf_OR_high,P
mutation,1.198532,1.174747,1.222799,3.647245e-70


In [13]:
# Collect the 'OR' value of the 'mutation' row from each of the 100 pan-level
# class_ii randomization result files.
random_result_dir = '/cellar/users/ramarty/Data/hla_ii/generated_data/OR_clean/pan/random'

randomizations = []
for i in range(100):
    result_file = '{0}/class_ii_{1}.thresh_2.txt'.format(random_result_dir, i)
    # .loc is the label-based replacement for .ix, which was removed in pandas 1.0.
    randomizations.append(pd.read_csv(result_file, sep=' ').loc['mutation', 'OR'])

In [14]:
# Summary statistics over the values held in `randomizations`.
randomization_ors = pd.Series(randomizations)
randomization_ors.describe()

count    100.000000
mean       1.204430
std        0.005770
min        1.188676
25%        1.200565
50%        1.204966
75%        1.208086
max        1.220599
dtype: float64

Tissue

In [57]:
# Comma-separated tissue / class_ii result tables:
# df1 is the 'cancer' run and df2 the 'high_allelic_fraction' run, both at threshold 2.
tissue_class_ii_dir = '/cellar/users/ramarty/Data/hla_ii/generated_data/OR_clean/tissue/class_ii'
df1 = pd.read_csv(tissue_class_ii_dir + '/cancer.thresh_2.txt', sep=',')
df2 = pd.read_csv(tissue_class_ii_dir + '/high_allelic_fraction.thresh_2.txt', sep=',')

In [61]:
# Display the table read from tissue/class_ii/high_allelic_fraction.thresh_2.txt.
df2

Unnamed: 0,OR,Lci,Hci,P,Error
GBM,1.159707,1.032579,1.302488,0.01237778,0.127129
LUAD,1.072565,0.977291,1.177128,0.1399395,0.095274
LUSC,1.209307,1.087692,1.34452,0.0004406989,0.121615
BLCA,1.335281,1.19505,1.491969,3.261492e-07,0.140232
PAAD,1.241835,1.010325,1.526394,0.03962983,0.23151
COAD,1.158516,1.156833,1.160203,0.0,0.001684
STAD,1.159879,1.035224,1.299544,0.01056484,0.124655
SKCM,1.298856,1.204125,1.40104,1.310182e-11,0.094731
THCA,2.71778,2.333033,3.165977,1.007315e-37,0.384747
HNSC,1.196227,1.073135,1.333437,0.001220567,0.123092
