In [1]:
# mhcflurry_cloud example.
# Runs cross-validation over a few models.
import mhcflurry_cloud
import mhcflurry
import fancyimpute

Using Theano backend.


In [2]:
# Read the training dataset from the CSV file kept under ../test/data.
train_data = mhcflurry.dataset.Dataset.from_csv(
    "../test/data/bdata.2009.mhci.public.1.txt"
)
# Bare last expression: renders the Dataset repr (row count and allele list).
train_data

Dataset(n=137654, alleles=['ELA-A1', 'Gogo-B0101', 'H-2-DB', 'H-2-DD', 'H-2-KB', 'H-2-KD', 'H-2-KK', 'H-2-LD', 'HLA-A0101', 'HLA-A0201', 'HLA-A0202', 'HLA-A0203', 'HLA-A0205', 'HLA-A0206', 'HLA-A0207', 'HLA-A0210', 'HLA-A0211', 'HLA-A0212', 'HLA-A0216', 'HLA-A0219', 'HLA-A0250', 'HLA-A0301', 'HLA-A0302', 'HLA-A11', 'HLA-A1101', 'HLA-A2', 'HLA-A2301', 'HLA-A2402', 'HLA-A2403', 'HLA-A2501', 'HLA-A26', 'HLA-A2601', 'HLA-A2602', 'HLA-A2603', 'HLA-A2902', 'HLA-A3001', 'HLA-A3002', 'HLA-A3101', 'HLA-A3201', 'HLA-A3301', 'HLA-A6601', 'HLA-A6801', 'HLA-A6802', 'HLA-A6901', 'HLA-A8001', 'HLA-B0702', 'HLA-B0801', 'HLA-B0802', 'HLA-B0803', 'HLA-B1402', 'HLA-B1501', 'HLA-B1502', 'HLA-B1503', 'HLA-B1509', 'HLA-B1517', 'HLA-B1801', 'HLA-B2701', 'HLA-B2702', 'HLA-B2703', 'HLA-B2705', 'HLA-B3501', 'HLA-B3503', 'HLA-B3508', 'HLA-B3801', 'HLA-B3901', 'HLA-B4001', 'HLA-B4002', 'HLA-B4201', 'HLA-B44', 'HLA-B4402', 'HLA-B4403', 'HLA-B4501', 'HLA-B4601', 'HLA-B4801', 'HLA-B5101', 'HLA-B5301', 'HLA-B5401', '

In [3]:
# Build cross-validation folds for two alleles. Passing an imputer makes each
# fold carry an imputed training set alongside the plain train/test split.
imputer = fancyimpute.MICE(
    n_imputations=2,
    n_burn_in=1,
    n_nearest_columns=25,
)

folds = mhcflurry_cloud.cross_validation_folds(
    train_data,
    # Restrict the split to these alleles; each one gets n_folds folds.
    alleles=["HLA-A0201", "HLA-A0202"],
    n_folds=3,
    imputer=imputer,
    drop_similar_peptides=True,
    # NOTE(review): n_jobs / verbose look like joblib-style settings (the cell
    # output shows "[Parallel(n_jobs=-1)]" progress lines) -- confirm.
    n_jobs=-1,
    verbose=5,
)
# Show the resulting list of folds as the cell output.
folds

[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   27.9s remaining:   27.9s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   38.7s finished


[AlleleSpecificTrainTestFold(allele='HLA-A0201', train=Dataset(n=6376, alleles=['HLA-A0201']), imputed_train=Dataset(n=19167, alleles=['HLA-A0201']), test=Dataset(n=1486, alleles=['HLA-A0201'])),
 AlleleSpecificTrainTestFold(allele='HLA-A0201', train=Dataset(n=6377, alleles=['HLA-A0201']), imputed_train=Dataset(n=19153, alleles=['HLA-A0201']), test=Dataset(n=1526, alleles=['HLA-A0201'])),
 AlleleSpecificTrainTestFold(allele='HLA-A0201', train=Dataset(n=6377, alleles=['HLA-A0201']), imputed_train=Dataset(n=19138, alleles=['HLA-A0201']), test=Dataset(n=1477, alleles=['HLA-A0201'])),
 AlleleSpecificTrainTestFold(allele='HLA-A0202', train=Dataset(n=2612, alleles=['HLA-A0202']), imputed_train=Dataset(n=19304, alleles=['HLA-A0202']), test=Dataset(n=716, alleles=['HLA-A0202'])),
 AlleleSpecificTrainTestFold(allele='HLA-A0202', train=Dataset(n=2613, alleles=['HLA-A0202']), imputed_train=Dataset(n=19304, alleles=['HLA-A0202']), test=Dataset(n=709, alleles=['HLA-A0202'])),
 AlleleSpecificTrainTe

In [4]:
# Describe the candidate models as a hyperparameter grid. Only the activation
# takes more than one value here, so the grid holds two models. Hyperparameters
# not listed still show up in the displayed result with values supplied by
# models_grid.
# NOTE(review): the displayed models have 'impute': False even though the folds
# were built with an imputer -- confirm whether the imputed training sets are
# meant to be used.
models = mhcflurry_cloud.models_grid(
    activation=["tanh", "relu"],
    layer_sizes=[[4]],
    embedding_output_dim=[8],
    n_training_epochs=[3],
)
# Show the expanded list of model hyperparameter dicts.
models

[{'activation': 'tanh',
  'batch_size': 128,
  'dropout_probability': 0.5,
  'embedding_output_dim': 8,
  'fraction_negative': 0.2,
  'impute': False,
  'layer_sizes': [4],
  'max_ic50': 50000,
  'n_training_epochs': 3,
  'pretrain_decay': '1 / (1+epoch)**2'},
 {'activation': 'relu',
  'batch_size': 128,
  'dropout_probability': 0.5,
  'embedding_output_dim': 8,
  'fraction_negative': 0.2,
  'impute': False,
  'layer_sizes': [4],
  'max_ic50': 50000,
  'n_training_epochs': 3,
  'pretrain_decay': '1 / (1+epoch)**2'}]

In [6]:
# Train and test every model on every fold.
# The displayed result is a table with one row per (fold, model) pair, holding
# train/test sizes, metrics such as AUC, tau and F1, and the model's
# hyperparameters in "model_*" columns.
df = mhcflurry_cloud.train_across_models_and_folds(folds, models)
# Bare last expression: render the results table as the cell output.
df

  'precision', 'predicted', average, warn_for)


Unnamed: 0,allele,fold_num,model_num,train_size,test_size,imputed_train_size,train_auc,train_tau,train_f1,test_auc,...,model_fraction_negative,model_embedding_output_dim,model_activation,model_batch_size,model_max_ic50,model_dropout_probability,model_layer_sizes,model_n_training_epochs,model_impute,model_pretrain_decay
0,HLA-A0201,0,0,6376,1486,19167,0.860319,0.466539,0.458217,0.877381,...,0.2,8,tanh,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
1,HLA-A0201,0,1,6376,1486,19167,0.76498,0.331552,0.06198,0.754046,...,0.2,8,relu,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
2,HLA-A0201,1,0,6377,1526,19153,0.858363,0.452435,0.473381,0.869996,...,0.2,8,tanh,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
3,HLA-A0201,1,1,6377,1526,19153,0.861165,0.464758,0.41035,0.858737,...,0.2,8,relu,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
4,HLA-A0201,2,0,6377,1477,19138,0.895173,0.509815,0.499332,0.878287,...,0.2,8,tanh,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
5,HLA-A0201,2,1,6377,1477,19138,0.801731,0.381375,0.0,0.796867,...,0.2,8,relu,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
6,HLA-A0202,3,0,2612,716,19304,0.700828,0.266966,0.571429,0.651194,...,0.2,8,tanh,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
7,HLA-A0202,3,1,2612,716,19304,0.693713,0.236269,0.320106,0.658918,...,0.2,8,relu,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
8,HLA-A0202,4,0,2613,709,19304,0.790069,0.383208,0.690281,0.829016,...,0.2,8,tanh,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
9,HLA-A0202,4,1,2613,709,19304,0.657681,0.228233,0.621395,0.681189,...,0.2,8,relu,128,50000,0.5,[4],3,False,1 / (1+epoch)**2
