In [96]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

from sklearn.feature_selection import RFE


In [97]:
# costumized stuff  
patient_data = True
Simple =  False
OnlyTME = True

In [98]:
if OnlyTME == True: 
    cols_to_drop = ['mw', 'aro', 'inst', 'cys_red', 'Mut_MHCrank_EL', 'Mut_MHCrank_BA',
       'Expression_Level', 'priority_Score', 'response',
       'variant_allele_frequency', 'cellular_prevalence', 'Self_Similarity',
       'est_freq', 'Score_PRIME', 'helix', 'MeanHydroph_coreNoAnc',
       'Prop_Small', 'Prop_Aromatic', 'Prop_Basic', 'Prop_Acidic', 'pI', 'DAI',
       'X.Rank_Stab', 'foginess_score', 'Peptide', 'HLA_allele', 'Patient',
       'Partition', 'cohort']
    TME_inc = "OnlyTME"

In [99]:
# run the data with or without the patient data for tumor microenviroment and with or withput selected cols
#print("running model with only_imp_cols:" + only_imp_cols + "and patient_data"+ patient_data)

if patient_data == True and Simple == False: 
    cols_to_drop = ['response','Peptide', 'HLA_allele','Patient','Partition','cohort','est_freq',
                  'CD8 T cells','Cytotoxic lymphocytes','B lineage',
                   'NK cells','Monocytic lineage','Myeloid dendritic cells',
                   'Neutrophils','Endothelial cells']
    TME_inc = "TME_included"
    sim = "advance"
if patient_data == False and Simple == False: 
    cols_to_drop = ['response','Peptide', 'HLA_allele','Patient','Partition','cohort','est_freq',"CYT","HLA_expression",
                   'T cells','CD8 T cells','Cytotoxic lymphocytes','B lineage',
                   'NK cells','Monocytic lineage','Myeloid dendritic cells',
                   'Neutrophils','Endothelial cells','Fibroblasts']
    TME_inc = "TME_excluded"
    sim = "advance"
if patient_data == False and Simple == True: 
    cols_to_drop = ['response','Peptide', 'HLA_allele','Patient','Partition','cohort','est_freq',"CYT",
                   "Expression_Level", "cellular_prevalence","CYT","HLA_expression","variant_allele_frequency",
                   "Self_Similarity","priority_Score","foginess_score",
                   'T cells','CD8 T cells','Cytotoxic lymphocytes','B lineage',
                   'NK cells','Monocytic lineage','Myeloid dendritic cells',
                   'Neutrophils','Endothelial cells','Fibroblasts']
    TME_inc = "TME_excluded"
    sim = "Simple"
   




# data = data[["response","Peptide", "HLA_allele","Patient","Partition","cohort","est_freq","CYT","CD8A",
 #             "Prop_Hydrophobic","Score_PRIME","Mut_MHCrank_BA","Mut_MHCrank_EL","DAI",
  #          "Thalf.h.","mut_rep_rank_netstabpan","agretopicity","helix","inst","Prop_Polar","aro",
   #         "mw","Prop_Non.polar","Prop_Aliphatic","Prop_Tiny","Prop_Charged","Prop_Small","Prop_Basic"]]

In [100]:

    
data = pd.read_csv('../../data/03_partitioning_data/txt/03_2_filtered_data_model.txt', sep='\t')
Var_importance_filename = '../../results/RandomForrest/tabels/Feature_importance_' + TME_inc+sim +'.txt'
outfile = open(Var_importance_filename ,'w')

print(data.columns)

Index(['mw', 'aro', 'inst', 'cys_red', 'Mut_MHCrank_EL', 'Mut_MHCrank_BA',
       'Expression_Level', 'priority_Score', 'response',
       'variant_allele_frequency', 'cellular_prevalence', 'Self_Similarity',
       'est_freq', 'Score_PRIME', 'helix', 'MeanHydroph_coreNoAnc',
       'Prop_Small', 'Prop_Aromatic', 'Prop_Basic', 'Prop_Acidic', 'pI', 'DAI',
       'X.Rank_Stab', 'foginess_score', 'HLA_expression', 'CYT', 'T cells',
       'CD8 T cells', 'Cytotoxic lymphocytes', 'B lineage', 'NK cells',
       'Monocytic lineage', 'Myeloid dendritic cells', 'Neutrophils',
       'Endothelial cells', 'Fibroblasts', 'Peptide', 'HLA_allele', 'Patient',
       'Partition', 'cohort'],
      dtype='object')


In [101]:
# nested cross validation
partitioning_list = data.Partition.unique()
pred_df = pd.DataFrame()
for i in partitioning_list:
    test = data[data.Partition == i]
    train = data[data.Partition != i]
    info = test[["Peptide", "HLA_allele","Patient","cohort"]]
    print(len(info))
   # train_pos = train[train.response==1]
   # train_neg = train[train.response==0].sample(n=2000, random_state=n) #, random_state=45
   # train = shuffle(pd.concat([train_pos, train_neg], axis=0)).reset_index(drop=True)
    X_train = train.drop(cols_to_drop, axis=1).reset_index(drop=True)
    y_train = train["response"]
    feature_list = list(X_train.columns)


    # Instantiate the RF and the MLP
    rf = RandomForestClassifier(random_state = 42, max_depth = 4, n_estimators = 1000,n_jobs=-1)
   # rf = RFE(rf,n_features_to_select=5)

            # Train the models on training data
    rf.fit(X_train, y_train)
    X_test = test.drop(cols_to_drop, axis=1).reset_index(drop=True)
    y_test = test["response"]  

    #Get fetures importance per partition
    importances = list(rf.feature_importances_)
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
    # Sort
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
 #   [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
  #  gather all feature importance 
    for pair in feature_importances:
        outfile.write(str(pair[0]) + "\t" + str(pair[1]) + "\t" + str(i) + "\n")

    prediction_rf = rf.predict_proba(X_test)

    pred_val_rf = pd.DataFrame(prediction_rf[:,1])
    pred_val_rf.reset_index(drop=True, inplace=True)
    mapping = {pred_val_rf.columns[0]: "prediction_rf"}
    pred_val_rf = pred_val_rf.rename(columns=mapping)

    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    info.reset_index(drop=True, inplace=True)
    print(len(X_test))
    ped_df = pd.concat([info,X_test,y_test,pred_val_rf],axis = 1)
    ped_df["Partition"] = i
    pred_df = pred_df.append(ped_df)

        
print("RF AUC:",round(roc_auc_score(pred_df.response, pred_df.prediction_rf),4))    
outfile.close()  


3904
3904
3903
3903
3910
3910
3917
3917
3907
3907
RF AUC: 0.6971


In [102]:
# print to outfile 
pred_df.to_csv(r'../../results/RandomForrest/tabels/pred_df_'+TME_inc+sim+'.txt', index=None, sep=' ', mode='w')

In [79]:
print(pred_df)

          Peptide  HLA_allele  Patient   cohort         mw       aro  \
0     ILEYTDQISKY  HLA-A01:01  BC-1849  bladder  1372.5186  0.181818   
1       CIDFQPDIY  HLA-A01:01  BC-2389  bladder  1113.2396  0.222222   
2      SCIDFQPDIY  HLA-A01:01  BC-2389  bladder  1200.3169  0.200000   
3      ANDNSPFMLY  HLA-A01:01  BC-7577  bladder  1171.2789  0.200000   
4       FMIVALHLL  HLA-A02:01  BC-2131  bladder  1056.3637  0.111111   
...           ...         ...      ...      ...        ...       ...   
3902   AEAATGWELP  HLA-B44:02    RH-08   Basket  1044.1149  0.100000   
3903    GPAEAATGW  HLA-B44:02    RH-08   Basket   858.8946  0.111111   
3904    ASAHSPRSY  HLA-A11:01    RH-08   Basket   975.0164  0.111111   
3905  AEASAHSPRSY  HLA-B44:02    RH-08   Basket  1175.2083  0.090909   
3906    SAHSPRSYL  HLA-C05:01    RH-08   Basket  1017.0961  0.111111   

           inst  cys_red  Mut_MHCrank_EL  Mut_MHCrank_BA  ...  Prop_Aromatic  \
0     18.881818     2980          0.2636          0.620