# Ensemble Models for data of each pathogen (calculated separately)

In [1]:
#import libraries
import numpy as np
import pandas as pd
import glob
import math 

In [2]:
from tensorflow.keras.models import load_model

In [3]:
import gc

## Model and accuracy loading functions

In [4]:
def load_all_models(files_dict, pathogen, models_dir = "models/weights/"):
    """Load the saved Keras checkpoint(s) of every model listed in ``files_dict``.

    Checkpoints are located with the glob pattern
    ``<models_dir><pathogen>/<model>/sequential_1.<epoch>*.hdf5``, where the
    epoch is zero-padded to two digits.

    Args:
        files_dict: Mapping of model name -> epoch (anything ``int()`` accepts).
        pathogen: Sub-directory of ``models_dir`` that holds the checkpoints.
        models_dir: Root directory of the checkpoints. It is concatenated
            as-is, so it must end with a path separator.

    Returns:
        list: The loaded models in ``files_dict`` order. A model name without
        any matching file contributes a single ``None``; a model name with
        several matching files contributes one entry per file (sorted by
        path), so the result can be longer than ``files_dict``.
    """
    all_models = list()
    for model_name, epoch in files_dict.items():
        # Generate file path patterns
        pattern = models_dir + pathogen + "/" + model_name + "/" + "sequential" + "_1." + str(int(epoch)).zfill(2) + "*" + '.hdf5'
        print('Loading %s model...' % model_name)
        # glob returns matches in arbitrary order; sort for reproducible results
        file_paths = sorted(glob.glob(pattern))
        if not file_paths:
            print('✘ No models found for specified epoch')
            # Keep a placeholder so callers can see which model is missing
            all_models.append(None)
            continue
        for file_name in file_paths:
            # Load model from file
            all_models.append(load_model(file_name))
            print('✔ Loaded %s' % file_name)
    return all_models

In [5]:
def load_all_accuracies(files_dict, pathogen, accs_dir = "models/history_results/"):
    """Read the accuracy recorded at the chosen epoch for every model.

    For each model the first file (sorted by path) matching
    ``<accs_dir><pathogen>/df_results_<pathogen>_val_<model>*.csv`` is read,
    and the value in column ``acc`` at row label ``epoch - 1`` is taken.

    Args:
        files_dict: Mapping of model name -> epoch (anything ``int()`` accepts).
        pathogen: Sub-directory of ``accs_dir`` and part of the file name.
        accs_dir: Root directory of the history CSV files. It is concatenated
            as-is, so it must end with a path separator.

    Returns:
        list: One entry per model in ``files_dict`` order: the accuracy, or
        ``None`` when no CSV file matches or the requested epoch / ``acc``
        column is not present in the file.
    """
    all_accs = list()
    for model_name, epoch in files_dict.items():
        # Generate file path patterns
        pattern = accs_dir + pathogen + "/" + "df_results_" + pathogen + "_val_" + model_name + "*" + '.csv'
        print('Loading %s model accuracy...' % model_name)
        # glob returns matches in arbitrary order; sort so the pick is reproducible
        file_paths = sorted(glob.glob(pattern))
        if not file_paths:
            # Mirror load_all_models: report the miss and keep a placeholder
            # instead of failing with an IndexError
            print('✘ No accuracy file found for specified model')
            all_accs.append(None)
            continue
        file_path = file_paths[0]
        # Load accuracies
        acc_df = pd.read_csv(file_path)
        try:
            # Rows are 0-based while epochs are 1-based
            acc = acc_df.loc[int(epoch) - 1, "acc"]
        except KeyError:
            print('✘ No accuracy available for specified epoch')
            all_accs.append(None)
        else:
            all_accs.append(acc)
            print('✔ Loaded %s' % file_path)
    return all_accs

## Encoding functions

The functions below encode the input sequences. Two encodings are available, and which one is used depends on the model:
- CNN-LSTM and CNN-GRU: One hot encoding
- LSTM-Embedding and GRU-Embedding: Integer Encoding

In [6]:
def get_onehot_encoding(mydata, max_length = 4034):
    """One-hot encode amino-acid sequences into a zero-padded 3-D array.

    Args:
        mydata: Sequence of strings (one amino-acid sequence each).
        max_length: Length every sequence is padded to with all-zero rows.

    Returns:
        numpy.ndarray: Float array of shape ``(len(mydata), max_length, 20)``.
        Position ``[i, j, c]`` is 1 when residue ``j`` of sequence ``i`` is the
        ``c``-th entry of the amino-acid list below. Symbols that are not in
        the list are left as all-zero rows.

    Raises:
        IndexError: If a sequence is longer than ``max_length``.
    """
    amino = [
        'R', 'K', 'D', 'E', 'Q', 'N', 'H', 'S', 'T', 'Y', 
        'C', 'W', 'A', 'I', 'L', 'M', 'F', 'V', 'P', 'G'
    ]
    # Direct residue -> column lookup (built once, O(1) per residue)
    column_of = {residue: column for column, residue in enumerate(amino)}

    results = np.zeros((len(mydata), max_length, len(amino)))
    for i, sample in enumerate(mydata):
        for j, character in enumerate(sample):
            if j >= max_length:
                raise IndexError(
                    "sequence %d is longer than max_length (%d)" % (i, max_length)
                )
            column = column_of.get(character)
            if column is not None:
                results[i, j, column] = 1.
    return results

In [7]:
def get_integer_encoding(input, max_length = 4034):
    """Encode amino-acid sequences as zero-padded rows of integer codes.

    Args:
        input: Sequence of strings (one amino-acid sequence each).
        max_length: Length every sequence is padded to with zeros.

    Returns:
        numpy.ndarray: Float array of shape ``(len(input), max_length)``
        holding the code of each residue according to the table below.
        Symbols missing from the table are encoded as 0 (the padding value),
        mirroring the one-hot encoder, which leaves unknown symbols all-zero.

    Raises:
        IndexError: If a sequence is longer than ``max_length``.
    """
    dic = {
        'A': 1, 'B': 22, 'U': 23, 'J': 24, 'Z': 25, 'O': 26, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 
        'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 
        'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'X': 21
    }

    results = np.zeros((len(input), max_length))
    for i, sample in enumerate(input):
        for j, character in enumerate(sample):
            if j >= max_length:
                raise IndexError(
                    "sequence %d is longer than max_length (%d)" % (i, max_length)
                )
            # Default to 0: a plain dict.get() would return None, which NumPy
            # stores silently as NaN in a float array
            results[i, j] = dic.get(character, 0)
    return results

In [8]:
def get_encoding(input, method, max_length_ohe = 4034, max_length_integer = 4034):
    """Encode sequences with the encoding selected by ``method``.

    Args:
        input: Sequence of strings (one amino-acid sequence each).
        method: ``"ohe"`` for one-hot encoding or ``"int"`` for integer
            encoding.
        max_length_ohe: Padded length used by the one-hot encoding.
        max_length_integer: Padded length used by the integer encoding.

    Returns:
        The array produced by ``get_onehot_encoding`` or
        ``get_integer_encoding``.

    Raises:
        ValueError: If ``method`` is neither ``"ohe"`` nor ``"int"``.
    """
    if method == "ohe":
        return get_onehot_encoding(input, max_length_ohe)
    if method == "int":
        return get_integer_encoding(input, max_length_integer)
    # Fail with a clear message instead of an UnboundLocalError on `output`
    raise ValueError("Unknown encoding method %r (expected 'ohe' or 'int')" % (method,))

## Ensemble functions

### Function to load the models and the weights needed

In [9]:
def load_models_and_weights(files_dict, pathogen, models_dir = "models/weights/", accs_dir = "models/history_results/"):
    """Load the models, their accuracies and their names in one call.

    Args:
        files_dict: Mapping of model name -> epoch.
        pathogen: Data set the models belong to (sub-directory name).
        models_dir: Root directory passed to ``load_all_models``.
        accs_dir: Root directory passed to ``load_all_accuracies``.

    Returns:
        tuple: ``(models, weights, names)``, where ``weights`` are the
        accuracies returned by ``load_all_accuracies`` and ``names`` are the
        keys of ``files_dict`` in iteration order.
    """
    loaded_models = load_all_models(files_dict, pathogen, models_dir)
    # Blank line separating the two loaders' progress output
    print("")
    accuracies = load_all_accuracies(files_dict, pathogen, accs_dir)
    model_names = list(files_dict.keys())
    # Run a garbage-collection pass once loading is done
    gc.collect()

    return loaded_models, accuracies, model_names

### Functions for getting each ensemble method

The functions below implement the individual ensemble methods that combine the per-model predictions into a single result.

#### Voting classifier

In [10]:
# Majority vote across models (most frequent value along the first axis)
# Source https://stackoverflow.com/questions/12297016/how-to-find-most-frequent-values-in-numpy-ndarray
def ens_voting_classifier(ens_input):
    """Return, for every position, the most frequent value along axis 0.

    Args:
        ens_input: Array whose first axis enumerates the voters (models).

    Returns:
        Array with the first axis removed, holding the majority value per
        position. On a tie the smallest of the tied values is returned,
        because ``np.argmax`` picks the first maximum and ``np.unique``
        returns the values sorted.
    """
    vote_axis = 0
    # Replace every value by its index into the sorted unique values
    unique_values, inverse = np.unique(ens_input, return_inverse=True)
    codes = inverse.reshape(ens_input.shape)
    # Count, per position, how often each unique value was voted for
    n_bins = np.max(inverse) + 1
    counts = np.apply_along_axis(np.bincount, vote_axis, codes, None, n_bins)
    winners = np.argmax(counts, axis = 0)

    return unique_values[winners]

#### Weighted average

In [25]:
# Shifted logistic function, used to map x in [0,1] to f(x) in [0,1]
# Source of logistic function: https://en.wikipedia.org/wiki/Logistic_function
def sigmoid(x, x_0, k):
    """Evaluate the logistic function ``1 / (1 + e ** (-k * (x - x_0)))``.

    Args:
        x: Input value (scalar or NumPy array).
        x_0: Midpoint of the curve; the function returns 0.5 at ``x == x_0``.
        k: Steepness of the curve.
    """
    shifted = x - x_0
    return 1 / (1 + math.e ** (-k * shifted))

# Weighted-average ensemble of the model predictions
def ens_weighted_average(input_pred, weights):
    """Combine predictions by a weighted average and threshold the result.

    Args:
        input_pred: Array of predictions whose first axis enumerates the
            models.
        weights: One weight per model, passed to ``np.average``.

    Returns:
        Integer array (first axis removed) that is 0 where the averaged
        prediction, after the shifted sigmoid, is below 0.5 and 1 elsewhere.
    """
    averaged = np.average(input_pred, axis = 0, weights = weights)
    # Squash the average with a sigmoid centred at 0.5 before thresholding
    squashed = sigmoid(averaged, x_0 = 0.5, k = 10)

    return np.where(squashed < 0.5, 0, 1)

### Functions to calculate Ensemble

In [26]:
def calculate_ensemble(input_list, models, weights, names, encodings, maxlen_ohe, maxlen_int, models_dir = "models/weights/", accs_dir = "models/history_results/"):
    """Predict with every model and combine the results with both ensembles.

    Args:
        input_list: List of sequences (strings) to classify.
        models: Loaded models, each exposing ``predict``.
        weights: One weight per model for the weighted-average ensemble.
        names: One column name per model for the output data frame.
        encodings: Encoding method per model (``"ohe"`` or ``"int"``), in the
            same order as ``models``.
        maxlen_ohe: Padded sequence length for the one-hot encoding.
        maxlen_int: Padded sequence length for the integer encoding.
        models_dir: Unused; kept so existing calls keep working.
        accs_dir: Unused; kept so existing calls keep working.

    Returns:
        pandas.DataFrame: One row per input sequence with the columns
        ``sequence``, one binary prediction column per model (named after
        ``names``), ``ensemble_weighted`` and ``ensemble_voting``.
        NOTE(review): sequences and predictions are joined in one NumPy array
        before the frame is built, so the prediction columns likely hold
        strings rather than integers -- confirm before doing arithmetic.

    Raises:
        ValueError: If ``models`` contains ``None`` (the placeholder that
            ``load_all_models`` appends for a missing checkpoint).
    """
    num_inputs = len(input_list)
    num_models = len(models)

    # Fail early with a readable message instead of an AttributeError on None
    missing = [position for position, model in enumerate(models) if model is None]
    if missing:
        raise ValueError("No model loaded at position(s) %s of `models`" % missing)

    # Calculate predictions; encode the input once per encoding method rather
    # than once per model
    encoded_inputs = dict()
    predictions = list()
    for position, model in enumerate(models):
        method = encodings[position]
        if method not in encoded_inputs:
            encoded_inputs[method] = get_encoding(input_list, method, maxlen_ohe, maxlen_int)
        predictions.append(model.predict(encoded_inputs[method]))

    # All predictions, stacked with the models along the first axis
    predictions = np.asarray(predictions)
    predictions_binary = np.where(predictions < 0.5, 0, 1)

    # Define the weighted average ensemble prediction
    ens_res_weighted_avg = ens_weighted_average(predictions, weights = weights)

    # Define the voting classifier ensemble prediction
    ens_res_voting_classifier = ens_voting_classifier(predictions_binary)

    # Construct data frame: one row per sequence, one column per model/ensemble
    seq_col = np.transpose([input_list])
    pred_cols = np.concatenate((predictions_binary, [ens_res_weighted_avg], [ens_res_voting_classifier]))
    pred_cols = np.transpose(np.reshape(pred_cols, (num_models + 2, num_inputs)))
    df = pd.DataFrame.from_records(np.concatenate((seq_col, pred_cols), axis = 1))

    # Rename data frame columns (work on a copy so `names` is not modified)
    df_names = ["sequence"] + list(names) + ["ensemble_weighted", "ensemble_voting"]

    print("df names:", df_names)
    df.columns = df_names

    return df

## Run functions

In [13]:
# Limit the displayed column width to 20 characters so that long sequences
# are truncated when Pandas data frames are shown
pd.options.display.max_colwidth = 20

In [14]:
# Load the four bacteria models (model name -> epoch of the checkpoint to use)
# together with their accuracies and names: a (models, weights, names) tuple
models_and_weights_all_epochs_bacteria = load_models_and_weights(
    files_dict = {
        "cnn_lstm": "60", 
        "cnn_gru": "30",
        "gru_emb": "30",
        "lstm_emb": "28"
    },
    pathogen = "bacteria"
)

Loading cnn_lstm model...
✔ Loaded models/weights/bacteria/cnn_lstm/sequential_1.60-0.26.hdf5
Loading cnn_gru model...
✔ Loaded models/weights/bacteria/cnn_gru/sequential_1.30-0.15.hdf5
Loading gru_emb model...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


✔ Loaded models/weights/bacteria/gru_emb/sequential_1.30-0.20.hdf5
Loading lstm_emb model...
✔ Loaded models/weights/bacteria/lstm_emb/sequential_1.28-0.13.hdf5

Loading cnn_lstm model accuracy...
✔ Loaded models/history_results/bacteria/df_results_bacteria_val_cnn_lstm_secreted.csv
Loading cnn_gru model accuracy...
✔ Loaded models/history_results/bacteria/df_results_bacteria_val_cnn_gru_secreted.csv
Loading gru_emb model accuracy...
✔ Loaded models/history_results/bacteria/df_results_bacteria_val_gru_emb_secreted.csv
Loading lstm_emb model accuracy...
✔ Loaded models/history_results/bacteria/df_results_bacteria_val_lstm_emb_secreted.csv


In [18]:
# Inspect the loaded (models, weights, names) tuple for bacteria
models_and_weights_all_epochs_bacteria

([<tensorflow.python.keras.engine.training.Model at 0xb389f7048>,
  <tensorflow.python.keras.engine.sequential.Sequential at 0xb3de3fb70>,
  <tensorflow.python.keras.engine.sequential.Sequential at 0xb3e8e9be0>,
  <tensorflow.python.keras.engine.sequential.Sequential at 0xb4002fb70>],
 [0.8552631610318234,
  0.9473684210526316,
  0.8947368389681766,
  0.934210529452876],
 ['cnn_lstm', 'cnn_gru', 'gru_emb', 'lstm_emb'])

In [36]:
# Load the four models for the "all" data set (model name -> epoch of the
# checkpoint to use) together with their accuracies and names
models_and_weights_all_epochs_all = load_models_and_weights(
    files_dict = {
        "cnn_lstm": "30", 
        "cnn_gru": "40",
        "gru_emb": "60",
        "lstm_emb": "30"
    },
    pathogen = "all"
)

Loading cnn_lstm model...
✔ Loaded models/weights/all/cnn_lstm/sequential_1.30-0.54.hdf5
Loading cnn_gru model...
✔ Loaded models/weights/all/cnn_gru/sequential_1.40-0.34.hdf5
Loading gru_emb model...
✔ Loaded models/weights/all/gru_emb/sequential_1.60-0.33.hdf5
Loading lstm_emb model...
✔ Loaded models/weights/all/lstm_emb/sequential_1.30-0.39.hdf5

Loading cnn_lstm model accuracy...
✔ Loaded models/history_results/all/df_results_all_val_cnn_lstm_secreted.csv
Loading cnn_gru model accuracy...
✔ Loaded models/history_results/all/df_results_all_val_cnn_gru_secreted.csv
Loading gru_emb model accuracy...
✔ Loaded models/history_results/all/df_results_all_val_gru_emb_secreted.csv
Loading lstm_emb model accuracy...
✔ Loaded models/history_results/all/df_results_all_val_lstm_emb_secreted.csv


In [40]:
# Load the four oomycete models (model name -> epoch of the checkpoint to use)
# together with their accuracies and names
models_and_weights_all_epochs_oomycete = load_models_and_weights(
    files_dict = {
        "cnn_lstm": "30", 
        "cnn_gru": "30",
        "gru_emb": "30",
        "lstm_emb": "30"
    },
    pathogen = "oomycete"
)

Loading cnn_lstm model...
✔ Loaded models/weights/oomycete/cnn_lstm/sequential_1.30-0.63.hdf5
Loading cnn_gru model...
✔ Loaded models/weights/oomycete/cnn_gru/sequential_1.30-0.43.hdf5
Loading gru_emb model...
✔ Loaded models/weights/oomycete/gru_emb/sequential_1.30-1.49.hdf5
Loading lstm_emb model...
✔ Loaded models/weights/oomycete/lstm_emb/sequential_1.30-0.31.hdf5

Loading cnn_lstm model accuracy...
✔ Loaded models/history_results/oomycete/df_results_oomycete_val_cnn_lstm_saved_model.csv
Loading cnn_gru model accuracy...
✔ Loaded models/history_results/oomycete/df_results_oomycete_val_cnn_gru_secreted.csv
Loading gru_emb model accuracy...
✔ Loaded models/history_results/oomycete/df_results_oomycete_val_gru_emb_saved_model.csv
Loading lstm_emb model accuracy...
✔ Loaded models/history_results/oomycete/df_results_oomycete_val_lstm_emb_saved_model.csv


In [16]:
# Load the four fungi models (model name -> epoch of the checkpoint to use)
# together with their accuracies and names
models_and_weights_all_epochs_fungi = load_models_and_weights(
    files_dict = {
        "cnn_lstm": "30", 
        "cnn_gru": "60",
        "gru_emb": "30",
        "lstm_emb": "60"
    },
    pathogen = "fungi"
)

Loading cnn_lstm model...
✔ Loaded models/weights/fungi/cnn_lstm/sequential_1.30-0.35.hdf5
Loading cnn_gru model...
✔ Loaded models/weights/fungi/cnn_gru/sequential_1.60-0.43.hdf5
Loading gru_emb model...
✔ Loaded models/weights/fungi/gru_emb/sequential_1.30-0.48.hdf5
Loading lstm_emb model...
✔ Loaded models/weights/fungi/lstm_emb/sequential_1.60-0.50.hdf5

Loading cnn_lstm model accuracy...
✔ Loaded models/history_results/fungi/df_results_fungi_val_cnn_lstm_secreted.csv
Loading cnn_gru model accuracy...
✔ Loaded models/history_results/fungi/df_results_fungi_val_cnn_gru_secreted.csv
Loading gru_emb model accuracy...
✔ Loaded models/history_results/fungi/df_results_fungi_val_gru_emb_secreted.csv
Loading lstm_emb model accuracy...
✔ Loaded models/history_results/fungi/df_results_fungi_val_lstm_emb_secreted.csv


In [30]:
# Load only the CNN-LSTM and GRU-Embedding fungi models (a two-model subset
# of the fungi ensemble) together with their accuracies and names
models_and_weights_all_epochs_fungi_cnnlstm_gruemb = load_models_and_weights(
    files_dict = {
        "cnn_lstm": "30", 
        "gru_emb": "30"
    },
    pathogen = "fungi"
)

Loading cnn_lstm model...
✔ Loaded models/weights/fungi/cnn_lstm/sequential_1.30-0.35.hdf5
Loading gru_emb model...
✔ Loaded models/weights/fungi/gru_emb/sequential_1.30-0.48.hdf5

Loading cnn_lstm model accuracy...
✔ Loaded models/history_results/fungi/df_results_fungi_val_cnn_lstm_secreted.csv
Loading gru_emb model accuracy...
✔ Loaded models/history_results/fungi/df_results_fungi_val_gru_emb_secreted.csv


## Ensemble of the whole test bacteria data

In [24]:
# Read the bacteria testing split
test_seq = pd.read_csv("../../../data/secreted_data/ready_to_process/splitted-data/bacteria_testing.csv")

In [31]:
# test_seq
# test_seq.columns = ["Sequence"]

In [26]:
# Keep only the `Sequence` column, as a plain Python list
test_seq = test_seq.Sequence.tolist()
# test_seq

In [22]:
# Run both ensembles on the bacteria test sequences.
# The bacteria models live in `models_and_weights_all_epochs_bacteria`, and
# `calculate_ensemble` takes separate `maxlen_ohe` / `maxlen_int` arguments.
test = calculate_ensemble(
    input_list = test_seq,
    models = models_and_weights_all_epochs_bacteria[0],
    weights = models_and_weights_all_epochs_bacteria[1],
    names = models_and_weights_all_epochs_bacteria[2],
    encodings = ["ohe", "ohe", "int", "int"], 
    maxlen_ohe = 2574, 
    maxlen_int = 2574
)

df names: ['sequence', 'cnn_lstm', 'cnn_gru', 'gru_emb', 'lstm_emb', 'ensemble_weighted', 'ensemble_voting']


In [None]:
# Display the bacteria ensemble predictions
test

In [23]:
# Save the bacteria ensemble predictions as CSV
test.to_csv("pred_result/bacteria/df_pred_ens_bacteria.csv")

## Ensemble of the whole test all data

In [30]:
# Read the testing split of the "all" data set
test_seq_all = pd.read_csv("../../../data/secreted_data/ready_to_process/splitted-data/all_testing.csv")

In [32]:
# Keep only the `Sequence` column, as a plain Python list
test_seq_all = test_seq_all.Sequence.tolist()

In [37]:
# Run both ensembles on the test sequences of the "all" data set.
# `calculate_ensemble` takes separate `maxlen_ohe` / `maxlen_int` arguments.
pred_ensemble_all = calculate_ensemble(
    input_list = test_seq_all,
    models = models_and_weights_all_epochs_all[0],
    weights = models_and_weights_all_epochs_all[1],
    names = models_and_weights_all_epochs_all[2],
    encodings = ["ohe", "ohe", "int", "int"], 
    maxlen_ohe = 4034, 
    maxlen_int = 4034
)

df names: ['sequence', 'cnn_lstm', 'cnn_gru', 'gru_emb', 'lstm_emb', 'ensemble_weighted', 'ensemble_voting']


In [38]:
# Display the ensemble predictions for the "all" data set
pred_ensemble_all

Unnamed: 0,sequence,cnn_lstm,cnn_gru,gru_emb,lstm_emb,ensemble_weighted,ensemble_voting
0,MFFPSLILAAGSLSTL...,1,1,1,0,1,1
1,MVKLYCAVVGVAGSAF...,1,1,1,1,1,1
2,MPARHHTIQRKRSIGA...,0,0,0,0,0,0
3,MMQWSAILIRTCFSGS...,0,1,1,1,1,1
4,MRVTTFNTFLLTLGTV...,1,1,1,1,1,1
5,MRSIFYVALAFAVLAR...,1,1,1,1,1,1
6,MRVLRVTFLWALLLLV...,0,0,0,0,0,0
7,MAIDMYLKVDGVTGES...,1,1,1,1,1,1
8,MHLSYVLLMVAATLLA...,0,0,1,0,0,0
9,MRLHILLFTLSSSTSL...,1,0,0,0,0,0


In [39]:
# Save the ensemble predictions for the "all" data set as CSV
pred_ensemble_all.to_csv("pred_result/all/df_pred_ens_all.csv")

## Ensemble of the whole test oomycete data

In [41]:
# Read the oomycete testing split
test_seq_oomycete = pd.read_csv("../../../data/secreted_data/ready_to_process/splitted-data/oomycete_testing.csv")

In [42]:
# Keep only the `Sequence` column, as a plain Python list
test_seq_oomycete = test_seq_oomycete.Sequence.tolist()

In [56]:
# Run both ensembles on the oomycete test sequences.
# NOTE(review): the one-hot and integer encodings use different padded lengths
# here (934 vs 820) -- presumably the input lengths the respective oomycete
# models were trained with; confirm against the training notebooks.
pred_ensemble_oomycete = calculate_ensemble(
    input_list = test_seq_oomycete,
    models = models_and_weights_all_epochs_oomycete[0],
    weights = models_and_weights_all_epochs_oomycete[1],
    names = models_and_weights_all_epochs_oomycete[2],
    encodings = ["ohe", "ohe", "int", "int"], 
    maxlen_ohe = 934, 
    maxlen_int = 820
)

df names: ['sequence', 'cnn_lstm', 'cnn_gru', 'gru_emb', 'lstm_emb', 'ensemble_weighted', 'ensemble_voting']


In [57]:
# Save the oomycete ensemble predictions as CSV
pred_ensemble_oomycete.to_csv("pred_result/oomycete/df_pred_ens_oomycete.csv")

## Ensemble of the whole test fungi data

In [16]:
# Read the fungi testing split
test_seq_fungi = pd.read_csv("../../../data/secreted_data/ready_to_process/splitted-data/fungi_testing.csv")

In [17]:
# Keep only the `Sequence` column, as a plain Python list
test_seq_fungi = test_seq_fungi.Sequence.tolist()

In [1]:
# Run both ensembles on the fungi test sequences with all four fungi models
pred_ensemble_fungi = calculate_ensemble(
    input_list = test_seq_fungi,
    models = models_and_weights_all_epochs_fungi[0],
    weights = models_and_weights_all_epochs_fungi[1],
    names = models_and_weights_all_epochs_fungi[2],
    encodings = ["ohe", "ohe", "int", "int"], 
    maxlen_ohe = 4034, 
    maxlen_int = 4034
)

In [20]:
# Save the four-model fungi ensemble predictions as CSV
pred_ensemble_fungi.to_csv("pred_result/fungi/df_pred_ens_fungi.csv")

In [22]:
# pred_ensemble_fungi

In [31]:
# Run both ensembles on the fungi test sequences using only the CNN-LSTM
# (one-hot input) and GRU-Embedding (integer input) models
pred_ensemble_fungi_cnnlstm_gruemb = calculate_ensemble(
    input_list = test_seq_fungi,
    models = models_and_weights_all_epochs_fungi_cnnlstm_gruemb[0],
    weights = models_and_weights_all_epochs_fungi_cnnlstm_gruemb[1],
    names = models_and_weights_all_epochs_fungi_cnnlstm_gruemb[2],
    encodings = ["ohe", "int"], 
    maxlen_ohe = 4034, 
    maxlen_int = 4034
)

df names: ['sequence', 'cnn_lstm', 'gru_emb', 'ensemble_weighted', 'ensemble_voting']


In [33]:
# Save the two-model fungi ensemble predictions as CSV
pred_ensemble_fungi_cnnlstm_gruemb.to_csv("pred_result/fungi/df_pred_ens_fungi_cnnlstm_gruemb.csv")