# Ensemble model for dataset with non-effector randomly chosen from NCBI

In [18]:
#import libraries
import numpy as np
import pandas as pd
import glob
import math 

In [19]:
from tensorflow.keras.models import load_model

In [20]:
import gc

## Model and accuracy loading functions

In [21]:
def load_all_models(files_dict, models_dir = "models/weights/"):
    all_models = list()
    for model, epoch in files_dict.items():
        # Generate file path patterns
        pattern = models_dir + model + "_" + str(int(epoch)).zfill(2) + "*" + '.hdf5'
        print('Loading %s model...' % model)
        file_paths = glob.glob(pattern)
        # Add to list of members
        if len(file_paths) == 0:
            print('✘ No models found for specified epoch')
            all_models.append(None)
        else:
            # Load files in loop
            for file_name in file_paths:
                # Load model from file
                model = load_model(file_name)
                all_models.append(model)
                print('✔ Loaded %s' % file_name)
    return all_models

In [22]:
def load_all_accuracies(files_dict, accs_dir = "models/history_results/"):
    all_accs = list()
    for model, epoch in files_dict.items():
        # Generate file path patterns
        pattern = accs_dir + "df_results_val_" + model + "*" + '.csv'
        print('Loading %s model accuracy...' % model)
        file_path = glob.glob(pattern)[0]
        # Load accuracies
        acc_df = pd.read_csv(file_path)
        # Add to list of members
        try:
            acc = acc_df.loc[int(epoch) - 1, "acc"]
            all_accs.append(acc)
            print('✔ Loaded %s' % file_path)
        except KeyError:
            print('✘ No accuracy available for specified epoch')
            all_accs.append(None)
    return all_accs

## Encoding functions

In this function below, the data will be encoded. There are two different options of encoding depends on the model itself.
- CNN-LSTM and CNN-GRU: One hot encoding
- LSTM-Embedding and GRU-Embedding: Integer Encoding

In [23]:
def get_onehot_encoding(mydata, max_length = 4034):
    def get_key(mydict, element):
        key = list(mydict.keys())[list(mydict.values()).index(element)]
        return(key)
    
    amino = [
        'R', 'K', 'D', 'E', 'Q', 'N', 'H', 'S', 'T', 'Y', 
        'C', 'W', 'A', 'I', 'L', 'M', 'F', 'V', 'P', 'G'
    ]
    token_index = dict(zip(range(1, (len(amino) + 1)), amino))
    
    results = np.zeros((len(mydata), max_length, max(token_index.keys())))
    for i, sample in enumerate(mydata):
        for j, character in enumerate(sample):
            if character in token_index.values():
                index = get_key(token_index, character) - 1
                results[i, j, index] = 1.
            else:
                results[i, j, :] = results[i, j, :]
    return results

In [24]:
def get_integer_encoding(input, max_length = 4034):
    def get_value(mydict, element):
        key = mydict.get(element)
        return(key)
    
    dic = {
        'A': 1, 'B': 22, 'U': 23, 'J': 24, 'Z': 25, 'O': 26, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 
        'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 
        'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'X': 21
    }

    results = np.zeros((len(input), max_length))
    for i, sample in enumerate(input):
        for j, character in enumerate(sample):
            results[i, j] = get_value(dic, character)
    return results

In [25]:
def get_encoding(input, method, max_length = 4034):
    if method == "ohe":
        output = get_onehot_encoding(input, max_length)
    elif method == "int":
        output = get_integer_encoding(input, max_length)
    return output

## Ensemble functions

### Function to load the models and the weights needed

In [26]:
def load_models_and_weights(files_dict, models_dir = "models/weights/", accs_dir = "models/history_results/"):
    models = load_all_models(files_dict, models_dir)
    print("")
    weights = load_all_accuracies(files_dict, accs_dir)
    names = [name for name, epoch in files_dict.items()]
    gc.collect()
    
    return (models, weights, names)

### Functions for getting each ensemble method

A function below is a function that that is used to get the ensemble results.

#### Voting classifier

In [27]:
# Define function to get the voting for functions 
# Source https://stackoverflow.com/questions/12297016/how-to-find-most-frequent-values-in-numpy-ndarray
def ens_voting_classifier(ens_input):
    axis = 0
    u, indices = np.unique(ens_input, return_inverse=True)
    
    ens_res = u[np.argmax(np.apply_along_axis(np.bincount, axis, indices.reshape(ens_input.shape),None, np.max(indices) + 1), axis = 0)]
    
    return ens_res

#### Weighted average

In [28]:
# Define the function to make the shifted sigmoid for map x in [0,1] to f(x) in [0,1]
# Source of logistic function: https://en.wikipedia.org/wiki/Logistic_function
def sigmoid(x, x_0, k):
     return 1 / (1 + math.e ** (-k * (x - x_0)))

# Define a function to the weighted average
def ens_weighted_average(input_pred, weights):
    ens_res = np.average(input_pred, axis = 0, weights = weights)
    ens_res = sigmoid(ens_res, x_0 = 0.5, k = 10)
    ens_res = np.where(ens_res > 0.5, 1, 0)
    
    return ens_res

### Functions to calculate Ensemble

In [29]:
def calculate_ensemble(input_list, models, weights, names, encodings, models_dir = "models/weights/", accs_dir = "models/history_results/"):
    num_inputs = len(input_list)
    num_models = len(models)
    
    # Calculate predictions
    predictions = list()
    for model in range(0, num_models):
        predictions.append(models[model].predict(get_encoding(input_list, encodings[model])))
    
    # all predictions
    predictions = np.asarray(predictions)
    predictions_binary = np.where(predictions < 0.5, 0, 1)
    
    # Define the weighted average ensemble prediction
    ens_res_weighted_avg = ens_weighted_average(predictions, weights = weights)
    
    # Define the voting classifier ensemble predicition
    ens_res_voting_classifier = ens_voting_classifier(predictions_binary)
    
    # Construct data frame
    seq_col = np.transpose([input_list])
    pred_cols = np.concatenate((predictions_binary, [ens_res_weighted_avg], [ens_res_voting_classifier]))
    pred_cols = np.transpose(np.reshape(pred_cols, (num_models + 2, num_inputs)))
    df = pd.DataFrame.from_records(np.concatenate((seq_col, pred_cols), axis = 1))
    
    # Rename data frame columns
    df_names = list.copy(names)
    df_names.insert(0, "sequence")
    df_names.append("ensemble_weighted")
    df_names.append("ensemble_voting")
    
    print("df names:", df_names)
    df.columns = df_names
    
    return df

## Run functions

In [30]:
# Truncate long sequences in Pandas data frames
pd.options.display.max_colwidth = 20

In [31]:
models_and_weights = load_models_and_weights(
    files_dict = {
        "cnn_lstm": "10", 
        "cnn_gru": "11",
        "gru_emb": "9",
        "lstm_emb": "8"
    }
)

Loading cnn_lstm model...
✔ Loaded models/weights/cnn_lstm_10-0.93.hdf5
Loading cnn_gru model...
✔ Loaded models/weights/cnn_gru_11-0.64.hdf5
Loading gru_emb model...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


✔ Loaded models/weights/gru_emb_09-0.64.hdf5
Loading lstm_emb model...
✔ Loaded models/weights/lstm_emb_08-0.53.hdf5

Loading cnn_lstm model accuracy...
✔ Loaded models/history_results/df_results_val_cnn_lstm_saved_model.csv
Loading cnn_gru model accuracy...
✔ Loaded models/history_results/df_results_val_cnn_gru_saved_model1.csv
Loading gru_emb model accuracy...
✔ Loaded models/history_results/df_results_val_gru_emb_saved_model1.csv
Loading lstm_emb model accuracy...
✔ Loaded models/history_results/df_results_val_lstm_emb_saved_model1.csv


In [32]:
models_and_weights

([<tensorflow.python.keras.engine.training.Model at 0xb3142fa90>,
  <tensorflow.python.keras.engine.sequential.Sequential at 0xb34134ba8>,
  <tensorflow.python.keras.engine.sequential.Sequential at 0xb34d8ce10>,
  <tensorflow.python.keras.engine.sequential.Sequential at 0xb35eccba8>],
 [0.7712418285070681,
  0.6928104586850584,
  0.6862745074664846,
  0.7189542483660131],
 ['cnn_lstm', 'cnn_gru', 'gru_emb', 'lstm_emb'])

In [33]:
models_and_weights_all_epochs = load_models_and_weights(
    files_dict = {
        "cnn_lstm": "30", 
        "cnn_gru": "30",
        "gru_emb": "30",
        "lstm_emb": "28"
    }
)

Loading cnn_lstm model...
✔ Loaded models/weights/cnn_lstm_30-0.41.hdf5
Loading cnn_gru model...
✔ Loaded models/weights/cnn_gru_30-0.39.hdf5
Loading gru_emb model...
✔ Loaded models/weights/gru_emb_30-0.50.hdf5
Loading lstm_emb model...
✔ Loaded models/weights/lstm_emb_28-0.46.hdf5

Loading cnn_lstm model accuracy...
✔ Loaded models/history_results/df_results_val_cnn_lstm_saved_model.csv
Loading cnn_gru model accuracy...
✔ Loaded models/history_results/df_results_val_cnn_gru_saved_model1.csv
Loading gru_emb model accuracy...
✔ Loaded models/history_results/df_results_val_gru_emb_saved_model1.csv
Loading lstm_emb model accuracy...
✔ Loaded models/history_results/df_results_val_lstm_emb_saved_model1.csv


In [35]:
models_and_weights_all_epochs

([<tensorflow.python.keras.engine.training.Model at 0xb3ad15be0>,
  <tensorflow.python.keras.engine.sequential.Sequential at 0xb3b307320>,
  <tensorflow.python.keras.engine.sequential.Sequential at 0xb3b2d4ba8>,
  <tensorflow.python.keras.engine.sequential.Sequential at 0xb3eb09748>],
 [0.7254901972471499,
  0.6797385636497947,
  0.7385620926719865,
  0.6993464052287581],
 ['cnn_lstm', 'cnn_gru', 'gru_emb', 'lstm_emb'])

## Testing

### Testing on effector data

In [36]:
real_test_effector = calculate_ensemble(
    input_list = ["MRLTNTLVVAVAAILLASENAFSAATDADQATVSKLAAAEFDTLVDVLTTESKRSLRATVDDGEERYKQFKIEALKKGKWTDIFNKWKGNELSPAEVQNKLKNKKLSDDLKDAIFRNYKDW", 
                  "MKLIHIISSVVIFSLSVLATNDWDCRGNTIYAYTIDREMYWCFHKYSLTNPSDYIDIPEHHIFKTEFKVPGSNPSGARFEFRFNKDYVISYFVYKYNGQIFDCNRLTTSTMASD",
                  "MLFKQCTALKFLIFILGFSIIAAQYVVDPGFGEIECMCGQIARLTQRPFDVECEATPSCSCDYRGDCPGPAAEYVYRCPTCGPRSHVGCFGVHQGTCEEVHPGIARVQYQNSDSESE",
                  "MRFEYISVLALCGASLARDHQYCACQSGSGDSIDIDATTQLQNDNSKSYLWAQTSPAYWFADRHKPGPRFAGIYLKAANGKIDGDTFYNLCINNGGADSTCFDCSKSHQVRNVIYCDAA",  
                  "MKFSNIPEITTILSLFTSLCATQGVNSGNAKYNCSGVIFYSESINASTAMASTINIGSLNGYPAPYPVYGLSGTAPYHLFPMVKDVMVYAGGVVSKFFLIIDINDIEQGMVYVREGYEGYLPCTAM"
                 ],
    models = models_and_weights[0],
    weights = models_and_weights[1],
    names = models_and_weights[2],
    encodings = ["ohe", "ohe", "int", "int"]
)

df names: ['sequence', 'cnn_lstm', 'cnn_gru', 'gru_emb', 'lstm_emb', 'ensemble_weighted', 'ensemble_voting']


In [22]:
real_test_effector

Unnamed: 0,sequence,cnn_lstm,cnn_gru,gru_emb,lstm_emb,ensemble_weighted,ensemble_voting
0,MRLTNTLVVAVAAILL...,1,1,1,1,1,1
1,MKLIHIISSVVIFSLS...,1,1,1,1,1,1
2,MLFKQCTALKFLIFIL...,0,1,1,0,0,0
3,MRFEYISVLALCGASL...,1,1,1,0,1,1
4,MKFSNIPEITTILSLF...,1,1,1,1,1,1


### Testing on non-effector data

In [24]:
real_test_non_effector = calculate_ensemble(
    input_list = ["MTDAIHKINLRLPMHVVEEAKAQAALLGVSLNAYILFAVSEQVKRTRKELSGPVTPPKPKPRPASAAVSEWDAPVVTPVKRPQAKVGRNEDCPCGSGRKAKHCHPEWT", 
                  "MTGTAIYLPGNPFSLFCCCLARHQNNFLMVFSTAHSLCGQIELTAPAFIRYQYCHFEFIRFEPMIRFNRLFGLLLNAFLMRGRLVGGINH",
                  "MDPMAGLTVYNLPPAPPRWDHIGIFYVTFGLTWTTIVFSLMAFGWLNRTNPVLRHRGVGLSFGAIFFLHCYWFLAQVVYPIGGTMPVILAYSIQYFFMGIWFPLGVALFHASNSRFLHVAKLQKQYMQPELRSKSGCNGADSSWICRSRNMSYTKKIMLPIGFGIVLQILLTTGMWLACRKYHPSWGIPGTEIRGDNLMEQMIDLSQGWEWWPSVFWQVIWTWIVAPILLYRAWGIRDTMGWRFQTVGCCLSSLHATPMFMIACYVPAFQVINPYYPPSQWIHLSIMFFEIFTIIIPAIQVVQQRRMVKKSAKLNAKWETCSQTTTLRTSTSIEGKNSNISLAEKASSFDYLDEELGNRLLTMAALDYVLNENPEPLQEFSALSDFSGENIAFLTRVTRWKSTLTHAVTEENNLICYNRAMDIYVDFISMHDAEFPLNLPSQQLKQLEEIFETSTRTVLGEAVVNPATPFDFPSPSHGSRGQGDSKDHLLTETQYTGEIPAAFSPAVFDAAQAHIKHLVLTNTWPKFVAEMQSRRKSSETERTDISGDSQMTLASQVSSFFKRLL" 
                  "MRLISPALVVSTAIQARHVNSSAPVDSAMTEANPLASAHPPDVGYDGVPAGRVRNPDDPTTEERTPGESFMEAINFEIFKLVQEAQGRILGLPEQPRGDMEWLERYGQDAILHYLETGDKDPSQLEKKYDQLLDELKKHPTLEWKDSNRFTLSFLHTWKK", 
                  "MIFKIISIIFLILLLFTDIKQIINKIKQFFIK", 
                  "MIFKIISIIFLILLLFTDIKQIINKIKQFFIKFF"
                 ],
    models = models_and_weights[0],
    weights = models_and_weights[1],
    names = models_and_weights[2],
    encodings = ["ohe", "ohe", "int", "int"]
)

df names: ['sequence', 'cnn_lstm', 'cnn_gru', 'gru_emb', 'lstm_emb', 'ensemble_weighted', 'ensemble_voting']


In [25]:
real_test_non_effector

Unnamed: 0,sequence,cnn_lstm,cnn_gru,gru_emb,lstm_emb,ensemble_weighted,ensemble_voting
0,MTDAIHKINLRLPMHV...,0,0,0,0,0,0
1,MTGTAIYLPGNPFSLF...,1,1,0,0,0,0
2,MDPMAGLTVYNLPPAP...,0,0,0,0,0,0
3,MIFKIISIIFLILLLF...,1,1,1,1,1,1
4,MIFKIISIIFLILLLF...,1,1,1,1,1,1


## Testing on the whole test data

In [37]:
# test_seq = pd.read_csv("../../scripts/r-scripts/getting-data-current/data-sets/testing_input.csv", header = None)
test_seq = pd.read_csv("../../data/getting-data-new/binary-class-data/data-sets/testing_input.csv", header = None)

In [38]:
test_seq.columns = ["sequence"]

In [39]:
test_seq = test_seq.sequence.tolist()

In [40]:
test = calculate_ensemble(
    input_list = test_seq,
    models = models_and_weights_all_epochs[0],
    weights = models_and_weights_all_epochs[1],
    names = models_and_weights_all_epochs[2],
    encodings = ["ohe", "ohe", "int", "int"]
)

df names: ['sequence', 'cnn_lstm', 'cnn_gru', 'gru_emb', 'lstm_emb', 'ensemble_weighted', 'ensemble_voting']


In [41]:
test

Unnamed: 0,sequence,cnn_lstm,cnn_gru,gru_emb,lstm_emb,ensemble_weighted,ensemble_voting
0,MFFPSLILAAGSLSTL...,1,1,1,0,1,1
1,MDKMLFSFLRLCFVLL...,1,1,0,0,1,0
2,MMAPKSLQTGLLILLL...,0,0,0,0,0,0
3,MLSTTAIALLATLILA...,0,1,0,1,0,0
4,MNLVAALVLCFALLSS...,1,1,1,1,1,1
5,MKFIAVLIAAIASLSA...,1,1,1,1,1,1
6,MRLTNTLVVAVAAILL...,1,1,1,1,1,1
7,MRLTNTLVVAVAAILL...,1,1,1,1,1,1
8,MQFMSRINRILFVAVV...,1,1,0,0,1,0
9,MPARHHTIQRKRSIGA...,1,0,0,1,1,0


In [42]:
test.to_csv("test_results/df_results_test_two_ensembles.csv")