In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [2]:
colab = False  #### Set colab flag ####

SEED = 3             # global NumPy seed; makes the hold-out split below reproducible
TEST_FRACTION = 0.1  # fraction of all rows held out as the test set

# Only the data location differs between Colab and a local run; seeding and
# splitting are shared so the two environments cannot drift apart.
if colab:
    data_path = 'https://www.dropbox.com/s/hv4uau8q4wwg00k/final_data.csv?dl=1'
else:
    data_path = '~/Downloads/final_data.csv'

# Seed right before sampling: `DataFrame.sample` without `random_state`
# draws from NumPy's global random state.
np.random.seed(SEED)
data_org = pd.read_csv(data_path)
test_data = data_org.sample(frac=TEST_FRACTION)
train_data = data_org.drop(test_data.index)
# TODO: note: we also have to preprocess the test set similarly

# Sanity check: the two splits must partition the original frame.
assert len(train_data) + len(test_data) == len(data_org)

data_org

Unnamed: 0,ERM,KDEL,LMA,MITO,NES,NIK,NLS,NUCP,OMM,gene_id,gene_biotype,seq,struct,m6A_5UTR,m6A_CDS,m6A_3UTR
0,57.045409,35.456782,22.008215,12.355106,22.789983,24.241731,16.970436,29.348389,54.916891,ENSG00000000003,protein_coding,ACCTTGTATTAGGTATTTATTTCCACAAAAGTTTGATGCTTACAAC...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0,1
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.914814,0.244517,ENSG00000000005,protein_coding,TGTGCACAGAAGTTATATACATATATGGGTATATCTATGTAACAAA...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0,0
2,17.449430,34.151539,29.338431,22.237585,24.682767,43.612551,38.683963,35.678476,25.348560,ENSG00000000419,protein_coding,TACTTTATGCAAAAAAAAATATACATTTATTTATAGGTCTCAATAC...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0,0
3,3.830180,2.576734,5.737850,0.761343,2.786808,2.784356,3.382682,2.463676,2.819269,ENSG00000000457,protein_coding,TGACTTTCAAACCATTTTAATATTTCAAATATTCCAGAACAATCCC...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,9,5
4,13.132915,8.782925,10.061390,3.012459,8.821250,6.721117,10.827253,8.005113,6.849962,ENSG00000000460,protein_coding,AACCCGCTCGGGTCCCCTTCCACACTGTGGAAGCTTTGTTCTTTCG...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13805,0.000000,0.000000,0.506809,0.075893,0.050764,0.000000,0.056586,0.000000,0.000000,ENSG00000281883,protein_coding,GGGAAGAAAGGAGCCTGACTCTTATGATGGAATAACCACAAATCAG...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,2,1
13806,0.105452,0.087130,0.171187,0.016101,0.079057,0.669947,0.171672,0.000000,0.254546,ENSG00000282034,protein_coding,GTGTCGGACGGCATGACAGGCAGCAATCCTGTGTCCCCTGCCTCAT...,"[0.37599998712539673, 0.0, 0.07500000298023224...",0,66,4
13807,0.037093,0.444844,0.425368,0.254467,0.323794,1.803249,1.435483,0.249590,0.483645,ENSG00000282827,protein_coding,ATGGCGGGGACCTCCGCGCCAGGCAGCAAGAGGCGGAGCGAGCCCC...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0,0
13808,0.000000,0.519421,2.119115,0.360450,0.887939,0.345021,1.274465,0.741954,0.460649,ENSG00000282936,protein_coding,CTACTGGAACGCCCCCCTCAATCTAGCCTCCCCCACATAACTCTCT...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0,2


### **2** |  Initializations

As baseline models, we use the [RNATracker](https://github.com/HarveyYan/RNATracker/blob/master/Models/cnn_bilstm_attention.py) model and a CNN model.


In [26]:
# Longest sequence in the training split; every input is padded to this length.
# NOTE(review): computed on train_data only -- confirm how the data loader
# treats longer sequences in the validation/test data.
max_seq_len = int(train_data['seq'].str.len().max())

# DataLoader
param_dataLoader_train = {
    'padding_length': max_seq_len,
    "batch_size": 32,
    "shuffle": True
    }

param_dataLoader_valid = {
    'padding_length': max_seq_len,
    "batch_size": 32,
    "shuffle": True
    }

# KFold
param_KFold = {
    "n_splits": 5,
    "shuffle": True
    }


def _cepr_branch(kernel_size, filters=64, n_classes=9):
    """Build the parameter dict for one 'cepr' branch.

    The pooling window is derived from ``max_seq_len`` instead of being
    hard-coded: the convolution output has ``max_seq_len - kernel_size + 1``
    positions (this matches the printed model summary, i.e. it assumes an
    unpadded, stride-1 convolution), and pooling over all of them leaves one
    vector of ``n_classes`` values for the final reshape.

    Args:
        kernel_size: Width of the 1D convolution kernel.
        filters: Number of convolution filters.
        n_classes: Number of output units / target classes.

    Returns:
        dict with the 'architecture', 'conv', 'dense', 'pooling' and
        'reshape' entries for one branch.
    """
    conv_output_len = max_seq_len - kernel_size + 1
    return {'architecture': 'cepr',
            'conv': [{'filters': filters, 'kernel_size': kernel_size, 'activation': 'relu', 'input_shape': (max_seq_len, 4)}],
            'dense': [{'units': n_classes, 'activation': 'softmax'}],
            'pooling': [{'pool_size': conv_output_len, 'strides': 1}],
            'reshape': [{'target_shape': (n_classes,)}]}


# Model Parameter: two branches that differ only in kernel width.
param_branches = [_cepr_branch(kernel_size=9), _cepr_branch(kernel_size=7)]

training_consensus = {'metrics': ['accuracy']}

param_consensus = {}

In [27]:
# Display the maximum training sequence length (used as the loaders' padding length).
max_seq_len

34526

In [28]:
# Split the training data into cross-validation folds (settings in param_KFold).

kf = KFold(**param_KFold)
# `kf.split` returns a one-shot generator. Materialize it so the training cell
# can be re-run without silently iterating over zero folds.
folds = list(kf.split(train_data))

In [29]:
# Cross-validation training loop. In general only the model initialization
# needs to change when trying a different architecture.
from models import MultiBranch

# Per-fold validation metrics, reset on every run of this cell.
VALIDATION_ACCURACY = []
VALIDATION_LOSS = []

for i, (train_split, valid_split) in enumerate(folds):
    # Fresh model per fold; the branch count follows the parameter list so the
    # two cannot get out of sync.
    model = MultiBranch(param_branches=param_branches, number_branches=len(param_branches), param_consensus=param_consensus, training_consensus=training_consensus)

    model.branched_models[0].summary()
    #model.branched_models[1].summary()
    #model.final_merge_model.summary()

    model.fit(train_data=train_data.iloc[train_split], params_loader=param_dataLoader_train)
    results = model.evaluate(eval_data=train_data.iloc[valid_split], params_loader=param_dataLoader_valid)
    # A model with no extra metrics can report a single bare loss value, which
    # zip() cannot iterate; wrap it so it pairs with metrics_names.
    if np.isscalar(results):
        results = [results]
    results = dict(zip(model.final_merge_model.metrics_names, results))

    VALIDATION_ACCURACY.append(results['accuracy'])
    VALIDATION_LOSS.append(results['loss'])

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_12 (Conv1D)          (None, 34518, 64)         2368      
                                                                 
 dense_14 (Dense)            (None, 34518, 9)          585       
                                                                 
 max_pooling1d_12 (MaxPoolin  (None, 1, 9)             0         
 g1D)                                                            
                                                                 
 reshape_12 (Reshape)        (None, 9)                 0         
                                                                 
Total params: 2,953
Trainable params: 2,953
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_21"
_________________________________________________________________
 Layer (type)               

In [8]:
# Inspect the metric names of the merged model; the evaluation cells zip these with the evaluate() output.
model.final_merge_model.metrics_names

['loss']

In [9]:
# Inspect the current value of `results` left behind by the cross-validation loop.
# NOTE(review): this cell's execution count (In [9]) predates the training run (In [29]), so the shown output may be stale.
results

9.318687438964844

In [32]:
# Evaluate on the held-out test split.
# NOTE(review): `model` is whatever the last cross-validation fold left behind;
# it is not retrained on the full training set here -- confirm this is intended.
test_result = model.evaluate(eval_data=test_data, params_loader=param_dataLoader_valid)
# A bare scalar loss cannot be zipped with metrics_names; wrap it first.
if np.isscalar(test_result):
    test_result = [test_result]
result = dict(zip(model.final_merge_model.metrics_names, test_result))
TEST_ACCURACY = result['accuracy']
TEST_LOSS = result['loss']



In [30]:
# Per-fold validation accuracy collected by the cross-validation loop.
VALIDATION_ACCURACY

[0.08970233052968979,
 0.01045856811106205,
 0.12872083485126495,
 0.10539018362760544,
 0.10221327841281891]

In [31]:
# Per-fold validation loss collected by the cross-validation loop.
VALIDATION_LOSS

[7.521951198577881,
 8.533324241638184,
 8.469651222229004,
 11.200939178466797,
 7.81272029876709]