# Regular Genomics Project **RNA Localisation**


---

**Problem definition**:

TODO


In [None]:
# Inline installation instructions: 
# ! ONLY EXECUTE WHEN IN COLAB !
#############################

!pip install pandas
!pip install torch
!pip install numpy
!pip install keras
!pip install tensorflow
!pip install matplotlib
!pip install scikit-learn

##############################

### **1.** First steps with data

Firstly, we import several necessary packages and load in our data:

In [4]:
import pandas as pd
import numpy as np
#from keras.utils import to_categorical
from keras.utils import to_categorical
import itertools
from collections import OrderedDict
import os
import sys
#from keras.utils import pad_sequences
from keras.utils import pad_sequences
import tensorflow
from sklearn.model_selection import KFold, StratifiedKFold
from dataloaders.GeneWrapper import Gene_Wrapper

In [14]:
# Defining a few parameters
max_len = 4000
nb_filters = 32
filters_length = 10
pooling_size = 3
lstm_units = 32
lower_bound = 0
upper_bound = 4000
nb_classes = 9 # because we have 9 localisations
batch_size = 256

output_path = "/outputs" #"~/Downloads/model_outputs"

In [None]:
# Initializing test set

colab = True  #### Set colab flag ####

if colab:
    np.random.seed(3)
    url = 'https://www.dropbox.com/s/hv4uau8q4wwg00k/final_data.csv?dl=1'
    data_org = pd.read_csv(url)
    test_data = data_org.sample(frac=0.1)
    train_data = data_org.drop(test_data.index)
else:
    np.random.seed(3)
    data_org = pd.read_csv('~/Downloads/final_data.csv')
    test_data = data_org.sample(frac=0.1)
    train_data = data_org.drop(test_data.index) # TODO: note: we also have to preprocess the test set similary
    # TODO: colab

data_org

In [None]:

sum_vec = train_data.iloc[:, :9].sum(axis=1)
data2 = train_data.iloc[:, :9].divide(sum_vec, axis='index')
train_data_no_struct = pd.concat([data2, train_data['seq']], axis=1)
train_data_no_struct



In [9]:

# One hot encode the 'seq' attribute of the above table
mapping = {
    'A': 0,
    'C': 1,
    'G': 2,
    'T': 3
}

mapping_localisations = {
    'ERM':  0,
    'KDEL': 1,
    'LMA':  2,
    'MITO': 3,
    'NES':  4,
    'NIK':  5,
    'NLS':  6,
    'NUCP': 7,
    'OMM':  8
}

one_hot_encode_lam = lambda seq: to_categorical([mapping[x] for x in seq])
data_one = train_data_no_struct['seq'].apply(one_hot_encode_lam)

data_one

# Now just injecting this modified 'seq' back into the pandas frame
data_one_no_struct =  pd.concat([train_data_no_struct.iloc[:, :9], data_one], axis=1)

data_one_no_struct


# Additional ordinal encoding of the 'seq' attribute

gene_data = train_data['seq']

def label_dist(dist):
    # TODO: what is this
    assert (len(dist) == 4)
    return np.array(dist) / np.sum(dist)

encoding_seq = OrderedDict([
    ('UNK', [0, 0, 0, 0]),
    ('A', [1, 0, 0, 0]),
    ('C', [0, 1, 0, 0]),
    ('G', [0, 0, 1, 0]),
    ('T', [0, 0, 0, 1]),
    ('N', [0.25, 0.25, 0.25, 0.25]),  # A or C or G or T
])

encoding_keys = list(encoding_seq.keys())
seq_encoding_vectors = np.array(list(encoding_seq.values()))
encoding_vectors = seq_encoding_vectors


In [31]:
#print(mapping_localisations.keys())
#print(list(mapping_localisations.keys()))
#print(data_one_no_struct[mapping_localisations.keys()])
#print(data_one_no_struct[mapping_localisations.keys()].values)

dict_keys(['ERM', 'KDEL', 'LMA', 'MITO', 'NES', 'NIK', 'NLS', 'NUCP', 'OMM'])
['ERM', 'KDEL', 'LMA', 'MITO', 'NES', 'NIK', 'NLS', 'NUCP', 'OMM']
            ERM      KDEL       LMA      MITO       NES       NIK       NLS  \
0      0.207338  0.128871  0.079991  0.044906  0.082833  0.088109  0.061681   
1      0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2      0.064346  0.125935  0.108187  0.082002  0.091019  0.160823  0.142649   
3      0.141112  0.094932  0.211394  0.028049  0.102672  0.102581  0.124625   
4      0.172315  0.115240  0.132014  0.039526  0.115743  0.088187  0.142063   
...         ...       ...       ...       ...       ...       ...       ...   
13805  0.000000  0.000000  0.734451  0.109981  0.073566  0.000000  0.082002   
13806  0.067811  0.056029  0.110082  0.010354  0.050838  0.430809  0.110393   
13807  0.006797  0.081510  0.077941  0.046627  0.059330  0.330415  0.263028   
13808  0.000000  0.077421  0.315861  0.053726  0.132350  0.051426

In [10]:

X = pad_sequences([[encoding_keys.index(c) for c in gene] for gene in gene_data],
                    maxlen=max_len,
                    dtype=np.int8, value=encoding_keys.index('UNK'))  # , truncating='post')

y = data_one_no_struct[mapping_localisations.keys()].values

# See notes to extend this for secondary structure

encoding_keys


['UNK', 'A', 'C', 'G', 'T', 'N']

In [11]:

# Splitting for 5fold

folds_total = 5

kf = KFold(n_splits=folds_total, shuffle=True, random_state=1234)
folds = kf.split(X, y)

# folds now contains a list of lists. Each sublist contains all the indices for the pandas data entries to be used in the respective fold


### **2** |  Importing models...

We use as baseline model the [RNATracker](https://github.com/HarveyYan/RNATracker/blob/master/Models/cnn_bilstm_attention.py) model and a CNN model.


In [12]:
# Import NN
from models.RNAtracker import RNATracker

# Set paths for model output
try:
    os.makedirs('')
except Exception as e:
    print(str(e))

model_output_folder = output_path #'~/Downloads/model_outputs'


[Errno 17] File exists: '~/Downloads/model_outputs'


In [15]:

# TODO: with understand what we have to predict, we can allocate X and y
# Also: the kwargsvalues are hyperparameters of which we will select default values from the RNAtracker repo
epochs = 10

for i, (train_indices, test_indices) in enumerate(folds):
    print('Evaluating KFolds {}/{}'.format(str(i + 1), str(folds_total)))
    model = RNATracker(max_len, nb_classes, model_output_folder, kfold_index=i)
    #model.build_model(nb_filters=kwargs['nb_filters'], filters_length=kwargs['filters_length'],
    #                          pooling_size=kwargs['pooling_size'], lstm_units=kwargs['lstm_units'],
    #                          embedding_vec=encoding_vectors)
    
    model.build_model_advanced_masking(nb_filters=nb_filters,
                                                   filters_length=filters_length,
                                                   pooling_size=pooling_size,
                                                   lstm_units=lstm_units,
                                                   embedding_vec=encoding_vectors)
    

    model.train(X[train_indices], y[train_indices], batch_size,epochs)

    score, acc = model.evaluate(X[test_indices], y[test_indices], "")
    result_string = "Fold {}/{} achieves a score of {} and accuracy {}".format(str(i + 1), str(folds_total), str(score), str(acc))
    print(result_string)

    model_output_path = os.path.join(model_output_folder, "fold"+str(i + 1)+"_model")
    model.save(model_output_path)

    #results = model.evaluate(eval_data=train_data.iloc[valid_split], **param_dataLoader_valid)
    #results = dict(zip(model.model.metrics_names, results))

    #VALIDATION_ACCURACY.append(results['accuracy'])
    #VALIDATION_LOSS.append(results['loss'])
    #K.clear_session()








Evaluating KFolds 1/5
Advanced Masking
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 4000)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 4000, 4)      24          ['input_2[0][0]']                
                                                                                                  
 conv1d_2 (Conv1D)              (None, 3991, 32)     1280        ['embedding_1[0][0]']            
                                                                                                  
 max_pooling1d_2 (MaxPooling1D)  (None, 1330, 32)    0           ['conv1d_2[0][0]']               
                                                     

KeyboardInterrupt: 