# Regular Genomics Project **RNA Localisation**


---

**Problem definition**:

TODO


### **1.** Data Preparation

Firstly, we import several necessary packages and load in our data:

In [1]:
import pandas as pd
import numpy as np
# training, only need for the model initialization to change in general
from models import CNN
from notes.utils import read_model_file
from models.utils import plot_line_graph, box_plot, save_plot, tf_pearson, roc_curve_plot
from sklearn.model_selection import train_test_split
import datetime
import pydot

### **2** |  Initializations

We use as baseline model the [RNATracker](https://github.com/HarveyYan/RNATracker/blob/master/Models/cnn_bilstm_attention.py) model and a CNN model.


In [4]:
models = ["CNN_architecture1_Iusti.yaml"]
plot_output_paths = ["CNN_architecture1"]
model_architecture_visualizations = ["CNN_architecture1"]
model_outputs = ["CNN_architecture1"]
datasets = ["APEX_SEQ_UPDATED"]

for dataset in datasets:
    np.random.seed(3)
    data_org = pd.read_csv(f'datasets/{dataset}.csv')
    test_data = data_org.sample(frac=0.1)
    train_data = data_org.drop(test_data.index) # TODO: note: we also have to preprocess the test set similary

    # 80/20 split
    train_split, valid_split = train_test_split(train_data, random_state=42, test_size=0.2)

    for i in range(0, len(models)):

        max_seq_len = train_data['seq'].apply(lambda x: len(x)).max()
        # MODEL
        model_path = f"model_architectures/{models[i]}"

        # Path where to save viz
        model_architecture_path = f"model_architecture_viz/{model_architecture_visualizations[i]}.png"

        params_dict = read_model_file(model_path, max_seq_len)
        param_dataLoader_valid = params_dict['param_dataLoader_valid']
        param_dataLoader_train = params_dict['param_dataLoader_train']
        params_model = params_dict['params_model']
        params_train = params_dict['params_train']

        model = CNN(
                    input_size=(max_seq_len, 4),
                    params_model=params_model, metrics=['accuracy', tf_pearson])


        history = model.fit_and_evaluate(train_data=train_split, eval_data=valid_split,
                                         params_train_dataLoader=param_dataLoader_train,
                                         params_eval_dataLoader=param_dataLoader_valid,
                                         params_train=params_train)

        model.print_model(model_architecture_path)

        time_date = datetime.datetime.now().date()

        model_output = f"model_outputs/{model_outputs[i]}_{time_date}.h5"

        model.save_model(model_output)

        # print(history.history)

        plt_data = [history.history['loss'], history.history['val_loss']]
        # plot_line_graph(plt_data, "Loss Graph", 'loss', 'epoch', ['train', 'val'])

        # save_plot(f"plots/{dataset}_{plot_output_paths[i]}_loss.png")

        plt_data = [history.history['accuracy'], history.history['val_accuracy']]
        # plot_line_graph(plt_data, "Accuracy Graph", 'accuracy', 'epoch', ['train', 'val'])

        # save_plot(f"plots/{dataset}_{plot_output_paths[i]}_acc.png")

        plt_data = [history.history['tf_pearson'], history.history['val_tf_pearson']]
        # plot_line_graph(plt_data, "Correlation Graph", 'tf_pearson', 'epoch', ['train', 'val'])

        pred_data = model.predict(test_data, params_dataLoader=param_dataLoader_valid)

        roc_curve_plot(test_data[0:9], pred_data)





TypeError: '<' not supported between instances of 'str' and 'float'

In [14]:
print(pred_data)

roc_curve_plot(test_data.iloc[:, 0:9], pred_data)

[[0.08848426 0.12777641 0.1382956  ... 0.13713866 0.13289338 0.10892894]
 [0.16015416 0.08776006 0.1398356  ... 0.11273863 0.08632646 0.12813918]
 [0.08707615 0.12903528 0.13889042 ... 0.13799036 0.13374321 0.10794855]
 ...
 [0.1251582  0.1035712  0.1337838  ... 0.12058223 0.10870849 0.12170468]
 [0.08671411 0.12909147 0.14014049 ... 0.13804066 0.13294788 0.10755061]
 [0.09496228 0.12163914 0.13768387 ... 0.13371521 0.12710238 0.1118726 ]]


ValueError: continuous-multioutput format is not supported

In [None]:
test_result = model.evaluate(test_data, **param_dataLoader_valid)
result = dict(zip(model.model.metrics_names, test_result))
TEST_ACCURACY = result['accuracy']
TEST_LOSS = result['loss']