# Regular Genomics Project **RNA Localisation**


---

**Problem definition**:

TODO


### **1.** Data Preparation

First, we import the packages needed in this notebook; the data itself is loaded in the training cell further below:

In [9]:
import pandas as pd
import numpy as np
import keras
# Training: in general, only the model initialization needs to change
from models import CNN
from utils import read_model_file, extractY
from plotting import plot_line_graph, box_plot, roc_curve_plot, scatter_plot, bar_plot
from metrics import pearson
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam, Nadam, SGD
from keras.losses import KLDivergence, CategoricalCrossentropy
import datetime

### **2.** Initializations

As baseline models, we use the [RNATracker](https://github.com/HarveyYan/RNATracker/blob/master/Models/cnn_bilstm_attention.py) model and a CNN model.


In [None]:
# Architectures to train, identified by a shared base name. The base name is
# used for the YAML definition file (with a ".yaml" suffix) and, via the alias
# lists below, for the files written per architecture.
names = [f"CNN_architecture{arch_id}_Iusti_Padded" for arch_id in range(4, 9)]
models = [f"{name}.yaml" for name in names]

# All three naming lists are the very same list object as `names`.
plot_output_paths = model_architecture_visualizations = model_outputs = names

# Base names of the dataset CSV files to process.
datasets = ["APEX_SEQ"]

# Train and evaluate every configured architecture on every dataset.
#
# Per dataset: hold out 10% of the rows as a test set and split the remainder
# 80/20 into training and validation data. Per architecture: read its YAML
# configuration, build the CNN, fit it with early stopping on the validation
# loss, write the architecture picture and the trained model to disk, plot the
# loss and Pearson histories, and score the held-out test set.
#
# After the loop, `model`, `param_dataLoader_valid`, `test_data`, `pred_data`
# and `testY` refer to the last architecture trained on the last dataset; the
# cells below rely on that.

# Length handed to read_model_file and used as the model's input length
# whenever the validation data loader is configured to truncate.
truncate_len = 4000

for dataset in datasets:
    # Seed NumPy's global RNG before sampling; sample() below is given no
    # random_state of its own.
    np.random.seed(3)
    data_org = pd.read_csv(f'datasets/{dataset}.csv')
    test_data = data_org.sample(frac=0.1)
    train_data = data_org.drop(test_data.index)  # TODO: note: we also have to preprocess the test set similarly

    # 80/20 split
    train_split, valid_split = train_test_split(train_data, random_state=42, test_size=0.2)

    # Longest sequence among the non-test rows. It does not depend on the
    # architecture, so it is computed once per dataset.
    # NOTE(review): the test rows are not included here; confirm the data
    # loader copes with longer test sequences when truncation is disabled.
    max_seq_len = train_data['seq'].apply(len).max()

    for model_file, viz_name, output_name in zip(models, model_architecture_visualizations, model_outputs):
        # MODEL
        model_path = f"model_architectures/{model_file}"

        # Path where to save viz
        model_architecture_path = f"model_architecture_viz/{viz_name}.png"

        params_dict = read_model_file(model_path, max_seq_len, truncate_len)
        param_dataLoader_valid = params_dict['param_dataLoader_valid']
        param_dataLoader_train = params_dict['param_dataLoader_train']
        params_model = params_dict['params_model']
        params_train = params_dict['params_train']

        # Input length follows the loader setting; the second dimension is
        # fixed to 4 (presumably a one-hot nucleotide encoding; confirm).
        input_length = truncate_len if param_dataLoader_valid["truncate"] else max_seq_len
        input_size = (input_length, 4)

        model = CNN(
            input_size=input_size,
            optimizer=Nadam(),
            loss=KLDivergence(name="kullback_leibler_divergence"),
            params_model=params_model,
            metrics=[pearson],
        )

        # NOTE(review): EarlyStopping uses its defaults except for `monitor`;
        # check that the default patience is the intended stopping rule.
        history = model.fit_and_evaluate(
            train_data=train_split,
            eval_data=valid_split,
            callback=keras.callbacks.EarlyStopping(monitor="val_loss"),
            params_train_dataLoader=param_dataLoader_train,
            params_eval_dataLoader=param_dataLoader_valid,
            params_train=params_train,
        )

        model.print_model(model_architecture_path)

        # The saved file is stamped with today's date.
        # NOTE(review): the file name does not contain the dataset name, so
        # runs over several datasets on one day would share a path.
        time_date = datetime.datetime.now().date()
        model_output = f"model_outputs/{output_name}_{time_date}.h5"
        model.save_model(model_output)

        # Training curves. The figures are only displayed here; saving them
        # (the purpose of `plot_output_paths`) is currently not done.
        plt_data = [history.history['loss'], history.history['val_loss']]
        plot_line_graph(plt_data, "Loss Graph", 'loss', 'epoch', ['train', 'val'])

        plt_data = [history.history['pearson'], history.history['val_pearson']]
        plot_line_graph(plt_data, "Correlation Graph", 'tf_pearson', 'epoch', ['train', 'val'])

        # Score the held-out test set with the validation loader settings.
        pred_data = model.predict(test_data, params_dataLoader=param_dataLoader_valid)
        testY = extractY(test_data)

        bar_plot(testY, pred_data, "pearson")



In [None]:
# Recompute the test-set predictions and targets outside the training loop.
# `model`, `test_data` and `param_dataLoader_valid` are whatever the training
# loop in the cell above left behind, i.e. the last architecture trained on the
# last dataset, so this cell only works after that cell has been run.
pred_data = model.predict(test_data, params_dataLoader=param_dataLoader_valid)
testY = extractY(test_data)

In [None]:
# Pass the test targets and the predictions of the last trained model to the ROC-curve plotting helper.
roc_curve_plot(testY, pred_data)

In [None]:
# Pass the test targets and the predictions of the last trained model to the scatter-plot helper.
scatter_plot(testY, pred_data)

In [None]:
# Same call as the one made at the end of every training-loop iteration, repeated here for the last trained model only.
bar_plot(testY, pred_data, "pearson")