In [2]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from src import deepTarget_utils as bio_utils
from src import my_deepTarget_utils as utils

## load and format train data

In [76]:
poss_site_level = "../data/13059_2014_500_MOESM1_ESM=site-level human miRNA-target site pairs from miRecords=.csv"
poss_gene_level = "../data/13059_2014_500_MOESM2_ESM=UTR-level human miRNA-gene pairs from miRecords and miRTarBase=.csv"
neg_site_level = "../data/13059_2014_500_MOESM4_ESM=site-level human mock miRNA-target site negative pairs=.csv"
neg_gene_level = "../data/13059_2014_500_MOESM5_ESM=UTR-level human mock miRNA-gene pairs=.csv"

In [77]:
mirna_fasta_file = "../data/miRNA.fasta"
mrna_fasta_file = "../data/3UTR.fasta"
mirna_ids, mirna_seqs, mrna_ids, mrna_seqs = bio_utils.read_fasta(mirna_fasta_file, mrna_fasta_file)

In [78]:
poss_data = pd.read_csv(poss_gene_level)
query_ids, target_ids, label = list(poss_data["miR_ID"]), list(poss_data["mRNA_ID"]), np.ones(len(poss_data))

In [79]:
X_query_ids, X_query_seqs, X_target_ids, X_target_seqs, Y = utils.make_train_pair(mirna_ids,
                                                                                  mirna_seqs,
                                                                                  mrna_ids,
                                                                                  mrna_seqs,
                                                                                  query_ids,
                                                                                  target_ids,
                                                                                  label)

In [80]:
poss_train_data = pd.DataFrame(np.array([X_query_ids, X_query_seqs,
                                         X_target_ids, X_target_seqs,
                                         list(Y.flatten())]).T, 
                                         columns=["mi_name","mi_seq","m_name","target_seq", "label"])

In [81]:
neg_train_pd = pd.read_csv(neg_gene_level, header=None)
neg_train_pd.columns = ["mi_name", "m_name", "none", "target_seq"]
neg_train_pd.drop(columns=["none"], inplace=True)
neg_train_pd["mi_seq"] = neg_train_pd.apply(lambda x: mirna_seqs[mirna_ids.index(x.mi_name)], axis=1)
neg_train_pd["label"] = 0 

In [82]:
# all train data
train_data = neg_train_pd.append(poss_train_data)
# shuffle data
train_data = train_data.sample(frac=1).reset_index(drop=True)
train_data["target_seq"] = train_data["target_seq"].apply( lambda x: x.upper())

In [83]:
X_mirna, X_mrna = bio_utils.formatting_data(list(train_data["mi_seq"]), list(train_data["target_seq"]))
labels = np.array(list(train_data["label"]), dtype=int).reshape(len(train_data), 1)

In [84]:
#labels
Y = []
for l in labels:
    if l == 1: Y.append([0, 1])
    else: Y.append([1, 0])

## embeding training

In [38]:
from keras.utils import np_utils
from keras.layers import containers
from keras.models import Sequential
import json
from keras.layers.core import Dense, Dropout, Merge
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.models import model_from_json, model_from_config

Using Theano backend.


In [39]:
# it will construct and compile from json config file
embd_model = model_from_json(open("./src/deepTarget_embd_model.json").read())

In [85]:
emd_train_data = np.concatenate((X_mirna, X_mrna))

In [None]:
# if you want to load their weights
# embd_model.load_weights("../data/emb_we.hdf5")

In [86]:
emd_train = []
for datum in emd_train_data:
    emd_train.append(bio_utils.one_hot(emd_train_data[0]))

In [42]:
embd_model.fit(emd_train_data, emd_train, nb_epoch=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fcebb9305c0>

In [87]:
X_mirna_embd = embd_model.predict(X_mirna)
X_mrna_embd = embd_model.predict(X_mrna)

## model training

In [48]:
model = model_from_json(open("../src/deepTarget_model.json").read())

In [49]:
# if you want to load their weights
# model.load_weights("../data/model_ew.hdf5")

In [54]:
model.fit(X=[X_mirna_embd, X_mrna_embd], y=Y, nb_epoch=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fce914b9c50>

In [88]:
# lets see how well model can predict on train data
from sklearn.metrics import accuracy_score, precision_score, accuracy_score, f1_score
predictions = model.predict([X_mirna_embd, X_mrna_embd], verbose=0)
Y_predict = np.uint8(np.argmax(predictions, axis=1))
l_true = np.uint8(np.argmax(Y, axis=1))
# perfect
accuracy_score(l_true, Y_predict), precision_score(l_true, Y_predict), f1_score(l_true, Y_predict)

(0.98875694795351188, 1.0, 0.99051273851401767)

## prediction on test

In [56]:
# I cant find their prediction labels, TODO
# ==========================================
# it will take time, will try if checking this model gets mote importtant

In [92]:
mirna_fasta_file = '../data/miRNA.fasta'
mrna_fasta_file = '../data/3UTR.fasta'
query_pair_file = '../data/miRNA-mRNA_query.txt'
mirna_ids, mirna_seqs, mrna_ids, mrna_seqs = bio_utils.read_fasta(mirna_fasta_file, mrna_fasta_file)
query_ids, target_ids = bio_utils.read_query_pair(query_pair_file)

X_query_ids, X_query_seqs, X_target_ids, X_target_seqs = bio_utils.make_pair_from_file(mirna_ids, mirna_seqs, mrna_ids, mrna_seqs, query_ids, target_ids)

X_mirna, X_mrna = bio_utils.formatting_data(X_query_seqs, X_target_seqs)
X_mirna_embd = embd_model.predict(X_mirna)
X_mrna_embd = embd_model.predict(X_mrna)

predictions = model.predict([X_mirna_embd, X_mrna_embd], verbose=0)
Y_predict = np.uint8(np.argmax(predictions, axis=1))

In [93]:
sum(Y_predict), len(Y_predict), sum(their_predictions)

(5863, 5960, 4754)

In [94]:
their_predictions = np.array(pd.read_csv("../data/their_predictions.txt", sep="\t", header=None)[2])

In [95]:
# my model mostly gives 1's, I think I can fix it if I use all training data, TODO
accuracy_score(their_predictions, Y_predict), f1_score(their_predictions, Y_predict)

(0.78406040268456378, 0.87877931619101446)