In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import json
import plotly.express as px
import tensorflow.keras.layers as L
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.layers import LeakyReLU

In [2]:
tf.random.set_seed(2020)
np.random.seed(2020)

In [3]:
# Jeu de données initial 
train = pd.read_json("train.json", lines = True)
test = pd.read_json("test.json", lines = True)

In [4]:
train = train.query("signal_to_noise >= 1")

In [5]:
def divide_set(df, list_column):
    """
    Fonction qui crée d'une fenetre de lecture de 68 bases. 
        :Parameters:
            df = un data frame train
            list_column = numero de la colonne du data frame 
        :Return:
            deux data frame :
                first68_train = data frame contenant les 68 premieres bases et annotations de structure
                last68_train = data frame contenant les 68 dernieres bases et annotations de structure
    """
    first68 = df.copy()
    last68 = df.copy()
    for a in list_column :
        for i, r in enumerate(df.iloc[:,a]):
            first68.iloc[i,a] = r[0:68]
            last68.iloc[i,a] = r[len(r)-68:]
    return first68, last68

In [6]:
first68_train, last68_train = divide_set(train, [2,3,4])

In [7]:
def one_hot_encoding(df, name_colonne):
    """
    Fonction pour encoder les sequences, structures et type boucles : One hot encoding
        :Parameters:
            df = data frame : train et test de départ
            name_colonne = le nom de la colonne du df 
        :Return:
            un tableau
    """
    frst_lst = []
    for r in df[name_colonne]:
        for i in range(len(r)):
            if r[i] not in frst_lst:
                frst_lst.append(r[i])
    dico = {}
    ar = np.zeros(shape=(1,len(frst_lst)),dtype=int)
    for i, l in enumerate(frst_lst):
        ar2 = ar.copy()
        ar2[0][i]=1
        dico[l]=ar2
    scnd_lst = []
    for r in df[name_colonne]:
        lst = [] 
        for i in range(len(r)):
            #print(dico[r[i]], r[i])
            lst.append(dico[r[i]])
        scnd_lst.append(lst)
    return np.array(scnd_lst)

tab = one_hot_encoding(first68_train, 'sequence')
tab2 = one_hot_encoding(first68_train, 'structure')
tab3 = one_hot_encoding(first68_train, 'predicted_loop_type')

first68_train["One_hot seq"] = tab.tolist() # ajout des colonnes encoding a notre DF de départ
first68_train["One hot strucure"] = tab2.tolist()
first68_train["encoding predicted loop"] = tab3.tolist()
first68_train

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,...,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C,One_hot seq,One hot strucure,encoding predicted loop
0,0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...",...,"[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
2,2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,8.800,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...",...,"[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
5,5,id_00ab2d761,GGAAAGCGCCGCGGCGGUAGCGGCAGCGAGGAGCGCUACCAAGGCA...,.....(.(((((.(((((((((...........)))))))..(((....,EEEEESISSSSSISSSSSSSSSHHHHHHHHHHHSSSSSSSMMSSSH...,4.136,1,107,68,"[0.1942, 0.2041, 0.1626, 0.1213, 0.10590000000...",...,"[0.165, 0.20520000000000002, 0.179, 0.1333, 0....","[0.2864, 0.24710000000000001, 0.2222, 0.1903, ...","[0.7642, 1.6641, 1.0622, 0.5008, 0.4107, 0.133...","[0.9559000000000001, 1.9442, 1.0114, 0.5105000...","[1.9554, 2.1298, 1.0403, 0.609, 0.5486, 0.386,...","[0.22460000000000002, 1.7281, 1.381, 0.6623, 0...","[0.5882000000000001, 1.1786, 0.9704, 0.6035, 0...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
6,6,id_00abef1d7,GGAAAACAAUUGCAUCGUUAGUACGACUCCACAGCGUAAGCUGUGG...,.........((((((((......((((((((((((....)))))))...,EEEEEEEEESSSSSSSSIIIIIISSSSSSSSSSSSHHHHSSSSSSS...,2.485,1,107,68,"[0.422, 0.5478000000000001, 0.4749000000000000...",...,"[0.5827, 0.7555000000000001, 0.5949, 0.4511, 0...","[0.9306000000000001, 1.0496, 0.5844, 0.7796000...","[0.895, 2.3377, 2.2305, 2.003, 1.9006, 1.0373,...","[0.46040000000000003, 3.6695, 0.78550000000000...","[2.7711, 7.365, 1.6924000000000001, 1.43840000...","[1.073, 2.8604000000000003, 1.9936, 1.0273, 1....","[2.0964, 3.3688000000000002, 0.6399, 2.1053, 1...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
7,7,id_00b436dec,GGAAAUCAUCGAGGACGGGUCCGUUCAGCACGCGAAAGCGUCGUGA...,.....(((((((((((..(((((((((..((((....))))..)))...,EEEEESSSSSSSSSSSIISSSSSSSSSIISSSSHHHHSSSSIISSS...,1.727,1,107,68,"[0.4843, 0.5233, 0.4554, 0.43520000000000003, ...",...,"[0.384, 0.723, 0.4766, 0.30260000000000004, 0....","[0.7429, 0.9137000000000001, 0.480400000000000...","[1.1576, 1.5137, 1.3382, 1.5622, 1.2121, 0.295...","[1.6912, 5.2652, 2.3901, 0.45890000000000003, ...","[1.8641, 2.3767, 1.149, 1.0132, 0.9876, 0.0, 0...","[0.49060000000000004, 4.6339, 1.95860000000000...","[1.2852000000000001, 2.5460000000000003, 0.234...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2394,2394,id_ff13729b0,GGAAAUAAAUAAAUAACAAUAAAGAGAUAAGACACAAUAAAUAAAA...,.................................................,EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE...,1.995,0,107,68,"[0.3019, 0.33680000000000004, 0.25980000000000...",...,"[0.25930000000000003, 0.3074, 0.2109, 0.1995, ...","[0.465, 0.6078, 0.30310000000000004, 0.3873000...","[0.3517, 0.5358, 0.4318, 0.016900000000000002,...","[0.6612, 1.0221, 0.1676, 0.1648, 0.5634, 0.645...","[4.0973, 2.0778, 0.2776, 0.1207, 0.63140000000...","[0.2661, 0.5771000000000001, 0.3517, 0.295, 0....","[0.2897, 1.1666, 0.135, 0.4742, 0.9522, 0.8408...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
2395,2395,id_ff84602f7,GGAAAAUAGCAGAGGAAAUACUAGAGCAAUUGCAAAGGCCGAUCAU...,........((..((......))...)).........(((..........,EEEEEEEESSIISSHHHHHHSSIIISSXXXXXXXXXSSSHHHHHHH...,4.036,1,107,68,"[0.2585, 0.29710000000000003, 0.2748, 0.205000...",...,"[0.2093, 0.2985, 0.2922, 0.08360000000000001, ...","[0.29460000000000003, 0.40850000000000003, 0.3...","[0.6957, 1.251, 1.3235999999999999, 0.7521, 0....","[0.6439, 2.0117, 1.3682, 0.0918, 0.65860000000...","[2.1589, 3.3601, 1.6179000000000001, 0.1344000...","[0.47900000000000004, 1.9583, 2.4635, 0.0512, ...","[0.5759000000000001, 2.3736, 1.4158, 0.1914000...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
2396,2396,id_ff85fcdba,GGAAAACAAAAACAAACAACAAAAACAAACAACAAAAACAAACAAC...,.................................................,EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE...,3.227,1,107,68,"[0.2169, 0.2513, 0.2303, 0.22260000000000002, ...",...,"[0.2758, 0.3659, 0.2155, 0.28340000000000004, ...","[0.401, 0.388, 0.3403, 0.3608, 0.3057, 0.242, ...","[0.2891, 0.4496, 0.7165, 0.7128, 0.59310000000...","[0.3619, 0.6924, 0.2988, 0.3639, 0.545, 0.2263...","[2.8541, 1.6106, 1.4343, 1.0797, 0.6803, 0.559...","[0.2964, 0.9351, 0.2555, 0.7603000000000001, 0...","[0.6526000000000001, 0.2548, 0.6927, 0.9316000...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
2398,2398,id_ffe06f3fe,GGAAACGAUAGCAGAAGAGAUCGAUAUAGAGCAUAAGCUAAGAAUA...,.....((((..(....)..))))......(((....)))..........,EEEEESSSSIISHHHHSIISSSSXXXXXXSSSHHHHSSSXXXXXXX...,5.553,0,107,68,"[0.1431, 0.1847, 0.15960000000000002, 0.1466, ...",...,"[0.0944, 0.1453, 0.1067, 0.0994, 0.06470000000...","[0.1691, 0.22740000000000002, 0.178, 0.1762, 0...","[0.6919000000000001, 1.4823, 1.3685, 1.2473, 0...","[0.4544, 2.4603, 0.8778, 0.6402, 0.28340000000...","[2.7157999999999998, 3.1249000000000002, 1.137...","[0.3262, 1.3932, 0.8832000000000001, 0.8144, 0...","[0.5814, 1.5119, 1.1749, 1.2676, 0.22190000000...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."


In [8]:
def right_format(df, colname):
    """
    juste pck j'crois que y'avait des liste dans des listes
    j'ai transformé en liste d'array
    """
    x1 = df[colname].copy()
    x2 = []
    for r in x1:
        lst = []
        for u in r:
            lst.append(np.array(u[0], float))
        lst = np.array(lst)
        x2.append(lst)

    return x2

In [9]:
x_seq = right_format(first68_train, "One_hot seq")
x_strct = right_format(first68_train, "One hot strucure")
x_loop = right_format(first68_train, "encoding predicted loop")

#print(len(x_seq))
#print(len(x_strct))
#print(len(x_loop))
x_seq = np.array(x_seq)
x_strct = np.array(x_strct)
x_loop = np.array(x_loop)
x_all = np.concatenate((x_seq, x_strct,x_loop), axis=2)
x_all.shape

(2097, 68, 14)

In [10]:
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C', 'deg_pH10', 'deg_50C']

In [11]:
y_true = tf.random.normal((32, 68, 5))
y_pred = tf.random.normal((32, 68, 5))

In [12]:
def MCRMSE(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    re = tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)
    print(re)
    return re

In [13]:
def gru_layer(hidden_dim, dropout):
    return L.Bidirectional(L.GRU(
        hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))

def build_model(embed_size, seq_len=68, pred_len=68, dropout=0.5, 
                sp_dropout=0.2, embed_dim=200, hidden_dim=256, n_layers=3):
    inputs = L.Input(shape=(seq_len, 14))# 107 longueur sequence - 4 ATGU
    embed = L.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)

    reshaped = tf.reshape(
        embed, shape=(-1, 68,  embed.shape[2] * embed.shape[3])
    )
    hidden = L.SpatialDropout1D(sp_dropout)(reshaped)
    
    for x in range(n_layers):
        print(x,"layer")
        hidden = gru_layer(hidden_dim, dropout)(hidden)
    
    # Since we are only making predictions on the first part of each sequence, 
    # we have to truncate it
    truncated = hidden[:, :pred_len]
    out = L.Dense(5, activation='relu')(truncated)
    
    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.keras.optimizers.Adam(), loss=MCRMSE, metrics=["accuracy"])
    
    return model

In [14]:
def pandas_list_to_array(df):
    """
    Input: dataframe of shape (x, y), containing list of length l
    Return: np.array of shape (x, l, y)
    """
    
    return np.transpose(
        np.array(df.values.tolist()),
        (0, 2, 1)
    )

In [15]:
train_labels = pandas_list_to_array(train[pred_cols])

In [16]:
x_train, x_val, y_train, y_val = train_test_split(
    x_all, train_labels, test_size=.1, random_state=34, stratify=train.SN_filter)

print(x_train.shape)
print(y_train.shape)

(1887, 68, 14)
(1887, 68, 5)


In [17]:
model = build_model(embed_size=14)
model.summary()

0 layer
1 layer
2 layer
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 68, 14)]          0         
_________________________________________________________________
embedding (Embedding)        (None, 68, 14, 200)       2800      
_________________________________________________________________
tf_op_layer_Reshape (TensorF [(None, 68, 2800)]        0         
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 68, 2800)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 68, 512)           4697088   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 68, 512)           1182720   
_________________________________________________________________
bidirectional_2 (Bidirection (

In [45]:
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=1,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5')
    ]
)

Tensor("MCRMSE/Mean_1:0", shape=(None,), dtype=float32)
Tensor("MCRMSE/Mean_1:0", shape=(None,), dtype=float32)


In [None]:
# relu, rmsprop, 3 couches, noeaud(256 128 85), 30 epochs : 
# loss: 0.2735 - accuracy: 0.4391 - val_loss: 0.2643 - val_accuracy: 0.4440


In [46]:
print(256, int(256/2), int(256/3))
history.history

256 128 85


{'loss': [0.5505052804946899],
 'accuracy': [0.2459864765405655],
 'val_loss': [0.41559648513793945],
 'val_accuracy': [0.2228991538286209],
 'lr': [0.001]}

In [None]:
import plotly.express as px
fig = px.line(
    history.history, y=['loss', 'accuracy', 'val_loss', 'val_accuracy'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='Training History')
fig.show()

In [27]:
last68_train
t = one_hot_encoding(last68_train, 'sequence')
t2 = one_hot_encoding(last68_train, 'structure')
t3 = one_hot_encoding(last68_train, 'predicted_loop_type')

last68_train["One_hot seq"] = t.tolist() # ajout des colonnes encoding a notre DF de départ
last68_train["One hot strucure"] = t2.tolist()
last68_train["encoding predicted loop"] = t3.tolist()
last68_train

x_seq2 = right_format(last68_train, "One_hot seq")
x_strct2 = right_format(last68_train, "One hot strucure")
x_loop2 = right_format(last68_train, "encoding predicted loop")

#print(len(x_seq))
#print(len(x_strct))
#print(len(x_loop))
x_seq2 = np.array(x_seq2)
x_strct2 = np.array(x_strct2)
x_loop2 = np.array(x_loop2)
x_all2 = np.concatenate((x_seq2, x_strct2,x_loop2), axis=2)
x_all2.shape

(2097, 68, 14)

In [28]:
model.load_weights('model.h5')
train1_preds = model.predict(x_all)
train2_preds = model.predict(x_all2)

In [30]:
train1_preds

array([[[0.90393806, 0.6613666 , 0.91118795, 1.5056193 , 0.9298657 ],
        [1.1889478 , 0.88627654, 1.1505041 , 1.6896155 , 1.118321  ],
        [1.2566925 , 0.93215555, 1.2239708 , 1.5971904 , 1.1345354 ],
        ...,
        [0.44921324, 0.38045922, 0.5262022 , 0.48042727, 0.4627806 ],
        [0.37794954, 0.32218608, 0.43487206, 0.3431243 , 0.37883466],
        [0.26133105, 0.2288419 , 0.28810087, 0.14659883, 0.25699896]],

       [[0.88801134, 0.6497552 , 0.8943719 , 1.4834915 , 0.9170171 ],
        [1.1606777 , 0.8673179 , 1.1258777 , 1.656759  , 1.0978813 ],
        [1.2116538 , 0.90511316, 1.191824  , 1.5528655 , 1.104795  ],
        ...,
        [0.2600085 , 0.4192485 , 0.3797747 , 0.24643084, 0.27848563],
        [0.26271713, 0.38610294, 0.31940877, 0.1900029 , 0.24798624],
        [0.23871613, 0.30569872, 0.24546069, 0.09623206, 0.19423725]],

       [[0.8852091 , 0.6435987 , 0.8899377 , 1.4749409 , 0.9131397 ],
        [1.1582663 , 0.8589161 , 1.1218555 , 1.6475452 , 1.0

In [31]:
train2_preds

array([[[0.9082878 , 0.66908693, 0.84680086, 1.424108  , 0.8908664 ],
        [1.059046  , 0.8083022 , 0.9850202 , 1.4411715 , 0.97634697],
        [0.88460743, 0.72183865, 0.90921855, 1.158859  , 0.8347094 ],
        ...,
        [0.02104485, 0.19813141, 0.09922462, 0.        , 0.06778613],
        [0.00353262, 0.151982  , 0.04713656, 0.        , 0.03283549],
        [0.03855949, 0.13369955, 0.02045606, 0.        , 0.02326482]],

       [[0.21420045, 0.32312328, 0.24818683, 0.5171417 , 0.28661913],
        [0.24111669, 0.3939018 , 0.24212185, 0.44039842, 0.2927215 ],
        [0.09266131, 0.32597846, 0.17589271, 0.21445985, 0.16954954],
        ...,
        [0.0210466 , 0.19813363, 0.09922563, 0.        , 0.06778987],
        [0.00353378, 0.15198357, 0.04713756, 0.        , 0.03283784],
        [0.03856023, 0.13370064, 0.02045688, 0.        , 0.02326625]],

       [[0.3297636 , 0.33917516, 0.4332485 , 0.76115537, 0.44171545],
        [0.36201075, 0.40864494, 0.50933146, 0.7450724 , 0.4