In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Initial dataset: both files are read as line-delimited JSON (one record per line).
train = pd.read_json("train.json", lines = True)
test = pd.read_json("test.json", lines = True)

In [3]:
import json

import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow.keras.layers as L
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.layers import LeakyReLU

In [9]:
# Fix the TensorFlow and NumPy global seeds (both 2020) so random steps are repeatable.
tf.random.set_seed(2020)
np.random.seed(2020)

In [10]:
# Target columns; train[pred_cols] is what gets converted into the label array,
# so this order is the order of the targets on its last axis.
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C', 'deg_pH10', 'deg_50C']

In [11]:
# Random-normal tensors of shape (32, 68, 3).
# NOTE(review): presumably scratch inputs for trying out the loss function by hand;
# no other cell visible here reads these two globals.
y_true = tf.random.normal((32, 68, 3))
y_pred = tf.random.normal((32, 68, 3))

In [12]:
def MCRMSE(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The squared error is averaged over axis 1, the square root of each of those
    means is taken, and the resulting per-column RMSE values are averaged over
    axis 1 of the reduced tensor.

    Args:
        y_true: ground-truth tensor. Assumed to be rank 3,
            (batch, positions, targets) - TODO confirm against callers.
        y_pred: prediction tensor that can be subtracted from ``y_true``.

    Returns:
        Tensor holding one loss value per sample (shape ``(batch,)`` for a
        rank-3 input).
    """
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    # Deliberately no print() here: inside a Keras loss it only runs while the
    # graph is traced and shows a symbolic tensor, not loss values.
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

In [41]:
def gru_layer(hidden_dim, dropout):
    """Create a bidirectional GRU layer that returns its whole output sequence.

    Args:
        hidden_dim: number of units of the wrapped GRU.
        dropout: dropout rate passed to the GRU.

    Returns:
        An ``L.Bidirectional`` wrapper around a GRU configured with
        ``return_sequences=True`` and an orthogonal kernel initializer.
    """
    recurrent = L.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return L.Bidirectional(recurrent)

def build_model(embed_size, seq_len=68, pred_len=68, dropout=0.5,
                sp_dropout=0.2, embed_dim=200, hidden_dim=256, n_layers=3):
    """Build and compile the stacked bidirectional-GRU regression model.

    Pipeline: Input -> Embedding -> reshape (merge the two trailing axes) ->
    SpatialDropout1D -> ``n_layers`` bidirectional GRUs -> truncate to the
    first ``pred_len`` positions -> Dense(5, linear).

    Args:
        embed_size: ``input_dim`` of the Embedding layer (size of the index
            vocabulary).
        seq_len: number of positions per input sequence.
        pred_len: number of leading positions for which outputs are produced.
        dropout: dropout rate used inside every GRU layer.
        sp_dropout: rate of the SpatialDropout1D applied after the embedding.
        embed_dim: output dimension of the Embedding layer.
        hidden_dim: units of each GRU direction.
        n_layers: number of stacked bidirectional GRU layers.

    Returns:
        A compiled ``tf.keras.Model`` (RMSprop optimizer, MCRMSE loss).
    """
    # Each position carries 4 channels; every channel value is looked up in the
    # embedding table as an integer index.
    # NOTE(review): the notebook feeds one-hot rows, so presumably only the
    # indices 0 and 1 are ever looked up - confirm this is intended.
    inputs = L.Input(shape=(seq_len, 4))
    embed = L.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)

    # Merge the (channels, embed_dim) axes into a single feature axis. The
    # sequence axis uses seq_len (it used to be a hard-coded 68, which made any
    # other seq_len fail).
    reshaped = tf.reshape(
        embed, shape=(-1, seq_len, embed.shape[2] * embed.shape[3])
    )
    hidden = L.SpatialDropout1D(sp_dropout)(reshaped)

    for _ in range(n_layers):
        hidden = gru_layer(hidden_dim, dropout)(hidden)

    # Since we are only making predictions on the first part of each sequence,
    # we have to truncate it
    truncated = hidden[:, :pred_len]
    out = L.Dense(5, activation='linear')(truncated)

    model = tf.keras.Model(inputs=inputs, outputs=out)
    # NOTE(review): "accuracy" is kept because the history plot reads those
    # keys, but it is not a meaningful metric for this regression output.
    model.compile(tf.keras.optimizers.RMSprop(), loss=MCRMSE, metrics=["accuracy"])

    return model

In [42]:
def pandas_list_to_array(df):
    """
    Turn a frame whose cells are equal-length lists into a 3-D array.

    Input: dataframe of shape (x, y), every cell holding a list of length l
    Return: np.array of shape (x, l, y)
    """
    # .values.tolist() yields x rows of y lists of length l -> shape (x, y, l).
    stacked = np.array(df.values.tolist())
    # Swap the last two axes so the list positions come before the columns.
    return stacked.transpose((0, 2, 1))

In [43]:
def preprocess_inputs(df, token2int, cols=['sequence', 'structure', 'predicted_loop_type']):
    """Encode the selected columns token by token and stack them into an array.

    Args:
        df: frame containing the columns named in ``cols``; every cell must be
            iterable (each item is looked up in ``token2int``).
        token2int: mapping from token to integer id.
        cols: names of the columns to encode.

    Returns:
        The result of ``pandas_list_to_array`` applied to the encoded columns.
    """
    def encode(seq):
        # One integer id per token, in the original order.
        return [token2int[tok] for tok in seq]

    encoded = df[cols].applymap(encode)
    return pandas_list_to_array(encoded)

In [44]:
# Keep only the rows whose signal_to_noise is at least 1 (rebinds `train`).
train = train.query("signal_to_noise >= 1")

In [45]:
# Integer id for every character of '().ACGUBEHIMSX', numbered in that order.
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

# Token-encoded inputs (default columns of preprocess_inputs) and the targets
# listed in pred_cols, both converted to arrays.
train_inputs = preprocess_inputs(train, token2int)
train_labels = pandas_list_to_array(train[pred_cols])

In [46]:
# First split: token-encoded inputs (train_inputs) vs. labels, 10% validation,
# stratified on SN_filter.
# NOTE(review): x_train / x_val / y_train / y_val from this split are overwritten
# by the second split below, so only its printed shapes survive.
x_train, x_val, y_train, y_val = train_test_split(
    train_inputs, train_labels, test_size=.1, random_state=34, stratify=train.SN_filter)

print(x_train.shape)
print(y_train.shape)

# Second split: same labels and settings, but the inputs are x_seq.
# NOTE(review): x_seq is assigned in a cell that appears further down this file
# (In [24]), so a fresh top-to-bottom run reaches this line before x_seq exists.
x_train, x_val, y_train, y_val = train_test_split(
    x_seq, train_labels, test_size=.1, random_state=34, stratify=train.SN_filter)

print(x_train.shape)
print(y_train.shape)

(1887, 107, 3)
(1887, 68, 5)
(1887, 68, 4)
(1887, 68, 5)


In [47]:
# Inspect the training inputs (displays the repr of the whole array).
x_train

array([[[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]],

       ...,

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 1.

In [48]:
# Inspect the raw one-hot column.
# NOTE(review): first68_train and its 'One_hot seq' column are created in cells
# that appear further down this file, so this cell fails on a fresh top-to-bottom run.
first68_train['One_hot seq'].values

array([list([[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]], [[0, 1, 0, 0]], [[0, 1, 0, 0]], [[0, 1, 0, 0]], [[1, 0, 0, 0]], [[0, 0, 1, 0]], [[0, 0, 0, 1]], [[0, 0, 1, 0]], [[0, 0, 0, 1]], [[0, 1, 0, 0]], [[0, 1, 0, 0]], [[0, 0, 0, 1]], [[0, 1, 0, 0]], [[0, 1, 0, 0]], [[0, 0, 1, 0]], [[0, 1, 0, 0]], [[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]], [[0, 0, 1, 0]], [[0, 0, 0, 1]], [[0, 1, 0, 0]], [[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]], [[0, 0, 1, 0]], [[0, 0, 0, 1]], [[0, 1, 0, 0]], [[0, 0, 1, 0]], [[1, 0, 0, 0]], [[0, 0, 0, 1]], [[0, 1, 0, 0]], [[0, 0, 0, 1]], [[0, 0, 0, 1]], [[0, 0, 0, 1]], [[0, 0, 1, 0]], [[0, 0, 0, 1]], [[0, 1, 0, 0]], [[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 0, 0, 1]], [[0, 1, 0, 0]], [[0, 1, 0, 0]], [[0, 0, 1, 0]], [[0, 0, 0, 1]], [[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]], [[0, 1, 0, 0]], [[0, 0, 0, 1]], [[0, 1, 0, 0]], [[0, 1, 0, 0]], [[0, 0, 1, 0]], [[0, 0, 1, 0]], [[0, 0, 1, 0]], [[0, 1, 0, 0]], [[0, 0, 0, 1]], [[0, 1, 0, 

In [49]:
# Split the test set by sequence length (107 vs. 130) ...
public_df = test.query("seq_length == 107")
private_df = test.query("seq_length == 130")

# ... and token-encode each part with the same token2int mapping as the training data.
public_inputs = preprocess_inputs(public_df, token2int)
private_inputs = preprocess_inputs(private_df, token2int)

In [50]:
# Sanity check of the label array shape.
y_train.shape

(1887, 68, 5)

In [51]:
# Build the network with an embedding vocabulary of 4 (all other arguments keep
# their defaults) and print the layer summary.
model = build_model(embed_size=4)
model.summary()

0 layer
1 layer
2 layer
Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 68, 4)]           0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 68, 4, 200)        800       
_________________________________________________________________
tf_op_layer_Reshape_2 (Tenso [(None, 68, 800)]         0         
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 68, 800)           0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 68, 512)           1625088   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 68, 512)           1182720   
_________________________________________________________________
bidirectional_8 (Bidirection (

In [52]:
import warnings
# Helper that emits a DeprecationWarning.
def fxn():
    warnings.warn("deprecated", DeprecationWarning)
# Call the helper with every warning ignored.
# NOTE(review): catch_warnings() restores the previous filters when the block
# exits, so this cell does not silence warnings raised by later cells - confirm
# whether a notebook-wide filter was intended.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

In [53]:
# Train for 75 epochs with batches of 64, validating on the held-out split.
# Callbacks: reduce the learning rate on a plateau (patience of 5 epochs) and
# write the model to 'model.h5'.
# NOTE(review): ModelCheckpoint is used with its defaults, so presumably the
# file is overwritten every epoch rather than keeping the best one - confirm.
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=75,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5')
    ]
)
# Results of earlier runs (activation, optimizer, GRU layers, epochs : final metrics):
#1 :  linear, rmsprop, 1layer,   45    : loss: 0.2506 - accuracy: 0.4689 - val_loss: 0.2724 - val_accuracy: 0.4438
#2 :  relu,   rmsprop, 1layer,   45    : loss: 0.2744 - accuracy: 0.4596 - val_loss: 0.2856 - val_accuracy: 0.4480
#3 :  linear, rmsprop, 3layer,   45    : loss: 0.2428 - accuracy: 0.4666 - val_loss: 0.2824 - val_accuracy: 0.4630
#4 :  linear, rmsprop, 2layers,  45    : loss: 0.2497 - accuracy: 0.4642 - val_loss: 0.2802 - val_accuracy: 0.4513
#6 :  linear, rmsprop, 3layers,  75    : loss: 0.2368 - accuracy: 0.4800 - val_loss: 0.2626 - val_accuracy: 0.4732
# for the last run, it stabilises at around epoch 50

Epoch 1/75
Tensor("MCRMSE/Mean_1:0", shape=(None,), dtype=float32)
Tensor("MCRMSE/Mean_1:0", shape=(None,), dtype=float32)
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75


Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


- 1 layer, Activation relu, Optimization RMSprop:
    Epoch 45/45
30/30 [==============================] - 47s 2s/step - loss: 0.2739 - val_loss: 0.2852

In [36]:
# NOTE(review): ad-hoc arithmetic left in the notebook; what 1305600 and 85600
# stand for is not documented anywhere in this file.
1305600/85600

15.25233644859813

In [54]:
# plotly.express is already imported near the top of the notebook; this repeat is harmless.
import plotly.express as px
# Plot the training and validation curves recorded by model.fit, one line per key.
# NOTE(review): the y-axis is labelled 'MCRMSE', but the two accuracy series are
# drawn on the same axis.
fig = px.line(
    history.history, y=['loss', 'accuracy', 'val_loss', 'val_accuracy'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='Training History')
fig.show()

In [34]:
# Raw per-epoch values recorded during training (displays the full dict).
history.history

{'loss': [0.5811967253684998,
  0.45659327507019043,
  0.4234050214290619,
  0.40253010392189026,
  0.3813134729862213,
  0.3703606128692627,
  0.3631477952003479,
  0.35643303394317627,
  0.35147225856781006,
  0.3453005850315094,
  0.3403914272785187,
  0.33420664072036743,
  0.32982534170150757,
  0.32509031891822815,
  0.3222046196460724,
  0.3180749714374542,
  0.3100661337375641,
  0.3081080913543701,
  0.305099755525589,
  0.2993493974208832,
  0.2997782230377197,
  0.2947523295879364,
  0.29163646697998047,
  0.28758668899536133,
  0.2870587408542633,
  0.2830219864845276,
  0.2813083529472351,
  0.2795730233192444,
  0.2754025161266327,
  0.27326300740242004,
  0.27363577485084534,
  0.26893919706344604,
  0.2674010396003723,
  0.2656969428062439,
  0.26308438181877136,
  0.26385998725891113,
  0.2586568593978882,
  0.2574784457683563,
  0.2546256482601166,
  0.25375163555145264,
  0.25176623463630676,
  0.2497900277376175,
  0.24584801495075226,
  0.24561317265033722,
  0.242

In [20]:
def divide_set(df, list_column, window=68):
    """
    Build two fixed-width reading windows over the selected columns.

        :Parameters:
            df = a (train) data frame; it is not modified
            list_column = positional (integer) indices of the columns to slice;
                          every value in those columns must support len() and slicing
            window = number of leading / trailing items to keep (default 68)
        :Return:
            two data frames, both copies of df:
                first68 = selected columns hold the first `window` items of each value
                last68 = selected columns hold the last `window` items of each value
        :Raises:
            ValueError if window is negative
    """
    if window < 0:
        raise ValueError("window must be non-negative")
    first68 = df.copy()
    last68 = df.copy()
    for a in list_column:
        for i, r in enumerate(df.iloc[:, a]):
            first68.iloc[i, a] = r[:window]
            # Clamp the start at 0: for values shorter than the window,
            # len(r) - window is negative and would be read as an index from
            # the end, returning only a tail fragment instead of the whole value.
            last68.iloc[i, a] = r[max(len(r) - window, 0):]
    return first68, last68

In [21]:
# Slice the columns at positions 2, 3 and 4 into first-68 / last-68 windows.
# NOTE(review): presumably sequence, structure and predicted_loop_type (that is
# the column order in the table displayed below) - positional indices break
# silently if the frame layout changes.
first68_train, last68_train = divide_set(train, [2,3,4])

In [22]:
def one_hot_encoding(df, name_colonne):
    """
    One-hot encode every symbol of every value in one column.

    The vocabulary is collected from the column itself, in order of first
    appearance, so the position of the "hot" bit depends on the data passed in.

        :Parameters:
            df = data frame holding the column to encode
            name_colonne = name of the column of df
        :Return:
            np.array built from one list per row, each containing one
            (1, vocabulary size) integer vector per symbol
    """
    # Symbol -> index of its hot bit, numbered by first appearance.
    symbol_index = {}
    for row in df[name_colonne]:
        for symbol in row:
            symbol_index.setdefault(symbol, len(symbol_index))

    # One (1, n_symbols) vector per symbol; the extra leading axis is kept on purpose.
    n_symbols = len(symbol_index)
    one_hot = {}
    for symbol, position in symbol_index.items():
        vector = np.zeros(shape=(1, n_symbols), dtype=int)
        vector[0, position] = 1
        one_hot[symbol] = vector

    encoded_rows = [[one_hot[symbol] for symbol in row] for row in df[name_colonne]]
    return np.array(encoded_rows)

# One-hot encode the three text columns of the first-68 window.
tab = one_hot_encoding(first68_train, 'sequence')
tab2 = one_hot_encoding(first68_train, 'structure')
tab3 = one_hot_encoding(first68_train, 'predicted_loop_type')

# The arrays are stored as nested Python lists (.tolist()); first68_train is modified in place.
first68_train["One_hot seq"] = tab.tolist() # add the encoding columns to our starting DF
first68_train["One hot strucure"] = tab2.tolist()
first68_train["encoding predicted loop"] = tab3.tolist()
first68_train

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,...,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C,One_hot seq,One hot strucure,encoding predicted loop
0,0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...",...,"[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
2,2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,8.800,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...",...,"[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
5,5,id_00ab2d761,GGAAAGCGCCGCGGCGGUAGCGGCAGCGAGGAGCGCUACCAAGGCA...,.....(.(((((.(((((((((...........)))))))..(((....,EEEEESISSSSSISSSSSSSSSHHHHHHHHHHHSSSSSSSMMSSSH...,4.136,1,107,68,"[0.1942, 0.2041, 0.1626, 0.1213, 0.10590000000...",...,"[0.165, 0.20520000000000002, 0.179, 0.1333, 0....","[0.2864, 0.24710000000000001, 0.2222, 0.1903, ...","[0.7642, 1.6641, 1.0622, 0.5008, 0.4107, 0.133...","[0.9559000000000001, 1.9442, 1.0114, 0.5105000...","[1.9554, 2.1298, 1.0403, 0.609, 0.5486, 0.386,...","[0.22460000000000002, 1.7281, 1.381, 0.6623, 0...","[0.5882000000000001, 1.1786, 0.9704, 0.6035, 0...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
6,6,id_00abef1d7,GGAAAACAAUUGCAUCGUUAGUACGACUCCACAGCGUAAGCUGUGG...,.........((((((((......((((((((((((....)))))))...,EEEEEEEEESSSSSSSSIIIIIISSSSSSSSSSSSHHHHSSSSSSS...,2.485,1,107,68,"[0.422, 0.5478000000000001, 0.4749000000000000...",...,"[0.5827, 0.7555000000000001, 0.5949, 0.4511, 0...","[0.9306000000000001, 1.0496, 0.5844, 0.7796000...","[0.895, 2.3377, 2.2305, 2.003, 1.9006, 1.0373,...","[0.46040000000000003, 3.6695, 0.78550000000000...","[2.7711, 7.365, 1.6924000000000001, 1.43840000...","[1.073, 2.8604000000000003, 1.9936, 1.0273, 1....","[2.0964, 3.3688000000000002, 0.6399, 2.1053, 1...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
7,7,id_00b436dec,GGAAAUCAUCGAGGACGGGUCCGUUCAGCACGCGAAAGCGUCGUGA...,.....(((((((((((..(((((((((..((((....))))..)))...,EEEEESSSSSSSSSSSIISSSSSSSSSIISSSSHHHHSSSSIISSS...,1.727,1,107,68,"[0.4843, 0.5233, 0.4554, 0.43520000000000003, ...",...,"[0.384, 0.723, 0.4766, 0.30260000000000004, 0....","[0.7429, 0.9137000000000001, 0.480400000000000...","[1.1576, 1.5137, 1.3382, 1.5622, 1.2121, 0.295...","[1.6912, 5.2652, 2.3901, 0.45890000000000003, ...","[1.8641, 2.3767, 1.149, 1.0132, 0.9876, 0.0, 0...","[0.49060000000000004, 4.6339, 1.95860000000000...","[1.2852000000000001, 2.5460000000000003, 0.234...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2394,2394,id_ff13729b0,GGAAAUAAAUAAAUAACAAUAAAGAGAUAAGACACAAUAAAUAAAA...,.................................................,EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE...,1.995,0,107,68,"[0.3019, 0.33680000000000004, 0.25980000000000...",...,"[0.25930000000000003, 0.3074, 0.2109, 0.1995, ...","[0.465, 0.6078, 0.30310000000000004, 0.3873000...","[0.3517, 0.5358, 0.4318, 0.016900000000000002,...","[0.6612, 1.0221, 0.1676, 0.1648, 0.5634, 0.645...","[4.0973, 2.0778, 0.2776, 0.1207, 0.63140000000...","[0.2661, 0.5771000000000001, 0.3517, 0.295, 0....","[0.2897, 1.1666, 0.135, 0.4742, 0.9522, 0.8408...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
2395,2395,id_ff84602f7,GGAAAAUAGCAGAGGAAAUACUAGAGCAAUUGCAAAGGCCGAUCAU...,........((..((......))...)).........(((..........,EEEEEEEESSIISSHHHHHHSSIIISSXXXXXXXXXSSSHHHHHHH...,4.036,1,107,68,"[0.2585, 0.29710000000000003, 0.2748, 0.205000...",...,"[0.2093, 0.2985, 0.2922, 0.08360000000000001, ...","[0.29460000000000003, 0.40850000000000003, 0.3...","[0.6957, 1.251, 1.3235999999999999, 0.7521, 0....","[0.6439, 2.0117, 1.3682, 0.0918, 0.65860000000...","[2.1589, 3.3601, 1.6179000000000001, 0.1344000...","[0.47900000000000004, 1.9583, 2.4635, 0.0512, ...","[0.5759000000000001, 2.3736, 1.4158, 0.1914000...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
2396,2396,id_ff85fcdba,GGAAAACAAAAACAAACAACAAAAACAAACAACAAAAACAAACAAC...,.................................................,EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE...,3.227,1,107,68,"[0.2169, 0.2513, 0.2303, 0.22260000000000002, ...",...,"[0.2758, 0.3659, 0.2155, 0.28340000000000004, ...","[0.401, 0.388, 0.3403, 0.3608, 0.3057, 0.242, ...","[0.2891, 0.4496, 0.7165, 0.7128, 0.59310000000...","[0.3619, 0.6924, 0.2988, 0.3639, 0.545, 0.2263...","[2.8541, 1.6106, 1.4343, 1.0797, 0.6803, 0.559...","[0.2964, 0.9351, 0.2555, 0.7603000000000001, 0...","[0.6526000000000001, 0.2548, 0.6927, 0.9316000...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."
2398,2398,id_ffe06f3fe,GGAAACGAUAGCAGAAGAGAUCGAUAUAGAGCAUAAGCUAAGAAUA...,.....((((..(....)..))))......(((....)))..........,EEEEESSSSIISHHHHSIISSSSXXXXXXSSSHHHHSSSXXXXXXX...,5.553,0,107,68,"[0.1431, 0.1847, 0.15960000000000002, 0.1466, ...",...,"[0.0944, 0.1453, 0.1067, 0.0994, 0.06470000000...","[0.1691, 0.22740000000000002, 0.178, 0.1762, 0...","[0.6919000000000001, 1.4823, 1.3685, 1.2473, 0...","[0.4544, 2.4603, 0.8778, 0.6402, 0.28340000000...","[2.7157999999999998, 3.1249000000000002, 1.137...","[0.3262, 1.3932, 0.8832000000000001, 0.8144, 0...","[0.5814, 1.5119, 1.1749, 1.2676, 0.22190000000...","[[[1, 0, 0, 0]], [[1, 0, 0, 0]], [[0, 1, 0, 0]...","[[[1, 0, 0]], [[1, 0, 0]], [[1, 0, 0]], [[1, 0...","[[[1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, ..."


In [23]:
def right_format(df, colname):
    """
    Flatten the extra nesting level of a one-hot column.

    Every cell of df[colname] is expected to be a sequence of single-element
    wrappers (lists inside lists); the first element of each wrapper is
    converted to a float array and the arrays of one cell are stacked.

        :Parameters:
            df = data frame holding the column
            colname = name of the column to convert
        :Return:
            a list with one np.array per row of df
    """
    return [
        np.array([np.array(wrapped[0], float) for wrapped in cell])
        for cell in df[colname]
    ]

In [24]:
# Convert the nested one-hot sequence column into a list of per-row float arrays ...
x_seq = right_format(first68_train, "One_hot seq")
#x_strct = right_format(first68_train, "One hot strucure")
#x_loop = right_format(first68_train, "encoding predicted loop")

#print(len(x_seq))
#print(len(x_strct))
#print(len(x_loop))
# ... then stack that list into a single ndarray (this x_seq is what the second
# train/validation split consumes).
x_seq = np.array(x_seq)
x_seq

array([[[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]],

       ...,

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [1., 0.

In [21]:
# Raw values of three target columns, taken straight from the frame.
# NOTE(review): no other cell visible here uses y1, y2 or y3 beyond displaying y1.
y1 = first68_train['reactivity'].values
y2 = first68_train['deg_Mg_pH10'].values
y3 = first68_train['deg_Mg_50C'].values

In [22]:
# Rebind x_train to a TensorFlow tensor built from its current value, then display it.
x_train = tf.convert_to_tensor(x_train)
x_train

<tf.Tensor: shape=(1887, 107, 3), dtype=int64, numpy=
array([[[5, 2, 8],
        [5, 2, 8],
        [3, 2, 8],
        ...,
        [3, 2, 8],
        [3, 2, 8],
        [4, 2, 8]],

       [[5, 2, 8],
        [5, 2, 8],
        [3, 2, 8],
        ...,
        [3, 2, 8],
        [3, 2, 8],
        [4, 2, 8]],

       [[5, 2, 8],
        [5, 2, 8],
        [3, 2, 8],
        ...,
        [3, 2, 8],
        [3, 2, 8],
        [4, 2, 8]],

       ...,

       [[5, 2, 8],
        [5, 2, 8],
        [3, 2, 8],
        ...,
        [3, 2, 8],
        [3, 2, 8],
        [4, 2, 8]],

       [[5, 2, 8],
        [5, 2, 8],
        [3, 2, 8],
        ...,
        [3, 2, 8],
        [3, 2, 8],
        [4, 2, 8]],

       [[5, 2, 8],
        [5, 2, 8],
        [3, 2, 8],
        ...,
        [3, 2, 8],
        [3, 2, 8],
        [4, 2, 8]]])>

In [224]:
# Inspect the raw reactivity targets (displays the whole array).
y1

array([list([0.3297, 1.5693000000000001, 1.1227, 0.8686, 0.7217, 0.4384, 0.256, 0.33640000000000003, 0.21680000000000002, 0.3583, 0.9541000000000001, 1.4113, 1.6911, 1.2494, 1.1895, 0.6909000000000001, 0.4736, 0.1754, 0.0582, 0.21730000000000002, 0.0785, 0.8249000000000001, 0.7638, 0.1095, 0.25680000000000003, 0.08950000000000001, 0.15760000000000002, 0.7727, 0.1573, 0.5043, 1.0444, 0.4766, 0.5588000000000001, 0.9054000000000001, 1.0125, 1.0482, 1.044, 0.45220000000000005, 0.211, 0.0461, 0.082, 0.06430000000000001, 0.1526, 0.08940000000000001, 0.5081, 1.0745, 0.3215, 0.0716, 0.0244, 0.0123, 0.19840000000000002, 0.49610000000000004, 1.0641, 0.6394000000000001, 0.6789000000000001, 0.365, 0.1741, 0.1408, 0.1646, 0.5389, 0.683, 0.4273, 0.052700000000000004, 0.0693, 0.1398, 0.2937, 0.23620000000000002, 0.5731]),
       list([0.44820000000000004, 1.4822, 1.1819, 0.7434000000000001, 0.7148, 0.6529, 0.22390000000000002, 0.1927, 0.19690000000000002, 0.3033, 0.6176, 0.38580000000000003, 1.0418, 

In [228]:
# Display y_train as a tensor; the result is not assigned, so y_train itself is unchanged.
tf.convert_to_tensor(y_train)

<tf.Tensor: shape=(1887, 68, 3), dtype=float64, numpy=
array([[[0.3041, 0.3322, 0.4213],
        [1.7311, 1.6811, 1.3996],
        [1.2554, 0.8533, 1.1203],
        ...,
        [0.2271, 0.2281, 0.2264],
        [0.2261, 0.3028, 0.2245],
        [0.5701, 0.8259, 0.2462]],

       [[0.8709, 0.4808, 0.4952],
        [3.4296, 3.2291, 3.6226],
        [1.564 , 2.2518, 2.0154],
        ...,
        [0.5661, 0.5359, 0.9449],
        [0.9591, 0.3728, 0.4612],
        [0.182 , 1.0262, 0.8275]],

       [[0.7129, 0.8175, 0.3888],
        [1.9962, 4.2932, 3.0685],
        [0.733 , 1.0769, 1.0348],
        ...,
        [0.03  , 0.2056, 0.081 ],
        [0.0518, 0.7357, 0.481 ],
        [0.7724, 0.8423, 0.2484]],

       ...,

       [[1.1755, 0.7854, 0.7026],
        [1.6162, 5.2026, 2.6689],
        [1.385 , 1.1572, 1.051 ],
        ...,
        [0.0478, 0.3731, 0.101 ],
        [0.1235, 0.9291, 0.5529],
        [0.4677, 0.8325, 0.6064]],

       [[0.1603, 0.1387, 0.25  ],
        [0.4112, 0.742