In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

import time
from datetime import timedelta

# Data

In [None]:
train_df = pd.read_json('train.json',  lines = True).drop(['index'], axis = 1)
test_df = pd.read_json('test.json', lines = True).drop(['index'], axis = 1)

def cut_short(seq, maxlen = 107):
    return seq[:maxlen]

    
test_df[['sequence']]=test_df['sequence'].apply(cut_short)
test_df[['structure']]=test_df['structure'].apply(cut_short)
test_df[['predicted_loop_type']]=test_df['predicted_loop_type'].apply(cut_short)


In [3]:
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSXZ')}

target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']

def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
    return np.transpose(
        np.array(
            df[cols]
            .applymap(lambda seq: [token2int[x] for x in seq])
            .values
            .tolist()
        ),
        (0, 2, 1)
    )

train_inputs = preprocess_inputs(train_df.loc[train_df.SN_filter == 1])
train_labels = np.array(train_df.loc[train_df.SN_filter == 1][target_cols].values.tolist()).transpose((0, 2, 1))

In [4]:
from sklearn.model_selection import train_test_split

seed = 123
X_train, X_test, Y_train, Y_test = train_test_split(train_inputs, train_labels, test_size = 0.1, random_state = seed)

print(np.shape(X_train)) #1430 samples de séquences de 107 de long et de 3=sequence,structure, predicted loop type

(1430, 107, 3)


# Model

In [167]:
def MCRMSE(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

In [169]:
import tensorflow.keras.layers as L
def gru_layer(hidden_dim, dropout):
    return L.Bidirectional(L.GRU(hidden_dim, dropout=dropout, return_sequences=True))


def build_model(seq_len=107, pred_len=68, dropout=0.5, embed_dim=75, hidden_dim=128):
    inputs = L.Input(shape=(seq_len, 3))

    embed = L.Embedding(input_dim=len(token2int), output_dim=embed_dim)(inputs)
    reshaped = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3]))

    hidden = gru_layer(hidden_dim, dropout)(reshaped)
    hidden = gru_layer(hidden_dim, dropout)(hidden)
    hidden = gru_layer(hidden_dim, dropout)(hidden)
    
    # Since we are only making predictions on the first part of each sequence, we have
    # to truncate it
    truncated = hidden[:, :pred_len]
    
    out = L.Dense(5, activation='linear')(truncated)

    model = tf.keras.Model(inputs=inputs, outputs=out)

    model.compile(tf.keras.optimizers.Adam(), loss=MCRMSE,  metrics=['accuracy'])
    
    return model

In [170]:
def prepare_window_training(base_index, n_bases,X_train, Y_train, X_test, Y_test):
   
    start_index=int(i-(window_size-1)/2)
    stop_index=int(i+(window_size-1)/2 +1) #not included
    full_array=[]
    for sample_index in range(len(X_train)):
        sample_array=X_train[sample_index]

        if start_index<0:
            out_array=np.array([[14]*len(sample_array[0])]*(-int(start_index)))
            studied_cut_array=sample_array[0:stop_index]
            studied_array=np.concatenate((out_array,studied_cut_array))

        elif stop_index>68:
            out_array=np.array([[14]*len(sample_array[0])]*(int(stop_index-68)))
            studied_cut_array=sample_array[start_index:68]
            studied_array=np.concatenate((studied_cut_array,out_array))

        else :
            studied_array=sample_array[start_index:stop_index]
        full_array.append(studied_array)
    
    full_array_test=[]
    for sample_index in range(len(X_test)):
        sample_array=X_test[sample_index]

        if start_index<0:
            out_array=np.array([[14]*len(sample_array[0])]*(-int(start_index)))
            studied_cut_array=sample_array[0:stop_index]
            studied_array=np.concatenate((out_array,studied_cut_array))

        elif stop_index>68:
            out_array=np.array([[14]*len(sample_array[0])]*(int(stop_index-68)))
            studied_cut_array=sample_array[start_index:68]
            studied_array=np.concatenate((studied_cut_array,out_array))

        else :
            studied_array=sample_array[start_index:stop_index]
        full_array_test.append(studied_array)

    X_train_window=np.array(full_array)
    Y_train_window=Y_train[:,i,:]
    X_test_window=np.array(full_array_test)
    Y_test_window=Y_test[:,i,:]
    
#     print('SHAPE', np.shape(X_train_window))

    return X_train_window, Y_train_window, X_test_window, Y_test_window

# Window mode


In [171]:
def prepare_window_prediction(base_index, n_bases, X_to_predict):
   
    start_index=int(i-(window_size-1)/2)
    stop_index=int(i+(window_size-1)/2 +1) #not included
    full_array=[]
    for sample_index in range(len(X_to_predict)):
        sample_array=X_to_predict[sample_index]

        if start_index<0:
            out_array=np.array([[14]*len(sample_array[0])]*(-int(start_index)))
            studied_cut_array=sample_array[0:stop_index]
            studied_array=np.concatenate((out_array,studied_cut_array))

        elif stop_index>68:
            out_array=np.array([[14]*len(sample_array[0])]*(int(stop_index-68)))
            studied_cut_array=sample_array[start_index:68]
            studied_array=np.concatenate((studied_cut_array,out_array))

        else :
            studied_array=sample_array[start_index:stop_index]
        full_array.append(studied_array)
#     print('SHAPE', np.shape(full_array))
    return np.array(full_array)

In [None]:
#mode fenêtré:
window_size=21 #doit être un nombre impair
X_train=X_train[:,:68,:]
X_test=X_test[:,:68,:]

X_to_predict = preprocess_inputs(test_df)[:,:68,:]
Y_predicted = np.empty(shape=(len(X_to_predict), 68, 5))
print(len(X_to_predict))

lines = []
start_time = time.monotonic()

# for i in [0,1]:
for i in range(68): #on parcourt toute la séquence
    print('Training for base', i, '=============================================')

    X_train_window, Y_train_window, X_test_window, Y_test_window = prepare_window_training(i, 68, 
                                                                                   X_train, Y_train, 
                                                                                   X_test, Y_test)
    #ici il faut faire le fit sur le full_array de dimensions 2160*21*14=nombredeséquences*window_size*nombredeparamètresd'input
    #ensuite on lance le modèle sur les vraies données, 
    #et on stocke le résultat [a,b,c] dans l'item i une liste du type [[1, 0, 0], [0, 1, 0.3], [0.4, 0.7, 1], ...], de dimensions 68*3
    
    model = build_model(seq_len=21, embed_dim=len(token2int))
    model.summary()
    Y_train_window=np.array([np.tile(Y_train_window[i],(21,1)) for i in range(len(Y_train_window))])
    Y_test_window=np.array([np.tile(Y_test_window[i],(21,1)) for i in range(len(Y_test_window))])

    print(np.shape(X_train_window))
    print(np.shape(Y_train_window))
    print(np.shape(X_test_window))
    print(np.shape(Y_test_window))
    
    history = model.fit(
    X_train_window, 
    Y_train_window,
    validation_data=(X_test_window, Y_test_window),
    batch_size=64,
    epochs=5,
    verbose=2,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5')
    ]
    )
    
    lines += plt.plot(history.history['loss'], label='loss {0}'.format(i))
    lines += plt.plot(history.history['val_loss'], label='val_loss {0}'.format(i))

    
    print('Evaluation for base', i, '=============================================')

    X_predic=prepare_window_prediction(i,68,X_to_predict)

    base_predict=model.predict(X_predic)
#     print(base_predict)
#     print(np.shape(base_predict))
    for seq in range(len(base_predict)):
        if seq%100==0:
            print(seq)
        mean_predict=np.mean(base_predict[seq], axis=0)
        Y_predicted[seq, i, :]=mean_predict
#         print(mean_predict)

end_time = time.monotonic()
print(timedelta(seconds=end_time - start_time))

labels = [l.get_label() for l in lines]
plt.legend(lines, labels)
plt.show()
#     break
    

3634
Model: "functional_183"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_93 (InputLayer)        [(None, 21, 3)]           0         
_________________________________________________________________
embedding_92 (Embedding)     (None, 21, 3, 15)         225       
_________________________________________________________________
tf_op_layer_Reshape_91 (Tens [(None, 21, 45)]          0         
_________________________________________________________________
bidirectional_273 (Bidirecti (None, 21, 256)           134400    
_________________________________________________________________
bidirectional_274 (Bidirecti (None, 21, 256)           296448    
_________________________________________________________________
bidirectional_275 (Bidirecti (None, 21, 256)           296448    
_________________________________________________________________
tf_op_layer_strided_slice_91 [(None, 21, 256)] 

23/23 - 38s - loss: 0.4288 - accuracy: 0.2422 - val_loss: 0.3046 - val_accuracy: 0.3220
Epoch 2/5
23/23 - 4s - loss: 0.3163 - accuracy: 0.3089 - val_loss: 0.2837 - val_accuracy: 0.2767
Epoch 3/5
23/23 - 4s - loss: 0.3033 - accuracy: 0.2985 - val_loss: 0.2748 - val_accuracy: 0.3270
Epoch 4/5
23/23 - 4s - loss: 0.3051 - accuracy: 0.2884 - val_loss: 0.2743 - val_accuracy: 0.3393
Epoch 5/5
23/23 - 4s - loss: 0.2952 - accuracy: 0.3004 - val_loss: 0.2756 - val_accuracy: 0.2767
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
Model: "functional_191"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_97 (InputLayer)        [(None, 21, 3)]           0         
_________________________________________________________________
embedding_96 (Embedding)     (None, 21, 3, 15)         225     

23/23 - 8s - loss: 0.2957 - accuracy: 0.2908 - val_loss: 0.2850 - val_accuracy: 0.2767
Epoch 2/5
23/23 - 4s - loss: 0.2690 - accuracy: 0.3318 - val_loss: 0.2822 - val_accuracy: 0.3160
Epoch 3/5
23/23 - 4s - loss: 0.2639 - accuracy: 0.3233 - val_loss: 0.2676 - val_accuracy: 0.3408
Epoch 4/5
23/23 - 4s - loss: 0.2613 - accuracy: 0.3371 - val_loss: 0.2717 - val_accuracy: 0.3294
Epoch 5/5
23/23 - 4s - loss: 0.2567 - accuracy: 0.3254 - val_loss: 0.2540 - val_accuracy: 0.3273
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
Model: "functional_199"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_101 (InputLayer)       [(None, 21, 3)]           0         
_________________________________________________________________
embedding_100 (Embedding)    (None, 21, 3, 15)         225      

23/23 - 8s - loss: 0.2561 - accuracy: 0.2499 - val_loss: 0.2509 - val_accuracy: 0.2618
Epoch 2/5
23/23 - 4s - loss: 0.2398 - accuracy: 0.2982 - val_loss: 0.2449 - val_accuracy: 0.2863
Epoch 3/5
23/23 - 4s - loss: 0.2350 - accuracy: 0.3193 - val_loss: 0.2428 - val_accuracy: 0.3297
Epoch 4/5
23/23 - 4s - loss: 0.2346 - accuracy: 0.3297 - val_loss: 0.2428 - val_accuracy: 0.2689
Epoch 5/5
23/23 - 4s - loss: 0.2300 - accuracy: 0.3253 - val_loss: 0.2415 - val_accuracy: 0.3258
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
Model: "functional_207"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_105 (InputLayer)       [(None, 21, 3)]           0         
_________________________________________________________________
embedding_104 (Embedding)    (None, 21, 3, 15)         225      

23/23 - 20s - loss: 0.2909 - accuracy: 0.2711 - val_loss: 0.2944 - val_accuracy: 0.2893
Epoch 2/5
23/23 - 4s - loss: 0.2717 - accuracy: 0.2808 - val_loss: 0.2897 - val_accuracy: 0.2839
Epoch 3/5
23/23 - 4s - loss: 0.2659 - accuracy: 0.3104 - val_loss: 0.2744 - val_accuracy: 0.2938
Epoch 4/5
23/23 - 4s - loss: 0.2604 - accuracy: 0.3107 - val_loss: 0.2698 - val_accuracy: 0.3139
Epoch 5/5
23/23 - 4s - loss: 0.2597 - accuracy: 0.3101 - val_loss: 0.2723 - val_accuracy: 0.3052
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
Model: "functional_215"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_109 (InputLayer)       [(None, 21, 3)]           0         
_________________________________________________________________
embedding_108 (Embedding)    (None, 21, 3, 15)         225     

23/23 - 8s - loss: 0.3173 - accuracy: 0.2907 - val_loss: 0.3297 - val_accuracy: 0.3699
Epoch 2/5
23/23 - 4s - loss: 0.3020 - accuracy: 0.2850 - val_loss: 0.3130 - val_accuracy: 0.2052
Epoch 3/5
23/23 - 4s - loss: 0.2959 - accuracy: 0.2881 - val_loss: 0.3147 - val_accuracy: 0.3833
Epoch 4/5
23/23 - 4s - loss: 0.2943 - accuracy: 0.3113 - val_loss: 0.3053 - val_accuracy: 0.2016
Epoch 5/5
23/23 - 4s - loss: 0.2917 - accuracy: 0.2843 - val_loss: 0.3073 - val_accuracy: 0.3687
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
Model: "functional_223"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_113 (InputLayer)       [(None, 21, 3)]           0         
_________________________________________________________________
embedding_112 (Embedding)    (None, 21, 3, 15)         225      

Epoch 1/5
23/23 - 23s - loss: 0.2722 - accuracy: 0.2799 - val_loss: 0.2584 - val_accuracy: 0.2240
Epoch 2/5
23/23 - 6s - loss: 0.2541 - accuracy: 0.2831 - val_loss: 0.2521 - val_accuracy: 0.3360
Epoch 3/5
23/23 - 5s - loss: 0.2480 - accuracy: 0.3078 - val_loss: 0.2468 - val_accuracy: 0.3264
Epoch 4/5
23/23 - 4s - loss: 0.2437 - accuracy: 0.3131 - val_loss: 0.2390 - val_accuracy: 0.3678
Epoch 5/5
23/23 - 4s - loss: 0.2420 - accuracy: 0.3063 - val_loss: 0.2471 - val_accuracy: 0.3363
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600


Dans la préparation des données, (mode fenêtré), au lieu de mettre des -1 (ou des 14 en fait) à la fin de la séquence on peut mettre les vraies bases et structures

# Prédiction

In [191]:
np.shape(Y_predicted)

(3634, 68, 5)

In [192]:
Y_predicted[0]

array([[0.60537148, 0.65606326, 2.10306787, 0.50557548, 0.66570354],
       [1.89880753, 2.59474087, 3.34839678, 2.58118153, 2.19244337],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.