In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import tensorflow.keras.layers as L
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
tf.random.set_seed(2020)
np.random.seed(2020)

In [3]:
# Jeu de données initial 
train = pd.read_json("train.json", lines = True)
test = pd.read_json("test.json", lines = True)

In [4]:
train = train.query("signal_to_noise >= 1")

In [5]:
def divide_set(df, list_column):
    """
    Fonction qui crée d'une fenetre de lecture de 68 bases. 
        :Parameters:
            df = un data frame train
            list_column = numero de la colonne du data frame 
        :Return:
            deux data frame :
                first68_train = data frame contenant les 68 premieres bases et annotations de structure
                last68_train = data frame contenant les 68 dernieres bases et annotations de structure
    """
    first68 = df.copy()
    last68 = df.copy()
    for a in list_column :
        for i, r in enumerate(df.iloc[:,a]):
            first68.iloc[i,a] = r[0:68]
            last68.iloc[i,a] = r[len(r)-68:]
    return first68, last68

In [6]:
first68_train, last68_train = divide_set(train, [2,3,4])
first68_test, last68_test = divide_set(test, [2,3,4])

In [7]:
def one_hot_encoding(df, name_colonne):
    """
    Fonction pour encoder les sequences, structures et type boucles : One hot encoding
        :Parameters:
            df = data frame : train et test de départ
            name_colonne = le nom de la colonne du df 
        :Return:
            un tableau
    """
    frst_lst = []
    for r in df[name_colonne]:
        for i in range(len(r)):
            if r[i] not in frst_lst:
                frst_lst.append(r[i])
    dico = {}
    ar = np.zeros(shape=(1,len(frst_lst)),dtype=int)
    for i, l in enumerate(frst_lst):
        ar2 = ar.copy()
        ar2[0][i]=1
        dico[l]=ar2
    scnd_lst = []
    for r in df[name_colonne]:
        lst = [] 
        for i in range(len(r)):
            #print(dico[r[i]], r[i])
            lst.append(dico[r[i]])
        scnd_lst.append(lst)
    return np.array(scnd_lst)

In [8]:
tab = one_hot_encoding(first68_train, 'sequence')
tab2 = one_hot_encoding(first68_train, 'structure')
tab3 = one_hot_encoding(first68_train, 'predicted_loop_type')

first68_train["One_hot seq"] = tab.tolist() # ajout des colonnes encoding a notre DF de départ
first68_train["One hot strucure"] = tab2.tolist()
first68_train["encoding predicted loop"] = tab3.tolist()

############################

t = one_hot_encoding(last68_train, 'sequence')
t2 = one_hot_encoding(last68_train, 'structure')
t3 = one_hot_encoding(last68_train, 'predicted_loop_type')

last68_train["One_hot seq"] = t.tolist() # ajout des colonnes encoding a notre DF de départ
last68_train["One hot strucure"] = t2.tolist()
last68_train["encoding predicted loop"] = t3.tolist()

###########################################################################

tt = one_hot_encoding(first68_test, 'sequence')
tt2 = one_hot_encoding(first68_test, 'structure')
tt3 = one_hot_encoding(first68_test, 'predicted_loop_type')

first68_test["One_hot seq"] = tt.tolist() # ajout des colonnes encoding a notre DF de départ
first68_test["One hot strucure"] = tt2.tolist()
first68_test["encoding predicted loop"] = tt3.tolist()

############################

ttt = one_hot_encoding(last68_test, 'sequence')
ttt2 = one_hot_encoding(last68_test, 'structure')
ttt3 = one_hot_encoding(last68_test, 'predicted_loop_type')

last68_test["One_hot seq"] = ttt.tolist() # ajout des colonnes encoding a notre DF de départ
last68_test["One hot strucure"] = ttt2.tolist()
last68_test["encoding predicted loop"] = ttt3.tolist()

In [9]:
def right_format(df, colname):
    """
    juste pck j'crois que y'avait des liste dans des listes
    j'ai transformé en liste d'array
    """
    x1 = df[colname].copy()
    x2 = []
    for r in x1:
        lst = []
        for u in r:
            lst.append(np.array(u[0], float))
        lst = np.array(lst)
        x2.append(lst)

    return x2

In [10]:
x_seq = right_format(first68_train, "One_hot seq")
x_strct = right_format(first68_train, "One hot strucure")
x_loop = right_format(first68_train, "encoding predicted loop")

x_seq = np.array(x_seq)
x_strct = np.array(x_strct)
x_loop = np.array(x_loop)
x_all = np.concatenate((x_seq, x_strct,x_loop), axis=2)

############################

x_seq2 = right_format(last68_train, "One_hot seq")
x_strct2 = right_format(last68_train, "One hot strucure")
x_loop2 = right_format(last68_train, "encoding predicted loop")

x_seq2 = np.array(x_seq2)
x_strct2 = np.array(x_strct2)
x_loop2 = np.array(x_loop2)
x_all2 = np.concatenate((x_seq2, x_strct2,x_loop2), axis=2)

###########################################################################

x_seqt = right_format(first68_test, "One_hot seq")
x_strctt = right_format(first68_test, "One hot strucure")
x_loopt = right_format(first68_test, "encoding predicted loop")

x_seqt = np.array(x_seqt)
x_strctt = np.array(x_strctt)
x_loopt = np.array(x_loopt)
x_allt = np.concatenate((x_seqt, x_strctt,x_loopt), axis=2)

############################

x_seqt2 = right_format(last68_test, "One_hot seq")
x_strctt2 = right_format(last68_test, "One hot strucure")
x_loopt2 = right_format(last68_test, "encoding predicted loop")

x_seqt2 = np.array(x_seqt2)
x_strctt2 = np.array(x_strctt2)
x_loopt2 = np.array(x_loopt2)
x_allt2 = np.concatenate((x_seqt2, x_strctt2,x_loopt2), axis=2)

In [11]:
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']

In [12]:
y_true = tf.random.normal((32, 68, 3))
y_pred = tf.random.normal((32, 68, 3))

In [13]:
def MCRMSE(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    re = tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)
    print(re)
    return re

In [21]:
def gru_layer(hidden_dim, dropout):
    return L.Bidirectional(L.GRU(
        hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))

def build_model(embed_size, seq_len=68, pred_len=68, dropout=0.5, 
                sp_dropout=0.2, embed_dim=200, hidden_dim=256, n_layers=3):
    inputs = L.Input(shape=(seq_len, 14))# 107 longueur sequence - 4 ATGU
    embed = L.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)

    reshaped = tf.reshape(
        embed, shape=(-1, 68,  embed.shape[2] * embed.shape[3])
    )
    hidden = L.SpatialDropout1D(sp_dropout)(reshaped)
    
    for x in range(n_layers):
        print(x,"layer")
        hidden = gru_layer(hidden_dim, dropout)(hidden)
    
    # Since we are only making predictions on the first part of each sequence, 
    # we have to truncate it
    truncated = hidden[:, :pred_len]
    out = L.Dense(3, activation='relu')(truncated)
    
    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.keras.optimizers.Adam(), loss=MCRMSE)
    
    return model

In [15]:
def pandas_list_to_array(df):
    """
    Input: dataframe of shape (x, y), containing list of length l
    Return: np.array of shape (x, l, y)
    """
    
    return np.transpose(
        np.array(df.values.tolist()),
        (0, 2, 1)
    )

In [16]:
train_labels = pandas_list_to_array(train[pred_cols])

In [17]:
x_train, x_val, y_train, y_val = train_test_split(
    x_all, train_labels, test_size=.1, random_state=34, stratify=train.SN_filter)

print(x_train.shape)
print(y_train.shape)

(1887, 68, 14)
(1887, 68, 3)


In [22]:
model = build_model(embed_size=14)
model.summary()

0 layer
Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 68, 14)]          0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 68, 14, 200)       2800      
_________________________________________________________________
tf_op_layer_Reshape_1 (Tenso [(None, 68, 2800)]        0         
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 68, 2800)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 68, 512)           4697088   
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 68, 512)]         0         
_________________________________________________________________
dense_1 (Dense)              (None, 68, 3)    

In [23]:
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=60,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5')
    ]
)

Tensor("MCRMSE/Mean_1:0", shape=(None,), dtype=float32)
Tensor("MCRMSE/Mean_1:0", shape=(None,), dtype=float32)


In [None]:
import plotly.express as px
fig = px.line(
    history.history, y=['loss', 'accuracy', 'val_loss', 'val_accuracy'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='Training History')
fig.show()

In [24]:
model.load_weights('model.h5')
train1_preds = model.predict(x_all)
train2_preds = model.predict(x_all2)
test1_preds = model.predict(x_allt)
test2_preds = model.predict(x_allt2)

In [29]:
react_lst = []
mgph_lst = []
mg50_lst = []

for i in range(len(train1_preds)):
    lst1 = []
    lst2 = []
    lst3 = []
    for j in range(len(train1_preds[i])):
        lst1.append(train1_preds[i][j][0])
        lst2.append(train1_preds[i][j][1])
        lst3.append(train1_preds[i][j][2])
    for j in range(29,68):
        lst1.append(train2_preds[i][j][0])
        lst2.append(train2_preds[i][j][1])
        lst3.append(train2_preds[i][j][2])
    react_lst.append(lst1)
    mgph_lst.append(lst2)
    mg50_lst.append(lst3)

id_lst = train['id'].tolist()

#######################

react_lstt = []
mgph_lstt = []
mg50_lstt = []

for i in range(len(test1_preds)):
    lstt1 = []
    lstt2 = []
    lstt3 = []
    for j in range(len(test1_preds[i])):
        lstt1.append(test1_preds[i][j][0])
        lstt2.append(test1_preds[i][j][1])
        lstt3.append(test1_preds[i][j][2])
    for j in range(29,68):
        lstt1.append(test2_preds[i][j][0])
        lstt2.append(test2_preds[i][j][1])
        lstt3.append(test2_preds[i][j][2])
    react_lstt.append(lstt1)
    mgph_lstt.append(lstt2)
    mg50_lstt.append(lstt3)

id_lstt = test['id'].tolist()

zero_lst = [0.]*len(train1_preds)
zero_lstt = [0.]*len(test1_preds)

In [43]:
train_res = pd.DataFrame({'id' : id_lst, 'reactivity' : react_lst, 'deg_Mg_pH10' : mgph_lst, 
                          'deg_Mg_50C' : mg50_lst, 'deg_pH10' : zero_lst, 'deg_50C' : zero_lst})

test_res = pd.DataFrame({'id' : id_lstt, 'reactivity' : react_lstt, 'deg_Mg_pH10' : mgph_lstt, 
                         'deg_Mg_50C' : mg50_lstt, 'deg_pH10' : zero_lstt, 'deg_50C' : zero_lstt})


In [37]:
train_res

Unnamed: 0,id,reactivity,deg_Mg_pH10,deg_Mg_50C,deg_pH10,deg_50C
0,id_001f94081,"[0.7914494, 0.9926132, 0.9954111, 0.8704676, 0...","[0.80218875, 0.9671839, 0.8765932, 0.7789023, ...","[0.9174599, 0.9651952, 0.7893513, 0.73246336, ...",0.0,0.0
1,id_006f36f57,"[0.77885526, 0.9709615, 0.9588372, 0.8087618, ...","[0.792704, 0.9515224, 0.8530046, 0.7465116, 0....","[0.9167105, 0.96395874, 0.7881207, 0.73206407,...",0.0,0.0
2,id_00ab2d761,"[0.7677532, 0.9542286, 0.9343871, 0.77436537, ...","[0.7849598, 0.9398305, 0.83606505, 0.7222327, ...","[0.9167643, 0.9648242, 0.7919145, 0.7445236, 0...",0.0,0.0
3,id_00abef1d7,"[0.7392228, 0.9265516, 0.9237518, 0.82489926, ...","[0.76747924, 0.927589, 0.844452, 0.7870017, 0....","[0.9071127, 0.95227855, 0.7810653, 0.7567013, ...",0.0,0.0
4,id_00b436dec,"[0.7998315, 1.0050045, 1.0128076, 0.89332956, ...","[0.80760515, 0.97626346, 0.8922308, 0.8077295,...","[0.916743, 0.9637152, 0.7860814, 0.7252742, 0....",0.0,0.0
...,...,...,...,...,...,...
2092,id_ff13729b0,"[0.72464055, 0.9058878, 0.8958906, 0.79031736,...","[0.7578562, 0.9153019, 0.8317756, 0.781224, 0....","[0.9037082, 0.9466477, 0.7722059, 0.74358946, ...",0.0,0.0
2093,id_ff84602f7,"[0.74173415, 0.9307721, 0.9308467, 0.83717006,...","[0.7699528, 0.9320326, 0.85222733, 0.8007089, ...","[0.90707886, 0.95236945, 0.78136045, 0.7577369...",0.0,0.0
2094,id_ff85fcdba,"[0.7235519, 0.9029911, 0.8891008, 0.7746555, 0...","[0.75688523, 0.9115791, 0.82116, 0.75359464, 0...","[0.90555525, 0.9497053, 0.77703804, 0.7501864,...",0.0,0.0
2095,id_ffe06f3fe,"[0.78114325, 0.9737403, 0.96095294, 0.8059943,...","[0.7937949, 0.9522316, 0.8517156, 0.7388145, 0...","[0.9171803, 0.9642705, 0.78707147, 0.72545433,...",0.0,0.0


In [44]:
test_res

Unnamed: 0,id,reactivity,deg_Mg_pH10,deg_Mg_50C,deg_pH10,deg_50C
0,id_00073f8be,"[0.7709632, 0.9688732, 0.9750643, 0.874853, 0....","[0.7895822, 0.95550805, 0.873969, 0.8054149, 0...","[0.91353846, 0.96178615, 0.7927722, 0.7647296,...",0.0,0.0
1,id_000ae4237,"[0.7833213, 0.97840863, 0.97092307, 0.8280398,...","[0.7964117, 0.9580894, 0.86406654, 0.76466906,...","[0.91634893, 0.9632615, 0.7863801, 0.727875, 0...",0.0,0.0
2,id_00131c573,"[0.7308323, 0.9159655, 0.91225994, 0.8172247, ...","[0.7621912, 0.92172897, 0.84018093, 0.79001486...","[0.9048219, 0.94861835, 0.775523, 0.7491968, 0...",0.0,0.0
3,id_00181fd34,"[0.75715154, 0.9444402, 0.9328375, 0.80224705,...","[0.77909327, 0.9371185, 0.84408957, 0.7593235,...","[0.91358876, 0.9617897, 0.79363173, 0.7679648,...",0.0,0.0
4,id_0020473f7,"[0.79984295, 1.0062283, 1.0171425, 0.9055023, ...","[0.8076334, 0.9778144, 0.89773864, 0.82305145,...","[0.9151533, 0.96108943, 0.7818727, 0.7189819, ...",0.0,0.0
...,...,...,...,...,...,...
3629,id_ff691b7e5,"[0.74462533, 0.9323901, 0.92784786, 0.8214444,...","[0.7708248, 0.9318121, 0.85007495, 0.7958889, ...","[0.9073545, 0.9524147, 0.7807692, 0.7549485, 0...",0.0,0.0
3630,id_ff9bf3581,"[0.7490961, 0.93567127, 0.9258003, 0.80331415,...","[0.7733842, 0.93177766, 0.8421578, 0.7676159, ...","[0.9103819, 0.95691067, 0.7865818, 0.7591365, ...",0.0,0.0
3631,id_ffc8f96a8,"[0.78497344, 0.9805221, 0.9734564, 0.8308056, ...","[0.7973277, 0.9583359, 0.8619314, 0.7561568, 0...","[0.9180232, 0.9661825, 0.7916662, 0.7378377, 0...",0.0,0.0
3632,id_ffd7e8cc1,"[0.7219948, 0.89892596, 0.88000137, 0.7555619,...","[0.7550838, 0.90897846, 0.81908995, 0.7566986,...","[0.9040456, 0.94705355, 0.7727209, 0.74349064,...",0.0,0.0


In [45]:
frames = [train_res, test_res]

result = pd.concat(frames)

In [46]:
result

Unnamed: 0,id,reactivity,deg_Mg_pH10,deg_Mg_50C,deg_pH10,deg_50C
0,id_001f94081,"[0.7914494, 0.9926132, 0.9954111, 0.8704676, 0...","[0.80218875, 0.9671839, 0.8765932, 0.7789023, ...","[0.9174599, 0.9651952, 0.7893513, 0.73246336, ...",0.0,0.0
1,id_006f36f57,"[0.77885526, 0.9709615, 0.9588372, 0.8087618, ...","[0.792704, 0.9515224, 0.8530046, 0.7465116, 0....","[0.9167105, 0.96395874, 0.7881207, 0.73206407,...",0.0,0.0
2,id_00ab2d761,"[0.7677532, 0.9542286, 0.9343871, 0.77436537, ...","[0.7849598, 0.9398305, 0.83606505, 0.7222327, ...","[0.9167643, 0.9648242, 0.7919145, 0.7445236, 0...",0.0,0.0
3,id_00abef1d7,"[0.7392228, 0.9265516, 0.9237518, 0.82489926, ...","[0.76747924, 0.927589, 0.844452, 0.7870017, 0....","[0.9071127, 0.95227855, 0.7810653, 0.7567013, ...",0.0,0.0
4,id_00b436dec,"[0.7998315, 1.0050045, 1.0128076, 0.89332956, ...","[0.80760515, 0.97626346, 0.8922308, 0.8077295,...","[0.916743, 0.9637152, 0.7860814, 0.7252742, 0....",0.0,0.0
...,...,...,...,...,...,...
3629,id_ff691b7e5,"[0.74462533, 0.9323901, 0.92784786, 0.8214444,...","[0.7708248, 0.9318121, 0.85007495, 0.7958889, ...","[0.9073545, 0.9524147, 0.7807692, 0.7549485, 0...",0.0,0.0
3630,id_ff9bf3581,"[0.7490961, 0.93567127, 0.9258003, 0.80331415,...","[0.7733842, 0.93177766, 0.8421578, 0.7676159, ...","[0.9103819, 0.95691067, 0.7865818, 0.7591365, ...",0.0,0.0
3631,id_ffc8f96a8,"[0.78497344, 0.9805221, 0.9734564, 0.8308056, ...","[0.7973277, 0.9583359, 0.8619314, 0.7561568, 0...","[0.9180232, 0.9661825, 0.7916662, 0.7378377, 0...",0.0,0.0
3632,id_ffd7e8cc1,"[0.7219948, 0.89892596, 0.88000137, 0.7555619,...","[0.7550838, 0.90897846, 0.81908995, 0.7566986,...","[0.9040456, 0.94705355, 0.7727209, 0.74349064,...",0.0,0.0


In [50]:
result.to_csv('result.csv', index=False)