In [1]:
import json

import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow.keras.layers as L
import tensorflow as tf
from sklearn.model_selection import train_test_split


In [2]:
tf.random.set_seed(2020)
np.random.seed(2020)

In [3]:
pred_cols = ['reactivity','deg_Mg_pH10','deg_Mg_50C','deg_pH10','deg_50C']

In [4]:
def MCRMSE(y_true,y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred),axis=1)
    return tf.reduce_mean(tf.sqrt(colwise_mse),axis=1)

In [5]:
def gru_layer(hidden_dim,dropout):
    return L.Bidirectional(L.GRU(hidden_dim,dropout=dropout,return_sequences=True,kernel_initializer='orthogonal'))

In [6]:
def build_model(embed_size,seq_len = 107,pred_len = 68,dropout = 0.5,sp_dropout = 0.2,embed_dim = 75,hidden_dim = 128,n_layers = 2):
    inputs = L.Input(shape=(seq_len,3))
    embed = L.Embedding(input_dim=embed_size,output_dim=embed_size)(inputs)

    reshaped = tf.reshape(embed,shape=(-1,embed.shape[1],embed.shape[2]*embed.shape[3]))
    hidden = L.SpatialDropout1D(sp_dropout)(reshaped)

    for x in range(n_layers):
        hidden = gru_layer(hidden_dim,dropout)(hidden)

    truncated = hidden[:,:pred_len]
    out = L.Dense(5,activation='linear')(truncated)

    model = tf.keras.Model(inputs=inputs,outputs=out)
    model.compile(tf.optimizers.Adam(),loss=MCRMSE)

    return model

In [7]:
def pandas_list_to_array(df):
    return np.transpose(np.array(df.values.tolist()),(0,2,1))

In [8]:
def preprocess_inputs(df,token2int,cols=['sequence','structure','predicted_loop_type']):
    return pandas_list_to_array(df[cols].applymap(lambda seq: [token2int[x] for x in seq]))

In [9]:
data_dir = './covid_data/'
train = pd.read_json(data_dir + 'train.json',lines=True)
test = pd.read_json(data_dir + 'test.json',lines=True)
sample_df = pd.read_csv(data_dir+'sample_submission.csv')

In [10]:
token2int = {x:i for i,x in enumerate('().ACGUBEHIMSX')}

train_inputs = preprocess_inputs(train,token2int)
train_labels = pandas_list_to_array(train[pred_cols])

In [11]:
x_train, x_val, y_train , y_val = train_test_split(train_inputs,train_labels,test_size=.1,random_state=34)

In [12]:
public_df = test.query("seq_length==107")
private_df = test.query("seq_length==130")

public_inputs = preprocess_inputs(public_df,token2int)
private_inputs = preprocess_inputs(private_df,token2int)

In [13]:
model = build_model(embed_size=len(token2int))
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 107, 3)]          0         
_________________________________________________________________
embedding (Embedding)        (None, 107, 3, 14)        196       
_________________________________________________________________
tf_op_layer_Reshape (TensorF [(None, 107, 42)]         0         
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 107, 42)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 107, 256)          132096    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 107, 256)          296448    
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 68, 256)]         0     

In [None]:
history = model.fit(x_train,y_train,validation_data=(x_val,y_val),batch_size=64,epochs=100,verbose=2,callbacks=[tf.keras.callbacks.ReduceLROnPlateau(patience=5),tf.keras.callbacks.ModelCheckpoint('model.h5')])

Train on 2160 samples, validate on 240 samples
Epoch 1/100
2160/2160 - 12s - loss: 0.6382 - val_loss: 0.5684
Epoch 2/100
2160/2160 - 2s - loss: 0.5722 - val_loss: 0.5252
Epoch 3/100
2160/2160 - 2s - loss: 0.5506 - val_loss: 0.5133
Epoch 4/100
2160/2160 - 2s - loss: 0.5364 - val_loss: 0.4935
Epoch 5/100
2160/2160 - 2s - loss: 0.5246 - val_loss: 0.4822
Epoch 6/100
2160/2160 - 2s - loss: 0.5140 - val_loss: 0.4767
Epoch 7/100
2160/2160 - 2s - loss: 0.5083 - val_loss: 0.4711
Epoch 8/100
2160/2160 - 2s - loss: 0.5055 - val_loss: 0.4694
Epoch 9/100


In [None]:
fig = px.line(
    history.history,y=['loss','val_loss'],
    labels={'index':'epoch','value':'MCRMSE'},
    title = 'Training History')
fig.show()

In [None]:
model_public = build_model(seq_len=107,pred_len=107,embed_size=len(token2int))
model_private = build_model(seq_len=130,pred_len=130,embed_size=len(token2int))

model_public.load_weights('model.h5')
model_private.load_weights('model.h5')

In [None]:
public_preds = model_public.predict(public_inputs)
private_preds = model_private.predict(private_inputs)

pred_ls = []
for df,preds in [(public_df,public_preds),(private_df,private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred,columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        pred_ls.append(single_df)
preds_df = pd.concat(pred_ls)
preds_df.head()
submission = sample_df[['id_seqpos']].merge(preds_df,on=['id_seqpos'])
submission.to_csv('submission.csv',index=False)
