In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import math
from collections import Counter
from copy import deepcopy
from math import ceil

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from torch.autograd import Variable
from torch.utils.data import TensorDataset, RandomSampler, DataLoader
from tqdm import tqdm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**IMPORT DATA**

In [None]:
# Both files are read as JSON Lines (one record per line).
train_data, test_data = (
    pd.read_json(json_path, lines=True)
    for json_path in (
        '../input/stanford-covid-vaccine/train.json',
        '../input/stanford-covid-vaccine/test.json',
    )
)

**SEPARATE THE PUBLIC AND PRIVATE TEST DATASETS BASED ON THE SEQUENCE LENGTH (NOT YET)**

In [None]:
# Split the test set by sequence length: rows with seq_length == 107 go to the
# public frame, longer ones to the private frame.
# `.copy()` makes each frame independent of `test_data`; later cells add
# columns to both, which is ambiguous (SettingWithCopyWarning) on a bare slice.
public_test_data = test_data[test_data['seq_length'] == 107].copy()
private_test_data = test_data[test_data['seq_length'] > 107].copy()

**GET THE FILTERED TRAINING DATA (NOT YET)**

In [None]:
# Keep only training rows that pass SN_filter. The original index labels are
# kept on purpose (later cells look rows up by label); `.copy()` detaches the
# result so that adding columns later is unambiguous.
train_data = train_data[train_data['SN_filter'] == 1].copy()

train_bpps_ids = list(train_data['id'])
private_test_bpps_ids = list(private_test_data['id'])
public_test_bpps_ids = list(public_test_data['id'])
test_bpps_ids = list(public_test_data['id'])

# One .npy matrix per sample id.
path = '/kaggle/input/stanford-covid-vaccine/bpps/'
train_bpps = np.array([np.load(path + i + '.npy') for i in train_bpps_ids])
private_test_bpps = np.array([np.load(path + i + '.npy') for i in private_test_bpps_ids])
public_test_bpps = np.array([np.load(path + i + '.npy') for i in public_test_bpps_ids])

# Zero-pad the last axis of the train / public matrices to 130 columns.
# The filled width is taken from the loaded arrays instead of a literal 107.
train_bpps_pad = np.zeros((train_bpps.shape[0], train_bpps.shape[1], 130))
train_bpps_pad[:, :, :train_bpps.shape[2]] = train_bpps
public_test_bpps_pad = np.zeros((public_test_bpps.shape[0], public_test_bpps.shape[1], 130))
public_test_bpps_pad[:, :, :public_test_bpps.shape[2]] = public_test_bpps

**INITIALIZE FEATURE PREPROCESSING FUNCTIONS**

In [None]:
def pair_indices(structure, sequence):
    """Return the base pair formed at every closing bracket of a dot-bracket structure.

    Args:
        structure: dot-bracket string; '(' opens a pair, ')' closes the most
            recently opened one, every other character is ignored.
        sequence: string indexed in parallel with ``structure``.

    Returns:
        A list with one tuple per ')' in order of appearance. Each tuple holds
        the two paired characters of ``sequence`` in sorted order.

    Raises:
        IndexError: if a ')' has no matching '('.
    """
    open_bases = []  # bases at the currently unmatched '(' positions
    pairs = []
    for i, symbol in enumerate(structure):
        if symbol == '(':
            open_bases.append(sequence[i])
        elif symbol == ')':
            pairs.append(tuple(sorted((sequence[i], open_bases.pop()))))
    return pairs

In [None]:
def pair_sequence(structure, sequence):
    """Label every position of ``sequence`` with its base pair, or 'XX' if unpaired.

    Args:
        structure: dot-bracket string; '(' opens a pair and ')' closes the most
            recently opened one.
        sequence: string indexed in parallel with ``structure``.

    Returns:
        A list with one string per position of ``sequence``: the two paired
        characters joined in sorted order (e.g. 'CG'), or 'XX' for positions
        that take part in no pair.

    Raises:
        IndexError: if a ')' has no matching '('.
    """
    open_positions = []
    partner_of = {}  # position -> position of its pairing partner (both directions)
    for pos, symbol in enumerate(structure):
        if symbol == '(':
            open_positions.append(pos)
        elif symbol == ')':
            opener = open_positions.pop()
            partner_of[pos] = opener
            partner_of[opener] = pos

    # A dictionary lookup replaces the scan over all pairs for every position.
    return [
        ''.join(sorted(sequence[pos] + sequence[partner_of[pos]])) if pos in partner_of else 'XX'
        for pos in range(len(sequence))
    ]

**GET THE AMOUNT AND TYPES OF PAIRS PER SEQUENCE**

In [None]:
def _add_pair_counts(frame):
    """Return ``frame`` with one extra column per base-pair type holding its count."""
    pairs = frame.apply(lambda row: pair_indices(row.structure, row.sequence), axis=1)
    pair_counts = pairs.apply(lambda found: Counter(found))
    return pd.concat([frame, pair_counts.apply(pd.Series)], axis=1)


# All three frames get the same treatment. Previously the private test frame
# kept a raw Counter column while the other two were expanded into columns.
public_test_data = _add_pair_counts(public_test_data)
private_test_data = _add_pair_counts(private_test_data)
train_data = _add_pair_counts(train_data)

**GET THE COUNTS OF EACH BASE PER SEQUENCE**

In [None]:
def _add_base_counts(frame):
    """Return ``frame`` with one extra column per character of 'sequence' holding its count."""
    base_counts = frame['sequence'].apply(lambda seq: Counter(seq))
    return pd.concat([frame, base_counts.apply(pd.Series)], axis=1)


public_test_data = _add_base_counts(public_test_data)
private_test_data = _add_base_counts(private_test_data)
train_data = _add_base_counts(train_data)

**GET THE PAIR SEQUENCE**

In [None]:
# Store, for each sample, the per-position pair labels produced by pair_sequence.
for frame in (train_data, public_test_data, private_test_data):
    frame['pair_sequence'] = pd.Series(
        [pair_sequence(structure, sequence)
         for sequence, structure in zip(frame['sequence'], frame['structure'])],
        index=frame.index,
    )

**INITIALIZE THE ONE-HOT ENCODER**

In [None]:
# One shared OneHotEncoder instance. The cells below re-fit it before each
# feature type, so its vocabulary always belongs to the most recent fit.
encoder = OneHotEncoder()

**TRAIN THE ONE-HOT ENCODER ON AN EXAMPLE SEQUENCE**

In [None]:
# Fit on every character that occurs in the training sequences. This replaces
# the label lookup `[0]`, which raises KeyError if row 0 was filtered out and
# makes the vocabulary depend on one hand-picked sample.
sample = list(''.join(train_data['sequence']))
encoder.fit(np.array(sample).reshape(-1,1))

**APPLY THE NOW TRAINED ENCODER TO ALL THE SEQUENCES**

In [None]:
def _one_hot_sequence(text):
    """One-hot encode each character of ``text`` with the currently fitted encoder."""
    return encoder.transform(np.array(list(text)).reshape(-1,1)).todense()


for frame in (public_test_data, private_test_data, train_data):
    frame['sequence_encoding'] = frame['sequence'].apply(_one_hot_sequence)

**TRAIN THE ONE-HOT ENCODER ON AN EXAMPLE PREDICTED LOOP TYPE**

In [None]:
# Fit on every loop-type character in the training set, not on three
# hand-picked rows looked up by label (which may be missing after filtering).
encoder_data = list(''.join(train_data['predicted_loop_type']))
encoder_data = np.array(encoder_data).reshape(-1,1)
encoder.fit(encoder_data)

**APPLY THE NOW TRAINED ENCODER TO ALL THE PREDICTED LOOP TYPES**

In [None]:
def _one_hot_loop_type(text):
    """One-hot encode each character of ``text`` with the currently fitted encoder."""
    return encoder.transform(np.array(list(text)).reshape(-1,1)).todense()


for frame in (train_data, public_test_data, private_test_data):
    frame['predicted_loop_type_encoding'] = frame['predicted_loop_type'].apply(_one_hot_loop_type)

**TRAIN THE ONE-HOT ENCODER ON AN EXAMPLE PAIR SEQUENCE**

In [None]:
# Fit on the pair labels of all training rows instead of row label 0 only, so
# the vocabulary does not depend on which single sample happens to be present.
sample = [label for labels in train_data['pair_sequence'] for label in labels]
sample = np.array(sample).reshape(-1,1)
encoder.fit(sample)

**APPLY THE NOW TRAINED ENCODER TO ALL PAIR SEQUENCES**

In [None]:
def _one_hot_pair_labels(labels):
    """One-hot encode a list of pair labels with the currently fitted encoder."""
    return encoder.transform(np.array(labels).reshape(-1,1)).todense()


for frame in (train_data, public_test_data, private_test_data):
    frame['pair_sequence_encoding'] = frame['pair_sequence'].apply(_one_hot_pair_labels)

**GET THE DATA IN A NUMPY ARRAY FORMAT / THE PUBLIC TEST DATA DOES NOT HAVE LABELS (?)**

In [None]:
_ENCODING_COLUMNS = ['sequence_encoding', 'predicted_loop_type_encoding', 'pair_sequence_encoding']
_TARGET_COLUMNS = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C', 'deg_pH10', 'deg_50C']


def _stack_encodings(frame):
    """Concatenate the three per-sample encodings feature-wise and stack all samples.

    ``to_numpy()`` is called once per frame; the previous version re-converted
    the whole frame for every single row.
    """
    per_sample = frame[_ENCODING_COLUMNS].to_numpy()
    return np.array([np.concatenate(parts, axis=1) for parts in per_sample])


training_array = _stack_encodings(train_data)
public_test_array = _stack_encodings(public_test_data)
private_test_array = _stack_encodings(private_test_data)

# Targets: for every training row, the five label lists stacked as rows.
training_array_targets = np.array([
    np.stack([np.array(labels) for labels in row])
    for row in train_data[_TARGET_COLUMNS].to_numpy()
])

**CREATE THE OBJECTS OF OUR MODELS**

In [None]:
class Encoder(nn.Module):
    """Transformer encoder over one-hot token features plus a per-position bpps vector.

    The first ``input_shape`` features of each position are read as one-hot
    flags (``get_indices`` extracts three active indices per position) and
    embedded; the remaining features go through ``bpps_encoding_layer``, which
    expects 130 values per position.

    Args:
        pre_train: when True, ``forward`` randomly zeroes embedded positions
            (``Dropout2d`` with p=0.1) and also returns the indices of the
            zeroed positions.
        **kwargs: required keys ``input_shape``, ``output_shape``, ``d_model``,
            ``nhead``, ``num_layers``, ``dim_feedforward``, ``dropout_proba``.
    """

    def __init__(self, pre_train: bool, **kwargs):
        super(Encoder, self).__init__()
        self.dev = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.pre_train = pre_train
        self.input_shape = kwargs["input_shape"]
        self.n_output = kwargs["output_shape"]
        self.d_model = kwargs["d_model"]
        self.nhead = kwargs["nhead"]
        self.num_layers = kwargs["num_layers"]
        self.dim_feedforward = kwargs["dim_feedforward"]
        self.dropout_proba = kwargs["dropout_proba"]
        # Layers are created in a fixed order so that seeded initialisation is reproducible.
        self.layer_norm = nn.LayerNorm(self.d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=self.d_model, nhead=self.nhead ,dim_feedforward=self.dim_feedforward, activation='gelu', dropout=self.dropout_proba)
        self.fully_connected = nn.Linear(in_features=self.d_model, out_features=self.n_output)
        self.encoder = nn.TransformerEncoder(encoder_layer=self.encoder_layer,
                                             num_layers=self.num_layers, norm=self.layer_norm)
        self.weighted_sum_1 = nn.Linear(in_features=3 * self.d_model + 32, out_features=self.d_model) # + 32 for the bpps encoding
        self.weighted_sum_2 = nn.Linear(in_features=self.d_model, out_features=self.d_model)
        self.bpps_encoding_layer = nn.Linear(in_features=130, out_features=32)
        self.Embedding_layer = nn.Embedding(self.input_shape, self.d_model)
        self.vocabulary_indices = np.array(range(self.input_shape)).reshape(1, -1)
        self.dropout_1 = nn.Dropout(p=self.dropout_proba)
        self.dropout_2 = nn.Dropout(p=self.dropout_proba)
        self.ReLU = nn.ReLU()
        self.Tanh = nn.Tanh()
        self.positional_encoding=PositionalEncoder(d_model=self.d_model)
        if pre_train is True:
            self.mlm_dropout = torch.nn.Dropout2d(p=.1)

    def forward(self, x): # samples, sequence, dim
        """Encode a batch.

        Args:
            x: tensor laid out as (samples, sequence, features).

        Returns:
            The output tensor laid out as (sequence, samples, output_shape).
            When ``pre_train`` is True, a tuple of that tensor and a (K, 1)
            tensor of masked row indices; the indices refer to the output
            flattened to (sequence * samples, output_shape).
        """
        x_indices = self.get_indices(x[:, :, :self.input_shape].cpu())
        x_bpps = self.bpps_encoding_layer(x[:, :, self.input_shape:].to(self.dev))
        x_bpps = self.ReLU(x_bpps)
        x_out = self.Embedding_layer(x_indices.long().to(self.dev)).to(self.dev)
        x_out = x_out.reshape(x.shape[0], x.shape[1], 3 * self.d_model)
        if self.pre_train:
            # The masking dropout stays active even when the model is in eval mode.
            self.mlm_dropout.train()
            # Masking happens outside no_grad so the embedding still receives
            # gradients. squeeze(2) removes only the helper axis; a bare
            # squeeze() also dropped the batch axis for a batch of one.
            x_out = self.mlm_dropout(x_out.unsqueeze(2)).squeeze(2)
            with torch.no_grad():
                # Transpose first so the row indices use the same
                # (sequence, samples) order as the returned output.
                masked_rows = x_out.detach().transpose(0, 1).reshape(-1, 3 * self.d_model).sum(1) == 0
                x_out_mlm_indices = masked_rows.nonzero()
        x_out = self.ReLU(x_out)
        x_out = torch.cat((x_out, x_bpps), 2)
        x_out = self.weighted_sum_1(x_out)
        x_out = self.ReLU(x_out)
        x_out = self.dropout_1(x_out)
        x_out = self.positional_encoding(x_out)
        x_out = self.weighted_sum_2(x_out)
        x_out = self.ReLU(x_out)
        x_out = self.encoder(x_out.transpose(0, 1)) # sequence, samples, dim
        x_out = self.fully_connected(x_out)
        if self.pre_train is True:
            return (x_out, x_out_mlm_indices)
        return x_out

    def get_indices(self, x):
        """Convert one-hot flags to the indices of their active entries.

        Args:
            x: CPU tensor (samples, sequence, input_shape) with exactly three
                non-zero entries per position; the final reshape relies on that.

        Returns:
            A tensor of shape (samples, sequence, 3) with the active indices.
        """
        shape = x.shape
        x = x.numpy().astype('float')
        # Zeros become NaN so that index 0 (1 * 0) is still distinguishable
        # from an inactive flag after the multiplication below.
        x[x == 0] = np.nan
        x_values = np.multiply(x, self.vocabulary_indices)
        return torch.tensor(x_values[~np.isnan(x_values)].reshape(shape[0], shape[1], 3))

    def gaussian_noise(self, x, std: float):
        """Add zero-mean Gaussian noise to ``x`` in place and return it.

        NOTE(review): the noise is scaled by ``std ** 0.5``, i.e. the argument
        is effectively treated as a variance — confirm the intended meaning.
        """
        shape = x.shape
        x += (std**0.5)*torch.randn(shape).to(self.dev)
        return x
    
class PositionalEncoder(nn.Module):
    """Scales its input by sqrt(d_model) and adds a fixed sinusoidal position table.

    Args:
        d_model: feature width of the inputs (an even number; each step of the
            table fills a sine column and the following cosine column).
        max_seq_len: number of positions the precomputed table covers.
    """

    def __init__(self, d_model, max_seq_len=130):
        super().__init__()
        self.d_model = d_model

        # Constant table: sine in even columns, cosine in odd columns.
        table = torch.zeros(max_seq_len, d_model)
        for position in range(max_seq_len):
            for sin_col in range(0, d_model, 2):
                cos_col = sin_col + 1
                table[position, sin_col] = math.sin(
                    position / (10000 ** ((2 * sin_col) / d_model)))
                table[position, cos_col] = math.cos(
                    position / (10000 ** ((2 * cos_col) / d_model)))

        # Registered as a buffer: moves with the module but is not a parameter.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        """Return ``x * sqrt(d_model)`` plus the table rows for ``x.size(1)`` positions."""
        scaled = x * math.sqrt(self.d_model)
        return scaled + self.pe[:, :x.size(1)]

**CREATE THE OBJECTS OF THE LOSS FUNCTIONS**

In [None]:
class Pre_train_loss(nn.Module):
    """Weighted cross-entropy over three one-hot groups at the masked positions.

    Columns 0-3, 4-10 and 11-14 of the 15-wide rows are scored as three
    separate classification problems and combined with weights 0.1, 0.65 and 0.25.
    """

    def __init__(self):
        # Was spelled `_init_`, which Python never calls as a constructor.
        super().__init__()

    def forward(self, yhat, y):
        """Compute the loss.

        Args:
            yhat: tuple ``(logits, dropout_indices)``. ``dropout_indices`` holds
                row indices into ``logits.reshape(-1, 15)`` (assumed to arrive
                in that layout from the model — TODO confirm against the caller).
            y: tensor whose first 15 features are the one-hot targets, laid out
                like ``logits`` in its first two axes.

        Returns:
            Scalar tensor with the weighted sum of the three cross-entropies.
        """
        loss = nn.CrossEntropyLoss()
        yhat, dropout_indices = yhat
        # Flatten the (K, 1) index tensor so the gather below always yields
        # (K, 15). The previous gather-then-squeeze() collapsed the row axis
        # whenever exactly one row was masked.
        rows = dropout_indices.reshape(-1)
        yhat = yhat.reshape(-1, 15)[rows]
        y = y[:, :, :15].reshape(-1, 15)[rows]
        loss_base = loss(yhat[:, :4], torch.argmax(y[:, :4], 1))
        loss_loop = loss(yhat[:, 4:11], torch.argmax(y[:, 4:11], 1))
        loss_pair = loss(yhat[:, 11:15], torch.argmax(y[:, 11:15], 1))
        loss_full = 0.1*loss_base + 0.65*loss_loop + 0.25*loss_pair
        return loss_full


class Our_Loss(nn.Module):
    """Root-mean-square error over the first ``num_targets`` features.

    The squared error is averaged over axis 0, ``eps`` is added before the
    square root, and the result is averaged over the remaining axes.
    """

    def __init__(self, num_targets: int):
        super().__init__()
        self.eps = 1e-6
        self.num_targets = num_targets

    def forward(self, yhat, y):
        """Return the scalar loss.

        ``yhat`` may be a ``(prediction, indices)`` tuple, in which case only
        the prediction is used. The prediction is cut to ``y.shape[0]`` along
        axis 0 before comparison.
        """
        if isinstance(yhat, tuple):
            # Pre-training output scored with this regression loss.
            yhat, _ = yhat
        n_targets = self.num_targets
        difference = y[:, :, :n_targets] - yhat[:y.shape[0], :, :n_targets]
        rmse = torch.sqrt(torch.square(difference).mean(dim=0) + self.eps)
        return rmse.mean(dim=1).mean()

**CREATE THE OBJECT THAT WILL PERFORM THE TRAINING PROCESS**

In [None]:
class Optimizer():
    """Training harness: owns a model and runs its fit / evaluate / predict loops.

    The model is moved to "cuda:0" when CUDA is available, otherwise to "cpu".
    """

    def __init__(self, model: nn.Module):
        self.dev = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model = model.to(self.dev)

    def get_optimizer(self, lr: float):
        """Return an Adam optimizer over the model parameters (weight decay 1e-5)."""
        return torch.optim.Adam(self.model.parameters(), lr=lr, weight_decay=1e-5)

    def get_criterion(self, num_targets: int):
        """Return ``Our_Loss`` for a positive ``num_targets``, else ``Pre_train_loss``."""
        if num_targets > 0:
            return Our_Loss(num_targets=num_targets)
        else: # Pretrain!
            return Pre_train_loss()

    def optimize(self, X: torch.Tensor, Y: torch.Tensor, optimizer: torch.optim.Adam, num_targets: int):
        """Run one gradient step on a batch and return the loss as a float.

        ``Y`` is transposed on its first two axes before being passed to the
        criterion; gradients are clipped to a total norm of 1.
        """
        optimizer.zero_grad()
        Y_out = self.model(X)
        criterion = self.get_criterion(num_targets=num_targets)
        loss = criterion(Y_out, Y.transpose(0, 1).to(self.dev))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.)
        optimizer.step()
        return loss.detach().item()

    def evaluate(self, X: torch.Tensor, Y: torch.Tensor, num_targets: int):
        """Return the loss on a batch without tracking gradients.

        The model is switched to eval mode for the forward pass and back to
        train mode afterwards.
        """
        self.model.eval()
        with torch.no_grad():
            Y_out = self.model(X)
            criterion = self.get_criterion(num_targets=num_targets)
            loss = criterion(Y_out, Y.transpose(0, 1).to(self.dev))
        self.model.train()
        return loss.detach().item()

    def predict(self, X):
        """Return the model output for ``X`` in eval mode, without gradients.

        The model is left in eval mode.
        """
        self.model.eval()
        with torch.no_grad():
            return self.model(X)

    def get_dataloader(self, X: torch.Tensor, Y: torch.Tensor, batch_size: int) -> DataLoader:
        """Wrap ``X`` and ``Y`` in a randomly sampled DataLoader."""
        dataset = TensorDataset(X, Y)
        sampler = RandomSampler(dataset)
        return DataLoader(dataset=dataset, sampler=sampler, batch_size=batch_size)

    def fit(self, X: torch.Tensor, Y: torch.Tensor, X_val: torch.Tensor, Y_val: torch.Tensor, epochs: int, lr: float, batch_size: int, patience: int, best_model: bool, num_targets: int):
        """Train with early stopping on the validation loss.

        Args:
            X, Y: training inputs and targets.
            X_val, Y_val: validation inputs and targets.
            epochs: maximum number of epochs.
            lr: learning rate for Adam.
            batch_size: batch size for both loaders.
            patience: stop after this many consecutive epochs without a new
                best validation loss; the best snapshot is then restored.
            best_model: if True, restore the best snapshot at the end even
                when training ran through all epochs.
            num_targets: forwarded to ``get_criterion``.
        """
        train_loader = self.get_dataloader(X=X, Y=Y, batch_size=batch_size)
        val_loader = self.get_dataloader(X=X_val, Y=Y_val, batch_size=batch_size)
        optimizer = self.get_optimizer(lr=lr)
        train_loss_history, val_loss_history = [], []
        best_val_loss = np.inf
        # Snapshot up front so a restore is always possible. Previously the
        # snapshot name was unbound if no epoch improved (e.g. NaN validation
        # loss) or if `epochs` was 0.
        best_snapshot = deepcopy(self.model)
        stale_epochs = 0
        # A preceding predict() leaves the model in eval mode.
        self.model.train()
        for epoch in range(epochs):
            train_batch_losses = [
                self.optimize(X=X_batch.to(self.dev), Y=Y_batch.to(self.dev), optimizer=optimizer, num_targets=num_targets)
                for X_batch, Y_batch in train_loader
            ]
            train_loss_history.append(np.average(train_batch_losses))
            val_batch_losses = [
                self.evaluate(X=X_batch.to(self.dev), Y=Y_batch.to(self.dev), num_targets=num_targets)
                for X_batch, Y_batch in val_loader
            ]
            val_loss = np.average(val_batch_losses)
            if best_val_loss > val_loss:
                stale_epochs = 0
                best_val_loss = float(val_loss)
                best_snapshot = deepcopy(self.model)
            else:
                stale_epochs += 1
            val_loss_history.append(val_loss)
            print(f"train loss: {train_loss_history[-1]} \tvalidation loss: {val_loss_history[-1]} for epoch {epoch + 1}")
            if stale_epochs == patience:
                self.model = deepcopy(best_snapshot)
                break
        if best_model:
            self.model = deepcopy(best_snapshot)

**CREATE THE VALIDATION SPLIT FUNCTION**

In [None]:
def get_validation_dataset(X: torch.Tensor, Y: torch.Tensor, split_ratio: float, random_state=None):
    """Shuffle ``X`` and ``Y`` together and split off a validation part.

    Args:
        X: inputs, indexed along the first axis.
        Y: targets with the same length as ``X``.
        split_ratio: fraction of samples (rounded up) that goes to validation.
        random_state: optional seed forwarded to ``sklearn.utils.shuffle`` for
            a reproducible split; None keeps the previous behaviour.

    Returns:
        ``(X_train, X_val, Y_train, Y_val)``; the validation part is the first
        ``ceil(split_ratio * len(X))`` shuffled samples.
    """
    X, Y = shuffle(X, Y, random_state=random_state)
    n_val = ceil(split_ratio * len(X))
    return X[n_val:], X[:n_val], Y[n_val:], Y[:n_val]


In [None]:
# Sanity check: shape of the stacked, unpadded public-test bpps matrices.
public_test_bpps.shape

**ADD THE BPPS VECTOR TO EACH TIMESTEP**

In [None]:
# Append each position's bpps row to its 15 one-hot features.
# Slicing to the first 15 features makes the cell safe to re-run; without it
# every re-run appended another copy of the bpps columns.
training_array = np.concatenate((training_array[:, :, :15], train_bpps_pad), axis=2)
private_test_array = np.concatenate((private_test_array[:, :, :15], private_test_bpps), axis=2)
public_test_array = np.concatenate((public_test_array[:, :, :15], public_test_bpps_pad), axis=2)

In [None]:
# Rebuild the private test features: first 15 columns followed by the bpps rows.
private_test_array = np.concatenate([private_test_array[..., :15], private_test_bpps], axis=-1)

In [None]:
# Rebuild the public test features: first 15 columns followed by the padded bpps rows.
public_test_array = np.concatenate([public_test_array[..., :15], public_test_bpps_pad], axis=-1)

In [None]:
# float32 copy of the public test features.
x_test = torch.tensor(public_test_array, dtype=torch.float32)

In [None]:
# float32 copy of the private test features.
x_test_private = torch.tensor(private_test_array, dtype=torch.float32)

**CHANGE THE ARRAYS TO TENSORS**

In [None]:
# Convert every NumPy array to a float32 tensor copy.
x, y, x_test_private, x_test = (
    torch.tensor(array, dtype=torch.float32)
    for array in (training_array, training_array_targets, private_test_array, public_test_array)
)

**CHANGING THE OUTPUTS OF THE TRAIN SET TO MATCH THE INPUT SHAPE**

In [None]:
# Swap the last two axes of the targets. `y` is rebuilt from the source array
# so that re-running this cell cannot transpose it back.
y = torch.Tensor(training_array_targets).transpose(1,2)

**CREATE THE VALIDATION AND TRAIN SETS**

In [None]:
# Seed NumPy before splitting: the shuffle inside get_validation_dataset draws
# from NumPy's global random state, which was unseeded at this point.
np.random.seed(0)
X_train, X_val, Y_train, Y_val = get_validation_dataset(X=x, Y=y, split_ratio=0.2)

**INITIALIZE THE MODEL AND THE OPTIMIZER OBJECTS**

In [None]:
# torch.manual_seed(0)
# np.random.seed(0)
# torch.cuda.manual_seed(0)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False
# encoder = Encoder(input_shape=15, d_model=256, nhead=4, num_layers=8, output_shape=15, dim_feedforward=512, dropout_proba=0.1, pre_train=True)
# optimizer = Optimizer(model=encoder)                            #  4

**PRE TRAIN THE OPTIMIZER**

In [None]:
# optimizer.fit(
#     X=X_train,
#     Y=X_train,
#     X_val=X_val,
#     Y_val=X_val,
#     epochs=100,
#     lr=3e-4,
#     batch_size=8,
#     patience=75,
#     best_model=True,
#     num_targets=0)

In [None]:
# parameters = {key: value for key, value in dict(optimizer.model.named_parameters()).items() if not key.startswith('fully_connected')}

In [None]:
# encoder_new = Encoder(input_shape=15, d_model=256, nhead=4, num_layers=4, output_shape=5, dim_feedforward=512, dropout_proba=0.1, pre_train=False)
# encoder_new.load_state_dict(parameters, strict=False)
# optimizer_new = Optimizer(model=encoder_new)

**FIT THE OPTIMIZER**

In [None]:
# optimizer_new.fit(X=X_train, Y=Y_train, X_val=X_val, Y_val=Y_val, epochs=150, lr=3e-4, batch_size=8, patience=75, best_model=True, num_targets=5)

In [None]:
# optimizer_new.fit(X=X_train, Y=Y_train, X_val=X_val, Y_val=Y_val, epochs=100, lr=3e-4, batch_size=16, patience=50, best_model=True, num_targets=5)

In [None]:
# optimizer_new.fit(X=X_train, Y=Y_train, X_val=X_val, Y_val=Y_val, epochs=40, lr=3e-4, batch_size=32, patience=30, best_model=True, num_targets=5)

In [None]:
# optimizer_new.fit(X=X_train, Y=Y_train, X_val=X_val, Y_val=Y_val, epochs=15, lr=3e-4, batch_size=128, patience=10, best_model=True, num_targets=5)

In [None]:
# optimizer_new.fit(X=X_train, Y=Y_train, X_val=X_val, Y_val=Y_val, epochs=15, lr=3e-4, batch_size=256, patience=10, best_model=True, num_targets=5)

In [None]:
# Seed every random number generator used below.
torch.manual_seed(0)
np.random.seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
NUM_K_FOLD = 5
NUM_TARGETS = 5
DROPOUT = 0.1
PRE_TRAIN = True
PRE_TRAIN_EPOCHS = 120

kfold = KFold(NUM_K_FOLD, shuffle = True, random_state = 0)

# Fine-tuning schedule: stage k uses the k-th entry of each list.
epochs = [200, 100, 40, 15, 15]
batch_sizes = [8, 16, 32, 128, 256]
patiences = [75, 50, 20, 10, 10]


if PRE_TRAIN:
    print("- Pretraining -")
    # Own name for the pre-training model, so the fitted OneHotEncoder bound
    # to `encoder` is not overwritten.
    pretrain_encoder = Encoder(input_shape=15, d_model=256, nhead=4, num_layers=4, output_shape=15, dim_feedforward=512, dropout_proba=DROPOUT, pre_train=True)
    optimizer = Optimizer(model=pretrain_encoder)
    # Pre-training target is the input itself; the loss compares its first 15 features.
    optimizer.fit(
        X=X_train,
        Y=X_train,
        X_val=X_val,
        Y_val=X_val,
        epochs=PRE_TRAIN_EPOCHS,
        lr=3e-4,
        batch_size=8,
        patience=75,
        best_model=False,
        num_targets=15)
    # Everything except the output head is carried over to the fold models.
    parameters = {key: value for key, value in dict(optimizer.model.named_parameters()).items() if not key.startswith('fully_connected')}

models = []
scores = []
for i, (tr_idx, va_idx) in enumerate(kfold.split(x, y)):
    print(f"------ Fold {i+1} start -----")

    x_tr, x_va = x[tr_idx], x[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    encoder_new = Encoder(input_shape=15, d_model=256, nhead=4, num_layers=2, output_shape=5, dim_feedforward=512, dropout_proba=DROPOUT, pre_train=False)
    if PRE_TRAIN:
        # strict=False: the fold model has fewer layers and a different head.
        encoder_new.load_state_dict(parameters, strict=False)
    optimizer_new = Optimizer(model=encoder_new)

    for j, (stage_epochs, stage_batch_size, stage_patience) in enumerate(zip(epochs, batch_sizes, patiences)):
        print(f"- Finetune stage {j+1}, epochs {stage_epochs}, batch size {stage_batch_size}, patience {stage_patience} -")
        optimizer_new.fit(X=x_tr, Y=y_tr, X_val=x_va, Y_val=y_va, epochs=stage_epochs, lr=3e-4, batch_size=stage_batch_size, patience=stage_patience, best_model=True, num_targets=NUM_TARGETS)

    models.append(optimizer_new)

    # Validation loss over the first 3 and over all 5 targets.
    score3 = optimizer_new.evaluate(x_va, y_va, 3)
    score5 = optimizer_new.evaluate(x_va, y_va, 5)
    scores.append((score3, score5))
    print(f"----- Score 3: {score3}, Score 5: {score5} -----")

In [None]:
# Sanity check: shape of the private test input tensor.
x_test_private.shape

In [None]:
# Sanity check: shape of the public test input tensor.
x_test.shape

In [None]:
# Sanity check: shape of the training input tensor.
x.shape

In [None]:
def _mean_fold_prediction(inputs):
    """Predict ``inputs`` with every fold model and average the results.

    Each prediction has its first two axes swapped and is moved to NumPy
    before averaging over the fold axis.
    """
    per_fold = [fold.predict(inputs).transpose(1, 0).cpu().numpy() for fold in models]
    return np.array(per_fold).mean(0)


preds_pub = _mean_fold_prediction(x_test)
preds_private = _mean_fold_prediction(x_test_private)
preds_pub.shape

In [None]:
# Sanity check: shape of the fold-averaged private test predictions.
preds_private.shape

In [None]:
targets = ["reactivity", "deg_Mg_pH10", "deg_Mg_50C", "deg_pH10", "deg_50C"]


def _sample_rows(uid, sample_preds):
    """Build the submission rows of one sample: five target columns plus 'id_seqpos'."""
    rows = pd.DataFrame(sample_preds, columns=targets)
    rows['id_seqpos'] = [f'{uid}_{x}' for x in range(rows.shape[0])]
    return rows


# Public samples first, then private ones, each in the order of its frame.
preds_ls = [
    _sample_rows(uid, preds[i])
    for df, preds in [(public_test_data, preds_pub), (private_test_data, preds_private)]
    for i, uid in enumerate(df.id)
]
preds_df = pd.concat(preds_ls)
preds_df.to_csv("submission.csv", index = False)