In [None]:
!pip install -q -r requirements.txt

In [None]:
import pandas as pd
import noise_filter
import preprocessing
import base_model
import torch
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import CosineAnnealingLR
import testing

## Run Instructions

### W&B logger

The Weights & Biases API will ask for your API key the first time you run it (get your key at <https://wandb.ai/authorize>).

Please note that your API-Key is stored in your home directory under `~/.netrc` and will automatically be used for all future runs even in other environments/projects. Use with caution on shared instances.

In [None]:
# Log in to Weights & Biases before any model is created with enable_wandb=True.
import wandb
wandb.login()

In [None]:
# Single Preprocessor instance reused for the training runs and the prediction dataset below.
# NOTE(review): 457 is presumably the longest sequence length in the data — confirm.
preprocessor = preprocessing.Preprocessor(max_sequence_length=457)

In [None]:
class LSTM_RNA(torch.nn.Module):
    """Bidirectional multi-layer LSTM followed by a per-position linear head.

    Args:
        hidden_size: Size of the hidden state of each LSTM direction.
        nlayers: Number of stacked LSTM layers.
    """

    def __init__(self, hidden_size=256, nlayers=3):
        super().__init__()
        # Constructor arguments, kept on the instance as a plain dict.
        self.model_config = {"hidden_size": hidden_size, "nlayers": nlayers}
        self.hidden_size = hidden_size
        self.num_layers = nlayers
        self.lstm1 = torch.nn.LSTM(input_size=1,
                                   hidden_size=self.hidden_size,
                                   num_layers=self.num_layers,
                                   bidirectional=True,
                                   batch_first=True)
        # Both directions are concatenated in the LSTM output, hence hidden_size * 2.
        self.fc1 = torch.nn.Linear(self.hidden_size * 2, out_features=2)

    def _initial_state(self, x):
        """Return one initial state tensor for a batch shaped like ``x``.

        Random during training (as before), zeros in eval mode so that
        repeated predictions on the same input are identical.
        """
        # Factor 2 accounts for the two directions of the bidirectional LSTM.
        shape = (2 * self.num_layers, x.size(0), self.hidden_size)
        make = torch.randn if self.training else torch.zeros
        # Allocate directly on the input's device instead of CPU + copy.
        return make(shape, device=x.device, dtype=x.dtype)

    def forward(self, x):
        """Run the LSTM over ``x`` and project every time step to 2 outputs.

        Args:
            x: Batch-first input whose last dimension has size 1
               (``input_size=1`` and ``batch_first=True`` above).

        Returns:
            Tensor with the same leading dimensions as the LSTM output and a
            last dimension of 2.
        """
        # Same cast RNN_RNA applies, so both models accept non-float input.
        x = x.float()
        h_0 = self._initial_state(x)
        c_0 = self._initial_state(x)

        output, _ = self.lstm1(x, (h_0, c_0))

        return self.fc1(torch.relu(output))

def create_lstm(suffix, epochs):
    """Build a default LSTM_RNA wrapped in a BaseModel.

    Args:
        suffix: Inserted into the checkpoint file name ``LSTM-{suffix}.pth``.
        epochs: Passed as ``T_max`` to the cosine-annealing LR scheduler.

    Returns:
        The ``base_model.BaseModel`` wrapper around the network.
    """
    network = LSTM_RNA()
    adam = torch.optim.Adam(network.parameters(), lr=0.001)
    return base_model.BaseModel(
        adam,
        network,
        f'LSTM-{suffix}.pth',
        scheduler=CosineAnnealingLR(adam, T_max=epochs),
        enable_wandb=True,
    )

In [None]:
class RNN_RNA(torch.nn.Module):
    """Bidirectional 3-layer ReLU RNN followed by a per-position linear head.

    Args:
        hidden_size: Size of the hidden state of each RNN direction.
    """

    def __init__(self, hidden_size=64):
        super().__init__()
        # Constructor arguments, kept on the instance as a plain dict.
        self.model_config = {"hidden_size": hidden_size}
        self.hidden_size = hidden_size
        self.num_layers = 3
        self.rnn1 = torch.nn.RNN(input_size=1,
                                 hidden_size=self.hidden_size,
                                 num_layers=self.num_layers,
                                 bidirectional=True,
                                 batch_first=True,
                                 nonlinearity='relu')
        # Both directions are concatenated in the RNN output, hence hidden_size * 2.
        self.fc1 = torch.nn.Linear(self.hidden_size * 2, out_features=2)

    def forward(self, x):
        """Run the RNN over ``x`` and project every time step to 2 outputs.

        The initial hidden state is random during training (as before) and
        zero in eval mode, so repeated predictions on the same input match.

        Args:
            x: Batch-first input whose last dimension has size 1
               (``input_size=1`` and ``batch_first=True`` above).

        Returns:
            Tensor with the same leading dimensions as the RNN output and a
            last dimension of 2.
        """
        x = x.float()
        # Factor 2 accounts for the two directions of the bidirectional RNN.
        state_shape = (2 * self.num_layers, x.size(0), self.hidden_size)
        make = torch.randn if self.training else torch.zeros
        # Allocate directly on the input's device instead of CPU + copy.
        h_0 = make(state_shape, device=x.device, dtype=x.dtype)

        output, _ = self.rnn1(x, h_0)

        return self.fc1(torch.relu(output))

def create_rnn(suffix, epochs):
    """Build a default RNN_RNA wrapped in a BaseModel.

    Args:
        suffix: Inserted into the checkpoint file name ``RNN-{suffix}.pth``.
        epochs: Passed as ``T_max`` to the cosine-annealing LR scheduler.

    Returns:
        The ``base_model.BaseModel`` wrapper around the network.
    """
    network = RNN_RNA()
    adam = torch.optim.Adam(network.parameters(), lr=0.001)
    return base_model.BaseModel(
        adam,
        network,
        f'RNN-{suffix}.pth',
        scheduler=CosineAnnealingLR(adam, T_max=epochs),
        enable_wandb=True,
    )

In [None]:
# See: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
import math

class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor, then apply dropout.

    Args:
        d_model: Embedding dimension (even or odd).
        dropout: Dropout probability applied after adding the encodings.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout = 0.1, max_len = 5000):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine columns;
        # trimming keeps the assignment valid (no-op for even d_model).
        pe[:, 0, 1::2] = torch.cos(angles)[:, :d_model // 2]
        # Registered as a buffer: moves with the module but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class Transformer_RNA(torch.nn.Module):
    """Token embedding + positional encoding + Transformer encoder + linear head.

    Args:
        ntoken: Number of rows in the embedding table.
        d_model: Embedding / model dimension.
        nhead: Number of attention heads per encoder layer.
        d_hid: Feed-forward dimension inside each encoder layer.
        nlayers: Number of stacked encoder layers.
        out_dim: Number of outputs produced per position.
        dropout: Dropout probability for positional encoding and encoder layers.
    """

    def __init__(self, ntoken, d_model, nhead, d_hid, nlayers, out_dim, dropout = 0.5):
        super().__init__()
        self.model_config = dict(
            ntoken=ntoken, d_model=d_model, nhead=nhead, d_hid=d_hid,
            nlayers=nlayers, out_dim=out_dim, dropout=dropout,
        )
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        layer = torch.nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout, norm_first=True)
        self.transformer_encoder = torch.nn.TransformerEncoder(layer, nlayers)
        self.embedding = torch.nn.Embedding(ntoken, d_model)
        self.d_model = d_model

        self.linear = torch.nn.Linear(d_model, out_dim)

        self.init_weights()

    def init_weights(self):
        """Re-initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        for weight in (self.embedding.weight, self.linear.weight):
            weight.data.uniform_(-bound, bound)
        self.linear.bias.data.zero_()

    def forward(self, src):
        """Encode a batch of token sequences and return per-position outputs.

        Args:
            src: Token tensor; a trailing singleton dimension is squeezed away.
                Assumed to be ``[batch, seq_len, 1]`` — TODO confirm with the
                preprocessor. Entries equal to 0 are masked as padding.

        Returns:
            Tensor with batch first and ``out_dim`` as the last dimension.
        """
        tokens = src.squeeze(dim=-1)
        # Mask is built before the transpose, so it stays batch-first.
        padding_mask = tokens.eq(0)

        # The encoder is used sequence-first, so swap the two axes.
        embedded = self.embedding(tokens.permute(1, 0).to(torch.int32)) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(
            self.pos_encoder(embedded),
            src_key_padding_mask=padding_mask,
        )

        # Project to out_dim and swap back to batch-first.
        return self.linear(encoded).permute(1, 0, 2)

def create_transformer(suffix, epochs, out_dim=2):
    """Build a Transformer_RNA wrapped in a BaseModel.

    Args:
        suffix: Inserted into the checkpoint file name ``TRANSFORMER-{suffix}.pth``.
        epochs: Passed as ``T_max`` to the cosine-annealing LR scheduler.
        out_dim: Outputs per position: 2 for single-model mode (default,
            previous behaviour), 1 for dual-model mode.

    Returns:
        The ``base_model.BaseModel`` wrapper around the network.
    """
    ntokens = 500  # size of vocabulary (Note: This has to be > ~500 if Structure is encoded)
    emsize = 200  # embedding dimension
    d_hid = 800  # dimension of the feedforward network model in ``nn.TransformerEncoder``
    nlayers = 6  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
    nhead = 5  # number of heads in ``nn.MultiheadAttention``
    dropout = 0.2  # dropout probability

    transformer = Transformer_RNA(ntokens, emsize, nhead, d_hid, nlayers, out_dim, dropout)
    optimizer = torch.optim.AdamW(transformer.parameters(), lr=0.001, weight_decay=0.05)
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    transformer = base_model.BaseModel(
        optimizer, transformer, f'TRANSFORMER-{suffix}.pth', scheduler=scheduler, enable_wandb=True)
    return transformer

# Training

In [None]:
# Number of training epochs; also passed to the create_* helpers (scheduler T_max)
# and embedded in the names of the prediction CSV files written below.
NUM_EPOCHS = 5

In [None]:
# Load the training table; the path is relative to the notebook's working directory.
train = pd.read_csv('../data/train_newfeat2.csv')

In [None]:
# Remove the stray 'Unnamed: 0' column from the CSV.
# Rebinding with errors='ignore' keeps this cell safe to re-run (the in-place drop raised KeyError the second time).
train = train.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Run the selected experiment groups with the LSTM factory; only the two
# weighted-loss groups are switched on here.
tester = testing.Testing()
tester.run_tests(
    preprocessor=preprocessor,
    create_model=create_lstm,
    num_epochs=NUM_EPOCHS,
    train=train,
    split_and_fold_tests=False,
    filter_noise_tests=False,
    structure_clip_and_weighted_loss_tests=False,
    different_weighted_loss_tests=True,
    additive_weighted_loss_tests=True,
    additional_tags=['LSTM', '256'],
)

In [None]:
# Build the training/validation loaders consumed by the manual training loop below.
train_sets, preprocessing_config = preprocessor.prepare_xy_split(
    train,
    categorical=True,
    shuffle=True,
    validation_split=None,
    batch_size=128,
    filter_noise=True,
    dual_model=False,
    k_fold=5,
    structure=True,
    clip=True,
    weighted_loss=None,
    additive_weight=False,
)

In [None]:
# Train one fresh RNN per entry of train_sets and collect the loss histories.
losses = []
# Set to True to skip the first entry of train_sets (only the first one is skipped).
skip_first = False
for experiment_type, train_data_loader, validation_data_loader in train_sets:
    if skip_first:
        skip_first = False
        continue
        
    print(f'Model fit {experiment_type}')
    model = create_rnn(experiment_type, NUM_EPOCHS)
    # training_losses / validation_losses stay defined after the loop and hold
    # the values of the last iteration; the plotting cell below reads them.
    training_losses, validation_losses = model.fit(
        train_data_loader,
        validation_data_loader,
        experiment_type=experiment_type,
        epochs=NUM_EPOCHS,
        verbose=True,
        preprocessing_config=preprocessing_config)
    losses.append([training_losses, validation_losses])
    # Drop the name so the next iteration starts from a new model; note that
    # `model` is therefore undefined once the loop has finished.
    del model

In [None]:
# Free GPU memory held by a leftover model, if there is one.
# The training loop already deletes `model` after each fit, so on a clean
# top-to-bottom run the name does not exist and this cell must not fail.
if 'model' in globals():
    model.clear_gpu()
    del model

In [None]:
# Loss curves of the most recently fitted model (values left over from the training loop).
fig, ax = plt.subplots()
# Use the actual history length for the x-axis so the plot still works if it differs from NUM_EPOCHS.
ax.plot(range(len(training_losses)), training_losses, label='Train Loss', marker='o', color='orange')
# Plotting the validation loss
ax.plot(range(len(validation_losses)), validation_losses, label='Validation Loss', marker='o', color='midnightblue')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
# Without a legend the labels passed above are never shown.
ax.legend();

# Testing

TBD: Add structure test set and set structure=True for preprocessing

In [None]:
# Load the raw test table; the path is relative to the notebook's working directory.
test_data = pd.read_csv('../data/test_newfeat2.csv')

In [None]:
# Convert the raw test table into the batched dataset passed to model.predict below.
# NOTE(review): this rebinds `test_data`, so re-running this cell feeds the already
# prepared dataset back in; re-run the read_csv cell above first.
test_data = preprocessor.prepare_prediction_dataset(test_data, batch_size=512, categorical=True, structure=True)

In [None]:
#small_test = [a[1] for a in list(enumerate(test_data))[:2]]

In [None]:
# Single-model mode: one network predicts both reactivity columns at once.
import gc

final_outputs = pd.DataFrame()
final_outputs.index.name = 'id'
experiment_types = ['DMS_AND_2A3_MaP']

for experiment_type in experiment_types:
    print(f'Model prediction {experiment_type}')
    model = create_transformer(experiment_type, NUM_EPOCHS)
    model.load_model()

    final_predictions = model.predict(test_data, single_model_mode=True)
    print(final_predictions.shape)

    # Prediction column 0 goes to DMS_MaP, column 1 to 2A3_MaP.
    for column_index, column_name in enumerate(('reactivity_DMS_MaP', 'reactivity_2A3_MaP')):
        final_outputs[column_name] = final_predictions[:, column_index].cpu().numpy()

    # Release the predictions and the model before the next iteration.
    del final_predictions, model
    gc.collect()

final_outputs.to_csv(f'Transformer_AdamW_SingleModel_{NUM_EPOCHS}Epochs.csv')

In [None]:
# For dual model mode
# One model per experiment type; each fills its own reactivity column.
# NOTE(review): create_transformer hard-codes out_dim = 2, while its own comment
# says dual mode needs 1 — confirm the loaded checkpoints match before using this cell.
import gc
final_outputs = pd.DataFrame()
final_outputs.index.name = 'id'
experiment_types = ['DMS_MaP', '2A3_MaP']

for experiment_type in experiment_types:
    print(f'Model prediction {experiment_type}')
    model = create_transformer(experiment_type, NUM_EPOCHS)
    model.load_model()
    
    final_predictions = model.predict(test_data)
    
    final_outputs[f'reactivity_{experiment_type}'] = final_predictions.cpu().numpy()
    # Release the predictions and the model before the next iteration.
    del final_predictions
    del model
    
    gc.collect()
    
    
final_outputs.to_csv(f'Transformer_AdamW_DualModel_{NUM_EPOCHS}Epochs.csv')

In [None]:
# Sanity check on the number of prediction rows.
# NOTE(review): 269796671 is presumably the row count the submission must have — confirm.
final_outputs.shape[0] == 269796671

In [None]:
# Display the prediction table for a visual check.
final_outputs

In [None]:
# Per-column minimum and maximum of the predictions.
final_outputs.min(), final_outputs.max()

In [None]:
# read first 50 lines of the submission file to check results
# NOTE(review): no cell in this notebook writes a file with this 'RNN_CosineScheduler_' name
# (the prediction cells write 'Transformer_AdamW_*'); confirm the file exists or update the name.
pd.read_csv(f'RNN_CosineScheduler_{NUM_EPOCHS}Epochs.csv', nrows=50)

# Other

In [None]:
# Re-import the local helper modules after editing their source files.
# reload() only accepts module objects; the previous argument `test_data` is a
# data object and made this cell raise TypeError.
from importlib import reload

for local_module in (noise_filter, preprocessing, base_model, testing):
    reload(local_module)

## Clip Submission to [0,1] (recommended)

This ensures that all predictions are in the range [0,1].

In [None]:
# Read the submission (first CSV column is the index), clamp every value to [0, 1]
# and write the result next to it with a '_clipped' suffix.
subm = pd.read_csv(
    'Transformer_AdamW_DualModel_v2_10Epochs.csv',
    index_col=0,
).clip(lower=0.0, upper=1.0)
subm.to_csv('Transformer_AdamW_DualModel_v2_10Epochs_clipped.csv', index=True)

In [None]:
# Compare Submissions: original vs. clipped, first 50000 rows of each.

sm1 = pd.read_csv('Transformer_AdamW_DualModel_v2_10Epochs.csv', nrows=50000)
sm2 = pd.read_csv('Transformer_AdamW_DualModel_v2_10Epochs_clipped.csv', nrows=50000)
# Only the last expression of a cell is rendered, so show both summaries explicitly
# (`display` is provided by the IPython kernel).
display(sm1.describe(), sm2.describe())

# Pair up rows with the same id; shared column names get the _sm1 / _sm2 suffixes.
merged_df = pd.merge(sm1, sm2, on='id', how='inner', suffixes=["_sm1","_sm2"])

# Cap the sample size so the cell also works when fewer than 50 rows match.
merged_df.sample(min(50, len(merged_df)))