In [43]:
import math
import os
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import TensorDataset, DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import json
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder

# Hyperparameters

In [44]:
# Transformer hyperparameters
# The length of our token sequences
ntokens_HP = 30
# Number of attention heads
nheads_HP = 8
# Number of embedding features handled by each attention head
token_encode_size_HP = 4
# Total embedding width. PyTorch's multi-head attention splits this
# dimension evenly across the heads (unlike the textbook formulation
# where every head sees the full embedding), so it has to be a
# multiple of nheads_HP.
embed_size_HP = token_encode_size_HP * nheads_HP
# Output size of the heads, which
# learn an embedding.
head_embedsize_HP = 4
# Width of the encoder output as seen by downstream layers;
# keep it equal to embed_size_HP.
d_model_HP = head_embedsize_HP * nheads_HP
# No dropout for now
dropout_HP = 0
# Standard stuff
activation_HP = "relu"
layer_norm_eps_HP = 1e-5
batch_first_HP = True
norm_first_HP = False

# Training and validation hyperparameters
datasetsize_HP = 1000
# Fraction of the dataset that goes into the training split
split_HP = 0.2
batchsize_HP = 100
# Cap the batch size at the size of the training split (derived from
# split_HP rather than a repeated literal), but never go below 1.
batchsize_HP = max(1, min(batchsize_HP, int(datasetsize_HP * split_HP)))
nepochs_HP = 5
# Re-declared here so this cell is self-contained.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_HP = nn.MSELoss()
learnrate_HP = 1e-4

In [45]:
# Create transformer model
class RegressionTranformer(nn.Module):
    """Transformer-encoder regressor: token-index sequence -> one scalar.

    Pipeline: embedding -> one ``nn.TransformerEncoderLayer`` -> mean over
    the sequence positions -> linear head. All sizes are read from the
    module-level ``*_HP`` hyperparameters when the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        self.encoder_transformer = nn.TransformerEncoderLayer(
            d_model=embed_size_HP,
            nhead=nheads_HP,
            dropout=dropout_HP,
            activation=activation_HP,
            layer_norm_eps=layer_norm_eps_HP,
            batch_first=batch_first_HP,
            norm_first=norm_first_HP,
        )
        # Takes indexed sequences of tokens and embeds them.
        # num_embeddings only has to exceed the largest token index that
        # is fed in, so rows beyond the actual vocabulary are simply unused.
        self.embedder = nn.Embedding(ntokens_HP, embed_size_HP)
        # Regression head. The encoder emits features of width
        # embed_size_HP (its d_model), so the head is sized from the same
        # constant instead of a second one that merely happens to match.
        self.linear = nn.Linear(embed_size_HP, 1)

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: integer tensor of token indices, laid out as (batch, seq).
               This layout relies on batch_first_HP being True.

        Returns:
            Float tensor of shape (batch,), so it lines up with a 1-D
            target tensor instead of being broadcast against it.
        """
        hidden = self.embedder(x)
        hidden = self.encoder_transformer(hidden)
        # Collapse the sequence axis: one feature vector per sample.
        pooled = hidden.mean(dim=1)
        return self.linear(pooled).squeeze(-1)
        

In [46]:


# Shape sanity check: an embedding followed by a single encoder layer.
embed_layer = nn.Embedding(4, 16)
# Random token indices in [0, 4). torch.rand(...) draws from [0, 1), so
# casting it to int would give an all-zero index tensor.
token_ids = torch.randint(0, 4, (10, 30))
embedded = embed_layer(token_ids)
print(embedded.shape)

encoder_layer = nn.TransformerEncoderLayer(d_model=16, nhead=8)
out = encoder_layer(embedded)
print(out.shape)

torch.Size([10, 30, 16])
torch.Size([10, 30, 16])


# Data import and preprocessing

In [47]:
# Load the strand/energy table and keep the first datasetsize_HP rows.
# A forward slash works on every platform and avoids the invalid "\s"
# escape sequence of the previous Windows-style literal.
strand_table = pd.read_json('data/strandenergylist.json').head(datasetsize_HP)

# Column 1 is used as the regression target.
energy_df = strand_table[1]

# Column 0 holds the strand strings. Splitting on the empty pattern
# yields one column per character, plus an empty leading and an empty
# trailing column; both are dropped so the remaining columns are
# labelled 1..sequence_length.
features_df = strand_table[0].str.split('', expand=True)
features_df = features_df.drop(
    columns=[features_df.columns[0], features_df.columns[-1]]
)

# NOTE(review): this is the first character column of the strands, not
# the energies; it is not used anywhere else in the visible notebook.
labels_df = features_df[1]

### Transformation into PyTorch tensors

In [48]:
# Transform the string features into a DNA index tensor
NUCLEOTIDE_TO_INDEX = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# Column-wise vectorised lookup. This builds a new frame, so features_df
# itself is left untouched and the cell can be re-run safely.
dna_codes = features_df.apply(lambda column: column.map(NUCLEOTIDE_TO_INDEX))

# Anything outside the four-letter alphabet maps to NaN; fail loudly
# instead of letting it reach the integer conversion.
if dna_codes.isna().to_numpy().any():
    raise ValueError("features_df contains symbols other than A, C, G and T")

# copy=True guarantees a writable array for torch.from_numpy.
dna = torch.from_numpy(dna_codes.to_numpy(dtype=int, copy=True))

# Get free energy as a float PyTorch tensor on the compute device
energy = torch.tensor(energy_df.to_numpy()).to(torch.float).to(device)

### Creation of training and validation iterators

In [49]:
dataset = TensorDataset(dna, energy)

# Size the split from the samples that were actually loaded: the source
# file may hold fewer than datasetsize_HP rows, and random_split requires
# the lengths to add up to len(dataset) exactly.
nsamples = len(dataset)
# NOTE(review): split_HP is applied as the *training* fraction here, so
# with split_HP = 0.2 only 20% of the samples are trained on - confirm
# that this is intended.
ntrain = int(split_HP * nsamples)
ntest = nsamples - ntrain
train_set, vali_set = torch.utils.data.random_split(dataset, [ntrain, ntest])

train_dataloader = DataLoader(train_set, batch_size=batchsize_HP, shuffle=True)
# The batch size is at least as large as the validation set, so it is
# served as one full batch; shuffling it would change nothing.
vali_dataloader = DataLoader(vali_set, batch_size=datasetsize_HP, shuffle=False)

# Train the Transformer

In [50]:
transformer = RegressionTranformer()
transformer = transformer.to(device)

transformer_optimizer = torch.optim.Adam(transformer.parameters(), lr=learnrate_HP)


def trainroutine():
    """Train the global ``transformer`` for nepochs_HP epochs.

    Iterates over ``train_dataloader``, optimises ``loss_HP`` with
    ``transformer_optimizer`` and prints the mean batch loss of every epoch.

    Returns:
        list[float]: mean training loss of each epoch, in order.
    """
    transformer.train()
    epoch_losses = []
    for epoch in range(nepochs_HP):
        running_loss = 0.0
        nbatches = 0
        for x_batch, y_batch in train_dataloader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            # Clear gradients left over from the previous step
            transformer_optimizer.zero_grad()

            # Forward propagation
            y_pred_batch = transformer(x_batch)

            # Backpropagation
            batch_loss = loss_HP(y_pred_batch, y_batch)
            batch_loss.backward()
            transformer_optimizer.step()

            # Accumulate statistics
            running_loss += batch_loss.item()
            nbatches += 1

        # Report the epoch average rather than only the last batch;
        # max(..., 1) keeps an empty loader from dividing by zero.
        epoch_loss = running_loss / max(nbatches, 1)
        epoch_losses.append(epoch_loss)
        print(f"epoch {epoch + 1}/{nepochs_HP} - mean train loss: {epoch_loss:.6f}")
    return epoch_losses


trainroutine()
torch.save(transformer.state_dict(), "transformer_weights.pt")


AssertionError: was expecting embedding dimension of 32, but got 30

# Get validation loss (MSE)

In [None]:
def get_reg_accuracy():
    """Return the mean ``loss_HP`` of the global ``transformer`` on ``vali_dataloader``.

    Despite the name, the value is a loss (lower is better), not an
    accuracy. The model is evaluated in eval mode without gradient
    tracking, and its previous train/eval mode is restored afterwards.

    Returns:
        float: per-sample average of the batch losses, or NaN when the
        validation loader yields no samples. The weighting assumes that
        loss_HP averages over the batch (the nn.MSELoss default).
    """
    was_training = transformer.training
    transformer.eval()
    total_loss = 0.0
    nsamples = 0
    with torch.no_grad():
        for x_fullbatch, y_fullbatch in vali_dataloader:
            x_fullbatch = x_fullbatch.to(device)
            y_fullbatch = y_fullbatch.to(device)
            y_pred_fullbatch = transformer(x_fullbatch)
            batch_size = x_fullbatch.size(0)
            # Weight every batch by its size so that all batches count,
            # not just the last one.
            total_loss += loss_HP(y_pred_fullbatch, y_fullbatch).item() * batch_size
            nsamples += batch_size
    transformer.train(was_training)
    if nsamples == 0:
        return float("nan")
    return total_loss / nsamples
print(get_reg_accuracy())

# Notes

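- We could add a positional encoding (e.g. the `PositionalEncoding` module from the PyTorch transformer tutorial, see https://pytorch.org/tutorials/beginner/transformer_tutorial.html), since the model currently receives no information about the order of the tokens.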

Usually, embedding is done in order to compress the representation of words (at the cost of some loss of information).
In our case, there are only 4 possible tokens, which means that no compression is needed.