In [1]:
import math
import os
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import TensorDataset, DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import json
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder

if torch.cuda.is_available():
  torch.set_default_tensor_type('torch.cuda.FloatTensor')

# Hyperparameters

In [2]:
# Transformer HyperHPeters
# The length of our token sequences
ntokens_HP = 30
# Number of attention heads
nheads_HP = 8
# The encoding dimensions of our tokens
token_encode_size_HP = 4
# The embedding string size
# Pytorch will cut this embedded sequence 
# and give an equal amount to each head, different 
# than in theory 
embed_size_HP = token_encode_size_HP *  nheads_HP
# Output size of the heads, which 
# learn an embedding.
head_embedsize_HP = 4
# For some reason PyTorch needs us to do this manually
d_model_HP = head_embedsize_HP * nheads_HP
# No dropout for now
dropout_HP = 0
# Number of encoding layers
n_encoders_HP = 3

# Standard stuff
activation_HP = "relu"
layer_norm_eps_HP = 1e-5
batch_first_HP = True
norm_first_HP = False

# Trainig and validation Hyperparameters
datasetsize_HP = 10
split_HP = 0.9
batchsize_HP = 32
batchsize_HP = min(batchsize_HP, (int)(datasetsize_HP*0.2))
nepochs_HP = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_HP = nn.MSELoss()
learnrate_HP = 1e-4

# Import and preprocess the data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
features_df = pd.read_json('/content/drive/MyDrive/strandenergylist.json')
energy_df = features_df[1]
energy_df = energy_df.head(datasetsize_HP)
features_df = features_df.head(datasetsize_HP)
features_df = features_df[0].str.split('', expand=True)
features_df = features_df.drop([0, 31], axis=1)
labels_df = features_df[1]

### Transformation into pytorch tensors

In [6]:
# Json -> Dataframe
features_df = pd.read_json('/content/drive/MyDrive/strandenergylist.json')
energy_df = features_df[1]
energy_df = energy_df.head(datasetsize_HP)
features_df = features_df.head(datasetsize_HP)
features_df = features_df[0].str.split('', expand=True)
features_df = features_df.drop([0, 31], axis=1)
labels_df = features_df[1]


# Dataframe -> Tensors
dna = features_df.to_numpy()
for i in range(dna.shape[0]):
    for j in range(dna.shape[1]):
        if (dna[i,j] == 'A'): dna[i,j] = 0
        if (dna[i,j] == 'C'): dna[i,j] = 1
        if (dna[i,j] == 'G'): dna[i,j] = 2
        if (dna[i,j] == 'T'): dna[i,j] = 3
dna = dna.astype(int)
dna = torch.from_numpy(dna)
energy = torch.tensor(energy_df.values).to(torch.float)


# Tensors -> Dataloaders
dataset = TensorDataset(dna, energy)
ntrain = (int) (split_HP * datasetsize_HP)
ntest = datasetsize_HP - ntrain
train_set, vali_set = torch.utils.data.random_split(dataset, [ntrain, ntest], generator=torch.Generator(device='cuda'))

train_dataloader = DataLoader(train_set, batch_size=batchsize_HP, shuffle=True, generator=torch.Generator(device='cuda'))
vali_dataloader = DataLoader(vali_set, batch_size=batchsize_HP, shuffle=True, generator=torch.Generator(device='cuda'))

# Create the transformer model

In [9]:
# Created entirely by the Pytorch team and pasted here.
# Adds information of position in the encoding of the tokens.
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class RegressionTranformer(nn.Module):
    def __init__(self):
        super(RegressionTranformer, self).__init__()

        # Takes indexed sequences of tokens and embeds them.
        # [batch_size, seq_len] -> [seq_len, batch_size, embedding_dim] TODO check this!
        self.embedder = nn.Embedding(ntokens_HP, embed_size_HP)

        # Will add information about position in sequence to each token.
        # The second argument is the dropout probability.
        # [seq_len, batch_size, embedding_dim] -> [seq_len, batch_size, embedding_dim] TODO check this!
        self.posi_encoder = PositionalEncoding(embed_size_HP, 0.5)

        # The core of our regression transformer
        transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model = embed_size_HP,
            nhead = nheads_HP,
            dropout = dropout_HP,
            activation = activation_HP,
            layer_norm_eps = layer_norm_eps_HP,
            batch_first = batch_first_HP,
            norm_first = norm_first_HP
        )
        self.encoding_layers = nn.TransformerEncoder(transformer_encoder_layer, n_encoders_HP)

      
        # A linear map returning a scalar value, which is the predicted free
        # energy of the input sequence.
        # [batch_size, flatten_seq_len] -> [1]
        self.linear = nn.Linear(embed_size_HP * ntokens_HP, 1) 


    def forward(self, x):
        ŷ =  self.embedder(x)

        # Normalise the encoding.
        ŷ = ŷ * math.sqrt(embed_size_HP)

        ŷ = torch.permute(ŷ, (1,0,2)) # Necessary for the posi. enc. function.
        ŷ = self.posi_encoder(ŷ)

        ŷ = torch.permute(ŷ, (1,0,2)) # Back to normal
        ŷ = self.encoding_layers(ŷ)

        ŷ = ŷ.flatten(1,2)

        ŷ = self.linear(ŷ)

        return ŷ
        
transformer = RegressionTranformer().to(device)

# Train the Transformer

In [10]:
def get_reg_accuracy():
    total = 0
    
    with torch.inference_mode():
      transformer.eval()
      for x_batch, y_batch in vali_dataloader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            ŷ_batch = transformer(x_batch)
            batch_loss = loss_HP(ŷ_batch, y_batch.unsqueeze(1))
            total += batchsize_HP * batch_loss.item()

    transformer.train()
    return total / (datasetsize_HP*(1-split_HP))


transformer_optimizer = torch.optim.Adam(transformer.parameters(), lr=learnrate_HP)

def trainroutine():
    for epoch in range(nepochs_HP):
        running_loss = 0.0
        for i, (x_batch, y_batch) in enumerate(train_dataloader):
            # Put the data on the device
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            
            # Forward propagation
            ŷ_batch = transformer(x_batch)
            
            # Backpropagation
            batch_loss = loss_HP(ŷ_batch, y_batch.unsqueeze(1))
            transformer_optimizer.zero_grad()
            batch_loss.backward()
            transformer_optimizer.step()
            

            # Print statistics
            running_loss += batch_loss.item()*batchsize_HP
        print("[%d] Training loss: %.3f" %(epoch+1, running_loss/(datasetsize_HP*split_HP)))
        print("[%d] Test loss: %.3f" %(epoch+1, get_reg_accuracy()))
        
trainroutine()
torch.save(transformer.state_dict(),"transformer_weights.pt")


[1] Training loss: 8.872
[1] Test loss: 18.182
[2] Training loss: 4.251
[2] Test loss: 10.000
[3] Training loss: 3.139
[3] Test loss: 6.073
[4] Training loss: 1.969
[4] Test loss: 4.255
[5] Training loss: 1.534
[5] Test loss: 3.307
[6] Training loss: 1.555
[6] Test loss: 2.703
[7] Training loss: 1.801
[7] Test loss: 2.255
[8] Training loss: 1.366
[8] Test loss: 2.180
[9] Training loss: 1.405
[9] Test loss: 2.257
[10] Training loss: 1.190
[10] Test loss: 2.138
[11] Training loss: 1.314
[11] Test loss: 2.067
[12] Training loss: 1.101
[12] Test loss: 1.979
[13] Training loss: 1.359
[13] Test loss: 1.938
[14] Training loss: 0.939
[14] Test loss: 1.934
[15] Training loss: 0.956
[15] Test loss: 2.018
[16] Training loss: 1.412
[16] Test loss: 2.137
[17] Training loss: 0.896
[17] Test loss: 2.404
[18] Training loss: 1.174
[18] Test loss: 2.646
[19] Training loss: 1.012
[19] Test loss: 2.626
[20] Training loss: 0.524
[20] Test loss: 2.732
[21] Training loss: 0.561
[21] Test loss: 2.685
[22] Tra

In [11]:
model_parameters = filter(lambda p: p.requires_grad, transformer.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(params)

414433


# Get validation accuracy

In [12]:
def get_reg_accuracy():
    total = 0
    
    with torch.inference_mode():
      transformer.eval()
      for x_batch, y_batch in vali_dataloader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            ŷ_batch = transformer(x_batch)
            batch_loss = loss_HP(ŷ_batch, y_batch.unsqueeze(1))
            total += batchsize_HP * batch_loss.item()

    transformer.train()
#print( get_reg_accuracy() )

# Notes

- We could add a positionnal encoding (with
PositionalEncoding) (see https://pytorch.org/tutorials/beginner/transformer_tutorial.html)

Usually, embedding is done in order to compress the representation of words (loss of information).
In our case, their are 4 possible tokens, which means that no compression is needed.