In [None]:
import json
import math
import os

import pandas as pd
import sklearn
import torch
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters

In [None]:
# Transformer hyperparameters
# Vocabulary size: only 4 nucleotides are possible (A, C, G, T)
ntokens_HP = 4
# Number of attention heads
nhead_HP = 8
# Embedding width handled by each attention head
head_embedsize_HP = 4
# Total model width. The encoder layer is configured with the full width
# (d_model), which is divided evenly across the heads, so it is derived here
# as per-head width * number of heads.
d_model_HP = head_embedsize_HP * nhead_HP
# Passed as `dim_feedforward` to nn.TransformerEncoderLayer.
# NOTE(review): per the PyTorch docs this is the width of the hidden layer
# inside the encoder's feedforward sub-block, not the model's output size --
# the layer still emits d_model_HP features per position. A value of 1 forces
# that sub-block through a single unit; confirm this is intended.
dim_feedforward_HP = 1
# No dropout for now
dropout_HP = 0
# Remaining nn.TransformerEncoderLayer settings
activation_HP = "relu"
layer_norm_eps_HP = 1e-5
# Tensors are laid out as (batch, sequence, feature)
batch_first_HP = True
# Apply layer normalisation after (not before) each sub-block
norm_first_HP = False

# Data:
# Number of rows kept from the loaded strand/energy table
n_rows_HP = 100
# Fraction of the rows used when computing the split sizes.
# NOTE(review): the dataset cell multiplies this by n_rows_HP to obtain
# `ntrain`, i.e. it acts as the training fraction there -- confirm that 0.2
# is not meant to be the held-out fraction instead.
split_HP = 0.2

In [None]:
# Create transformer model
class RegressionTransformer(nn.Module):
    """Token embedding followed by a single Transformer encoder layer.

    Every size and option is read from the module-level ``*_HP``
    hyperparameters when the model is constructed.

    NOTE(review): ``forward`` returns one ``d_model_HP``-wide feature vector
    per sequence position. Nothing in this module pools or projects those
    features down to a single value per strand, so a regression head still
    has to be applied on top -- confirm where that is meant to happen.
    """

    def __init__(self):
        super().__init__()
        # Construction order (encoder first, embedding second) is kept as in
        # the original so seeded initialisation and state_dict key order do
        # not change.
        self.encoder_transformer = nn.TransformerEncoderLayer(
            d_model = d_model_HP,
            nhead = nhead_HP,
            dim_feedforward = dim_feedforward_HP,
            dropout = dropout_HP,
            activation = activation_HP,
            layer_norm_eps = layer_norm_eps_HP,
            batch_first = batch_first_HP,
            norm_first = norm_first_HP
        )
        self.embedder = nn.Embedding(ntokens_HP, d_model_HP)

    def forward(self, X):
        """Embed integer token ids and run them through the encoder layer.

        Args:
            X: integer tensor of token ids in ``[0, ntokens_HP)``, as required
                by ``nn.Embedding``. Assumed to be shaped (batch, seq_len)
                because the encoder is built with ``batch_first_HP`` --
                TODO confirm against the caller.

        Returns:
            The encoder output: ``X``'s shape with a trailing dimension of
            size ``d_model_HP`` appended.
        """
        Y = self.embedder(X)
        Y = self.encoder_transformer(Y)
        return Y


# Keep the instance under its own name so the class is not overwritten by
# its own instance (which made it impossible to build a second model).
model = RegressionTransformer().to(device)
# Backward-compatible alias: this (misspelled) name has always referred to the
# instantiated model once this cell has run.
RegressionTranformer = model

# Data import and preprocessing

In [None]:
# Build the path portably: the previous literal used a backslash, which only
# resolves on Windows and is an invalid escape sequence ('\s') in a plain
# string literal.
strands_path = os.path.join('data', 'strandenergylist.json')

# Truncate BEFORE splitting into features/targets so both keep the same rows.
# Column 0 is used as the strand string and column 1 as its energy.
strands_raw_df = pd.read_json(strands_path).head(n_rows_HP)
strand_series = strands_raw_df[0]
energy_df = strands_raw_df[1]

# A rectangular tensor needs every strand to have the same length; fail early
# with a clear message instead of padding silently with None.
strand_lengths = strand_series.str.len()
if strand_lengths.nunique() > 1:
    raise ValueError(
        "Strands have differing lengths: "
        f"{sorted(strand_lengths.unique().tolist())}"
    )

# One column per nucleotide position. Columns are labelled 1..L, matching the
# labels the previous split-and-drop approach produced for 30-nt strands, but
# without hard-coding the strand length.
features_df = pd.DataFrame(
    strand_series.map(list).tolist(),
    index=strands_raw_df.index,
)
features_df.columns = range(1, features_df.shape[1] + 1)

# Regression targets. Previously this picked column 1 of the *split* frame,
# i.e. the first nucleotide of each strand, rather than the energy.
labels_df = energy_df

### Conversion to PyTorch tensors

In [None]:
# Transform string features into a DNA tensor
NUCLEOTIDE_TO_ID = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# Map column by column into a NEW frame: writing into the array returned by
# features_df.to_numpy() can modify features_df itself and made this cell
# depend on how many times it had been run.
encoded_df = features_df.apply(lambda column: column.map(NUCLEOTIDE_TO_ID))
if encoded_df.isna().any().any():
    # Anything outside A/C/G/T (lowercase, 'N', missing values, ...) maps to NaN.
    unknown_symbols = sorted(
        set(features_df.to_numpy().ravel()) - set(NUCLEOTIDE_TO_ID), key=str
    )
    raise ValueError(f"Unexpected symbols in strands: {unknown_symbols}")

# Integer token ids, one row per strand -- the form nn.Embedding consumes.
dna_tokens = torch.tensor(encoded_df.to_numpy(dtype='int64'))
# One-hot view with a trailing axis of size 4 (same content as before).
dna = F.one_hot(dna_tokens, num_classes=len(NUCLEOTIDE_TO_ID))

# Get free energy as pytorch tensor.
# Select exactly the rows present in features_df so inputs and targets stay
# aligned even if energy_df still holds the untruncated column, and use
# float32 to match the default dtype of the model parameters.
energy = torch.tensor(
    energy_df.loc[features_df.index].to_numpy(), dtype=torch.float32
)

# Show a compact summary instead of dumping whole tensors.
print(f"dna: {tuple(dna.shape)}, energy: {tuple(energy.shape)}")
print(energy[:5])

### Creation of training and validation iterators

In [None]:
# The model starts with nn.Embedding, which consumes integer token ids. If
# `dna` is one-hot encoded (3 axes), collapse the class axis back to ids.
token_ids = dna.argmax(dim=-1) if dna.dim() == 3 else dna
# Guard against `energy` having been built from more rows than `dna`: rows are
# in file order, so the first len(dna) targets belong to the kept strands.
targets = energy[:token_ids.size(0)].to(torch.float32)
dataset = TensorDataset(token_ids, targets)

# Size the split from the data actually loaded rather than from n_rows_HP, so
# a file with fewer rows than requested does not break random_split.
n_samples = len(dataset)
# split_HP is applied as the training fraction, as in the original expression.
# NOTE(review): with split_HP = 0.2 this trains on 20% and validates on 80%;
# confirm split_HP is not meant to be the held-out fraction.
ntrain = int(round(split_HP * n_samples))
ntest = n_samples - ntrain

# Seed the split so Restart & Run All reproduces the same partition.
split_seed = 0
train_set, vali_set = torch.utils.data.random_split(
    dataset,
    [ntrain, ntest],
    generator=torch.Generator().manual_seed(split_seed),
)

# Mini-batch size shared by both loaders.
batch_size_HP = 16
train_dataloader = DataLoader(train_set, batch_size=batch_size_HP, shuffle=True)
vali_dataload = DataLoader(vali_set, batch_size=batch_size_HP, shuffle=False)
# Legacy name kept for compatibility: in this cell the first loader was called
# `test_dataloader` although it is paired with `train_set`.
# NOTE(review): confirm no separate test split was intended.
test_dataloader = train_dataloader

# Train the Transformer

# Notes

- We could add a positional encoding (with a `PositionalEncoding` module; see https://pytorch.org/tutorials/beginner/transformer_tutorial.html).

Usually, embedding is done in order to compress the representation of words (at the cost of some loss of information).
In our case, there are only 4 possible tokens, which means that no compression is needed.