In [112]:
import math
import os
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import json
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder

# Hyperparameters

In [113]:
# Only 4 nucleotides possible (ACGT)
ntokens_PARAM = 4
# Number of attention heads
nhead_PARAM = 8
# Output size of the heads, which 
# learn an embedding.
head_embedsize_PARAM = 4
# For some reason PyTorch needs us to do this manually
d_model_PARAM = head_embedsize_PARAM * nhead_PARAM
# Output dimension of the feedforward network
# Set to 1 since this is a regression task
dim_feedforward_PARAM = 1
# No dropout for now
dropout_PARAM = 0
# Standard stuff
activation_PARAM = "relu"
layer_norm_eps_PARAM = 1e-5
batch_first_PARAM = True
norm_first_PARAM = False

# Data:
n_rows_PARAM = 10

In [114]:
# Create transformer model
class RegressionTranformer(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder_transformer = nn.TransformerEncoderLayer(
            d_model = d_model_PARAM,
            nhead = nhead_PARAM,
            dim_feedforward = dim_feedforward_PARAM,
            dropout = dropout_PARAM,
            activation = activation_PARAM,
            layer_norm_eps = layer_norm_eps_PARAM,
            batch_first = batch_first_PARAM,
            norm_first = norm_first_PARAM
        )
        self.embedder = nn.Embedding(ntokens_PARAM, d_model_PARAM)

    def forward(self, X):
        Y = self.embedder(X)
        Y = self.encoder_transformer(Y)
        return Y
        
RegressionTranformer = RegressionTranformer().to(device)

# Import the data and pre-process the data

In [115]:
dna_df = pd.read_json('data\strandenergylist.json')
dna_df = dna_df.head(n_rows_PARAM)
dna_df = dna_df[0].str.split('', expand=True)
dna_df = dna_df.drop([0, 31], axis=1)
dna_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
0,A,T,T,A,C,T,T,G,C,A,...,T,G,G,T,C,G,G,C,T,C
1,T,T,A,A,C,C,C,G,G,C,...,A,A,T,G,A,A,C,T,G,C
2,A,A,T,C,C,G,T,T,T,C,...,T,G,G,T,C,A,A,G,G,C
3,A,G,T,T,C,T,T,C,G,T,...,T,C,T,A,T,A,A,T,A,A
4,G,A,C,A,T,T,C,T,T,T,...,T,A,A,G,T,C,G,A,A,A
5,G,C,T,C,G,T,C,G,C,A,...,G,A,A,C,T,T,T,C,A,T
6,A,A,G,G,G,T,T,C,G,T,...,T,T,A,G,T,C,T,A,G,A
7,G,G,A,T,G,G,C,C,C,G,...,G,C,G,A,A,A,G,T,G,C
8,C,A,G,C,A,G,A,T,T,G,...,T,T,C,A,T,C,A,C,C,T
9,G,T,A,A,C,G,C,A,C,A,...,C,C,T,T,G,A,C,C,T,C


In [116]:
dna = dna_df.to_numpy()
dna
for i in range(dna.shape[0]):
    for j in range(dna.shape[1]):
        if (dna[i,j] == 'A'): dna[i,j] = 1
        if (dna[i,j] == 'C'): dna[i,j] = 2
        if (dna[i,j] == 'G'): dna[i,j] = 3
        if (dna[i,j] == 'T'): dna[i,j] = 4
dna = dna.astype(int)
dna

array([[1, 4, 4, 1, 2, 4, 4, 3, 2, 1, 4, 3, 1, 2, 3, 1, 4, 2, 3, 4, 4, 3,
        3, 4, 2, 3, 3, 2, 4, 2],
       [4, 4, 1, 1, 2, 2, 2, 3, 3, 2, 3, 4, 4, 4, 1, 3, 2, 2, 4, 2, 1, 1,
        4, 3, 1, 1, 2, 4, 3, 2],
       [1, 1, 4, 2, 2, 3, 4, 4, 4, 2, 3, 2, 2, 1, 3, 4, 3, 2, 2, 2, 4, 3,
        3, 4, 2, 1, 1, 3, 3, 2],
       [1, 3, 4, 4, 2, 4, 4, 2, 3, 4, 4, 1, 2, 4, 4, 4, 2, 4, 3, 4, 4, 2,
        4, 1, 4, 1, 1, 4, 1, 1],
       [3, 1, 2, 1, 4, 4, 2, 4, 4, 4, 3, 3, 4, 4, 3, 2, 2, 3, 1, 2, 4, 1,
        1, 3, 4, 2, 3, 1, 1, 1],
       [3, 2, 4, 2, 3, 4, 2, 3, 2, 1, 2, 4, 2, 1, 3, 4, 4, 4, 2, 3, 3, 1,
        1, 2, 4, 4, 4, 2, 1, 4],
       [1, 1, 3, 3, 3, 4, 4, 2, 3, 4, 3, 4, 3, 4, 4, 3, 3, 3, 4, 4, 4, 4,
        1, 3, 4, 2, 4, 1, 3, 1],
       [3, 3, 1, 4, 3, 3, 2, 2, 2, 3, 1, 1, 2, 1, 4, 2, 4, 1, 4, 2, 3, 2,
        3, 1, 1, 1, 3, 4, 3, 2],
       [2, 1, 3, 2, 1, 3, 1, 4, 4, 3, 1, 2, 4, 4, 4, 4, 2, 3, 4, 3, 4, 4,
        2, 1, 4, 2, 1, 2, 2, 4],
       [3, 4, 1, 1, 2, 3, 2, 1, 2, 1,

In [117]:
embedder = nn.Embedding(ntokens_PARAM, d_model_PARAM)
dna_tensor = torch.from_numpy(dna)
y = embedder(dna_tensor)
print(y)


IndexError: index out of range in self

# Train the Transformer

# Notes

- We could add a positionnal encoding (with
PositionalEncoding) (see https://pytorch.org/tutorials/beginner/transformer_tutorial.html)

Usually, embedding is done in order to compress the representation of words (loss of information).
In our case, their are 4 possible tokens, which means that no compression is needed.