<a href="https://colab.research.google.com/github/Pooret/resume/blob/main/pytorch_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


**Positional Encoding**

The purpose of these positional encodings is to inject some information about the relative or absolute position of the tokens in the sequence, since the transformer has no built-in notion of order.

*pos* - position

*i* - dimension

Each dimenson of the positional encoding corresponds to a sinusoid and have wavelengths that form a geometric progression from $2\pi$ to $10000 * 2\pi$

$${PE}_{(pos, 2i)} = \sin(\frac{pos}{10000^{2i / d_{model}}})$$
$${PE}_{(pos, 2i+1)} = \cos(\frac{pos}{10000^{2i / d_{model}}})$$



In [26]:
import torch
import torch.nn as nn
import math

d_model = 512
max_len = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len=500):
    super(PositionalEncoding, self).__init__()
    self.encoding = torch.zeros(1, max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) # (max_len, 1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float()* -(math.log(10000.0) / d_model)) # decreasing scaling factor length of d_model//2

    self.encoding[:, :, 0::2] = torch.sin(position * div_term)
    self.encoding[:, :, 1::2] = torch.cos(position * div_term)

  def forward(self, x):
    """
    x - (batch_size, seq_len, d_model)
    """
    seq_len = x.size(1)
    return x + self.encoding[:, :seq_len, :].to(device)

pos_encoder = PositionalEncoding(d_model, max_len)
input_tensor = torch.zeros(1, max_len, d_model).to(device)
output = pos_encoder(input_tensor)
print(output.shape)  # (1, max_len, d_model)

torch.Size([1, 100, 512])


$$\mathbf{Attention}(Q, K, V) = \mathsf{softmax}(\frac{QK^T}{\sqrt{d_{k}}})V$$

In [27]:
d_model = 512
max_len = 100
dropout = 0.1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class ScaledDotProductAttention(nn.Module):
  def __init__(self, d_model, dropout=0.1):
    super(ScaledDotProductAttention, self).__init__()
    self.temperature = math.sqrt(d_model) # scaling factor
    self.dropout = nn.Dropout(dropout)
    self.softmax = nn.Softmax(dim=2)

  def forward(self, q, k, v, mask=None):
    """
    q, k, v (batch_size, n_heads, seq_len, d_k)
    """
    # k transpose (batch_size, n_heads, d_k, seq_len)
    attn = torch.matmul(q, k.transpose(-2, -1)) / self.temperature # attn - (batch_size, n_heads, seq_len, seq_len)
    if mask is not None:
      attn = attn.masked_fill(mask == 0, -1e9) # large negative numbers for softmax
    attn = self.softmax(attn)
    attn = self.dropout(attn)
    output = torch.matmul(attn, v)
    return output, attn # (batch_size, n_heads, seq_len, d_k) (batch_size, n_heads, seq_len, seq_len)

attn = ScaledDotProductAttention(d_model, dropout)
q = torch.rand(64, 10, d_model)  # (batch_size, seq_len, n_heads * d_k)
k = torch.rand(64, 10, d_model)
v = torch.rand(64, 10, d_model)
output, attn_weights = attn(q, k, v)
print(output.shape)  # (batch_size, seq_len, n_heads * d_k)
print(attn_weights.shape)  # (batch_size, seq_len, seq_len)

torch.Size([64, 10, 512])
torch.Size([64, 10, 10])


In [28]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, n_heads, dropout = 0.1):
    super(MultiHeadAttention, self).__init__()
    self.n_heads = n_heads
    self.d_model = d_model
    self.d_k = d_model // n_heads

    self.q_linear = nn.Linear(d_model, d_model)
    self.k_linear = nn.Linear(d_model, d_model)
    self.v_linear = nn.Linear(d_model, d_model)
    self.fc = nn.Linear(d_model, d_model)

    self.attention = ScaledDotProductAttention(self.d_k, dropout)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = nn.LayerNorm(d_model) # normalize across features dimension

  def forward(self, q, k, v, mask=None):
    batch_size = q.size(0) # (batch_size, seq_len, d_model)

    # q, k, v - (batch_size, seq_len, n_heads, d_k)
    q = self.q_linear(q).view(batch_size, -1, self.n_heads, self.d_k) # n_heads * d_k = d_model
    k = self.k_linear(k).view(batch_size, -1, self.n_heads, self.d_k)
    v = self.v_linear(v).view(batch_size, -1, self.n_heads, self.d_k)

    # (batch_size, seq_len, n_heads, d_k) -> (batch_size, n_heads, seq_len, d_k)
    q = q.transpose(1,2)
    k = k.transpose(1,2)
    v = v.transpose(1,2)

    attn_output, attn_weights = self.attention(q, k, v, mask) # outputs - (batch_size, n_heads, seq_len, d_k) (batch_size, n_heads, seq_len, seq_len)

    # The contiguous() function is required because transpose may change the memory layout, making it non-contiguous. The view function requires a contiguous tensor
    attn_output = attn_output.transpose(1,2).contiguous().view(batch_size, -1, self.d_model) #  (batch_size, n_heads, seq_len, d_k) - > (batch_size, seq_len, n_heads * d_k)
    output = self.dropout(self.fc(attn_output)) # (batch_size, seq_len, d_model)
    output = self.layer_norm(output + q.reshape(batch_size, -1, self.d_model))

    return output, attn_weights # (batch_size, seq_len, d_model), (batch_size, n_heads, seq_len, seq_len)

d_model = 512
n_heads = 8
dropout = 0.1
multi_head_attn = MultiHeadAttention(d_model, n_heads, dropout)
q = torch.rand(64, 10, d_model)  # (batch_size, seq_len, n_heads * d_k)
k = torch.rand(64, 10, d_model)
v = torch.rand(64, 10, d_model)
output, attn_weights = multi_head_attn(q, k, v)
print(output.shape)  # (batch_size, seq_len, d_model)
print(attn_weights.shape)  #(batch_size, n_heads, seq_len, seq_len)

torch.Size([64, 10, 512])
torch.Size([64, 8, 10, 10])


In [29]:
import torch.nn.functional as F

class PositionwiseFeedForward(nn.Module):
  def __init__(self, d_model, d_ff, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.linear1 = nn.Linear(d_model, d_ff)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(d_ff, d_model)
    self.layer_norm = nn.LayerNorm(d_model)

  def forward(self, x):
    residual = x
    x = self.linear1(x)
    x = F.relu(x)
    x = self.dropout(x)
    x = self.linear2(x)
    x = self.dropout(x)
    x = self.layer_norm(x + residual) # normalize across features dimension
    return x

d_model = 512
d_ff = 2048
dropout = 0.1
ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
input_tensor = torch.rand(64, 10, d_model)  # (batch_size, seq_len, d_model)
output = ffn(input_tensor)
print(output.shape)  # (batch_size, seq_len, d_model)

torch.Size([64, 10, 512])


In [30]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        # Self-attention sublayer
        attn_output, _ = self.self_attn(x, x, x, mask)
        x = self.layer_norm(x + attn_output)  # Add & Norm

        # Feed-forward sublayer
        x = self.feed_forward(x)

        return x

d_model = 512
n_heads = 8
d_ff = 2048
num_layers = 1
dropout = 0.1
encoder_layer = EncoderLayer(d_model, n_heads, d_ff, dropout)
input_tensor = torch.rand(64, 10, d_model)  # (batch_size, seq_len, d_model)
output = encoder_layer(input_tensor)
print(output.shape)  # (batch_size, seq_len, d_model)

torch.Size([64, 10, 512])


In [31]:
class DecoderLayer(nn.Module):
  def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.self_attn = MultiHeadAttention(d_model, n_heads, dropout)
    self.cross_attn = MultiHeadAttention(d_model, n_heads, dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model)

  def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
    # self-attention sublayer
    self_attn_output, _ = self.self_attn(x, x, x, tgt_mask)
    x = self.layer_norm(x + self_attn_output) # add and norm

    # cross-attention sublayer (q->x, k,v -> enc_output)
    cross_attn_output, _ = self.cross_attn(x, enc_output, enc_output, src_mask)

    # feed-forward sublayer
    x = self.feed_forward(x)

    return x # (batch_size, seq_len, d_model)

d_model = 512
n_heads = 8
d_ff = 2048
dropout = 0.1
decoder_layer = DecoderLayer(d_model, n_heads, d_ff, dropout)
input_tensor = torch.rand(64, 10, d_model)  # (batch_size, seq_len, d_model)
enc_output = torch.rand(64, 10, d_model)  # (batch_size, seq_len, d_model)
output = decoder_layer(input_tensor, enc_output)
print(output.shape)  # (batch_size, seq_len, d_model)

torch.Size([64, 10, 512])


In [34]:
class Encoder(nn.Module):
  def __init__(self, d_model, n_heads, d_ff, num_layers, dropout=0.1):
    super(Encoder, self).__init__()
    self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(num_layers)])
    self.layer_norm = nn.LayerNorm(d_model)

  def forward(self, x, mask=None):
    for layer in self.layers:
      x = layer(x, mask)
    x = self.layer_norm(x)
    return x

class Decoder(nn.Module):
  def __init__(self, d_model, n_heads, d_ff, num_layers, dropout=0.1):
    super(Decoder, self).__init__()
    self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(num_layers)])
    self.layer_norm = nn.LayerNorm(d_model)

  def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
    for layer in self.layers:
      x = layer(x, enc_output, src_mask, tgt_mask)
    x = self.layer_norm(x)
    return x

class Transformer(nn.Module):
  def __init__(self, d_model, n_heads, d_ff, num_layers, src_vocab_size, tgt_vocab_size, max_len, dropout=0.1):
    super(Transformer, self).__init__()
    self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
    self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
    self.positional_encoding = PositionalEncoding(d_model, max_len)
    self.encoder = Encoder(d_model, n_heads, d_ff, num_layers, dropout)
    self.decoder = Decoder(d_model, n_heads, d_ff, num_layers, dropout)
    self.fc = nn.Linear(d_model, tgt_vocab_size)

  def forward(self, src, tgt, src_mask=None, tgt_mask=None):
    # Encoder
    enc_output = self.encoder_embedding(src) # enc_output - (batch_size, src_len, d_model)
    enc_output = self.positional_encoding(enc_output)
    enc_output = self.encoder(enc_output, src_mask)

    # Decoder
    dec_output = self.decoder_embedding(tgt) # dec_output - (batch_size, tgt_len, d_model)
    dec_output = self.positional_encoding(enc_output)
    dec_output = self.decoder(dec_output, enc_output, src_mask, tgt_mask)

    # Final layer
    output = self.fc(dec_output) # (batch_size, tgt_len, tgt_vocab_size)
    return output

d_model = 512
n_heads = 8
d_ff = 2048
num_layers = 6
src_vocab_size = 10000
tgt_vocab_size = 10000
max_len = 100
dropout = 0.1
transformer = Transformer(d_model, n_heads, d_ff, num_layers, src_vocab_size, tgt_vocab_size, max_len, dropout).to(device)
src = torch.randint(0, src_vocab_size, (64, 10)).to(device)  # (batch_size, src_len)
tgt = torch.randint(0, tgt_vocab_size, (64, 10)).to(device)  # (batch_size, tgt_len)
output = transformer(src, tgt)
print(output.shape) # (batch_size, tgt_len, tgt_vocab_size)

torch.Size([64, 10, 10000])


In [35]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Datasets/chemdata/smiles_to_iupac/df_small.csv")
df.head()

Unnamed: 0,molregno,canonical_smiles,activity_id,standard_type,standard_value,standard_units,assay_id,tid,target_chembl_id,organism,...,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,np_likeness_score,bei,le,lle,sei,smiles_len,iupac
0,188,COC(=O)[C@@H](N)CO,339708,Km,34000000.0,nM,68870,11770,CHEMBL2943,Rattus norvegicus,...,4.0,3.0,0.0,0.91,,,,,18,methyl (2S)-2-amino-3-hydroxypropanoate
1,238,CCc1cc2c(O)ncnc2s1,16421311,Potency,100000.0,nM,1543582,114868,CHEMBL612558,Homo sapiens,...,3.0,1.0,0.0,-1.96,27.02,0.55,2.91,10.58,18,"6-ethyl-3H-thieno[2,3-d]pyrimidin-4-one"
2,242,C1=C(c2ccccc2)CCNC1,757035,IC50,12000000.0,nM,226540,13054,CHEMBL3998,Rattus norvegicus,...,1.0,1.0,0.0,0.27,37.68,0.68,3.94,49.88,19,"4-phenyl-1,2,3,6-tetrahydropyridine"
3,265,CCc1cc2c(Cl)ncnc2s1,994954,IC50,100000.0,nM,213817,22224,CHEMBL612558,Homo sapiens,...,2.0,0.0,0.0,-2.23,23.88,0.54,1.83,18.4,19,"4-chloro-6-ethylthieno[2,3-d]pyrimidine"
4,423,CSc1ccc(CC(C)N)cc1,5232140,Ki,630957.34,nM,725353,106368,CHEMBL614910,Rattus norvegicus,...,1.0,2.0,0.0,-0.66,39.33,0.81,4.83,27.4,18,1-(4-methylsulfanylphenyl)propan-2-amine


In [36]:
df = df[~df['iupac'].isnull()].reset_index(drop=True)
smiles = df['canonical_smiles']
labels = df['iupac']
smiles, labels

(0        COC(=O)[C@@H](N)CO
 1        CCc1cc2c(O)ncnc2s1
 2       C1=C(c2ccccc2)CCNC1
 3       CCc1cc2c(Cl)ncnc2s1
 4        CSc1ccc(CC(C)N)cc1
                ...         
 7364      Cc1csc(N)c1C(N)=O
 7365      Cc1cc(CO)c2n1CSC2
 7366      C#CC(=O)Nc1ccccc1
 7367          C#CCNC(=O)C#C
 7368           N#Cc1cccnc1O
 Name: canonical_smiles, Length: 7369, dtype: object,
 0                 methyl (2S)-2-amino-3-hydroxypropanoate
 1                 6-ethyl-3H-thieno[2,3-d]pyrimidin-4-one
 2                     4-phenyl-1,2,3,6-tetrahydropyridine
 3                 4-chloro-6-ethylthieno[2,3-d]pyrimidine
 4                1-(4-methylsulfanylphenyl)propan-2-amine
                               ...                        
 7364              2-amino-4-methylthiophene-3-carboxamide
 7365    (5-methyl-1,3-dihydropyrrolo[1,2-c][1,3]thiazo...
 7366                               N-phenylprop-2-ynamide
 7367                          N-prop-2-ynylprop-2-ynamide
 7368                     2-oxo-1H-py

In [37]:
from torch.utils.data import DataLoader, Dataset
import re
import pandas as pd
from collections import Counter
import itertools

# Tokenization functions
def tokenize_iupac(name):
    patterns = [
        r'\([^\)]+\)',  # Match groups inside parentheses
        r'\[[^\]]+\]',  # Match groups inside square brackets
        r'[A-Za-z]+-\d+',  # Match chemical groups with numbers (e.g., ethyl-6)
        r'\d+-[A-Za-z]+',  # Match numbers with chemical groups (e.g., 2-amino)
        r'\d+',  # Match standalone numbers
        r'[A-Za-z]+',  # Match standalone chemical groups
        r'[^\s\w]',     # Match remaining single non-word characters
    ]
    combined_pattern = '|'.join(patterns)
    tokens = re.findall(combined_pattern, name)
    return tokens

def tokenize_smiles(smiles):
    # Simple tokenization for SMILES (you can replace with a more complex tokenizer if needed)
    return list(smiles)


# Tokenize the SMILES and IUPAC names
df['smiles_tokens'] = df['canonical_smiles'].apply(tokenize_smiles)
df['iupac_tokens'] = df['iupac'].apply(tokenize_iupac)

# Build the vocabularies
def build_vocab(token_list, min_freq=1):
    counter = Counter(itertools.chain.from_iterable(token_list))
    vocab = {token: idx for idx, (token, freq) in enumerate(counter.items(), start=4) if freq >= min_freq}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    vocab['<unk>'] = 3
    return vocab

smiles_vocab = build_vocab(df['smiles_tokens'])
iupac_vocab = build_vocab(df['iupac_tokens'])

# Convert sentences to sequences of token IDs
def sentence_to_ids(sentence, vocab, tokenize_fn, add_special_tokens=True):
    tokens = tokenize_fn(sentence)
    ids = [vocab.get(token, vocab['<unk>']) for token in tokens]
    if add_special_tokens:
        ids = [vocab['<sos>']] + ids + [vocab['<eos>']]
    return ids

df['smiles_ids'] = df['canonical_smiles'].apply(lambda x: sentence_to_ids(x, smiles_vocab, tokenize_smiles))
df['iupac_ids'] = df['iupac'].apply(lambda x: sentence_to_ids(x, iupac_vocab, tokenize_iupac))

# Define the Dataset class
class ChemistryDataset(Dataset):
    def __init__(self, df, src_vocab, tgt_vocab, max_len=100):
        self.df = df
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src_seq = self.df.iloc[idx]['smiles_ids']
        tgt_seq = self.df.iloc[idx]['iupac_ids']
        src_seq = self.pad_sequence(src_seq, self.src_vocab)
        tgt_seq = self.pad_sequence(tgt_seq, self.tgt_vocab)
        return torch.tensor(src_seq), torch.tensor(tgt_seq)

    def pad_sequence(self, seq, vocab):
        if len(seq) < self.max_len:
            seq = seq + [vocab['<pad>']] * (self.max_len - len(seq))
        else:
            seq = seq[:self.max_len]
        return seq

# Initialize Dataset and DataLoader
max_len = 50
dataset = ChemistryDataset(df, smiles_vocab, iupac_vocab, max_len)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Example: Check a batch
for src_batch, tgt_batch in dataloader:
    print(src_batch)
    print(tgt_batch)
    break


tensor([[ 1, 13, 13,  ...,  0,  0,  0],
        [ 1,  5,  7,  ...,  0,  0,  0],
        [ 1,  4,  4,  ...,  0,  0,  0],
        ...,
        [ 1,  4,  5,  ...,  0,  0,  0],
        [ 1, 13, 26,  ...,  0,  0,  0],
        [ 1, 20, 14,  ...,  0,  0,  0]])
tensor([[   1, 2574,    6,  ...,    0,    0,    0],
        [   1, 3221,    6,  ...,    0,    0,    0],
        [   1,  413,    6,  ...,    0,    0,    0],
        ...,
        [   1,  176,    6,  ...,    0,    0,    0],
        [   1,  257,    6,  ...,    0,    0,    0],
        [   1, 2387,    6,  ...,    0,    0,    0]])


In [22]:
from tqdm import tqdm
import torch.optim as optim

# Hyperparameters
d_model = 512
n_heads = 8
d_ff = 2048
num_layers = 6
src_vocab_size = len(smiles_vocab)
tgt_vocab_size = len(iupac_vocab)
max_len = 50
dropout = 0.1
num_epochs = 20
learning_rate = 1e-4
batch_size = 32

# Initialize the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(d_model, n_heads, d_ff, num_layers, src_vocab_size, tgt_vocab_size, max_len, dropout).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=smiles_vocab['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop with tqdm
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for src_batch, tgt_batch in progress_bar:
        src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)

        optimizer.zero_grad()

        output = model(src_batch, tgt_batch[:, :-1])

        # Adjust reshaping to match the target sequence length
        output = output[:, :-1, :].contiguous().view(-1, output.size(-1))
        tgt_batch = tgt_batch[:, 1:].contiguous().view(-1)

        loss = criterion(output, tgt_batch)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=epoch_loss/len(dataloader))

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(dataloader)}")


Epoch 1/20: 100%|██████████| 231/231 [00:36<00:00,  6.27it/s, loss=4.62]


Epoch 1/20, Loss: 4.616379322943749


Epoch 2/20: 100%|██████████| 231/231 [00:36<00:00,  6.41it/s, loss=3.97]


Epoch 2/20, Loss: 3.969528725652984


Epoch 3/20: 100%|██████████| 231/231 [00:36<00:00,  6.39it/s, loss=3.47]


Epoch 3/20, Loss: 3.4718978848808257


Epoch 4/20: 100%|██████████| 231/231 [00:36<00:00,  6.37it/s, loss=3.02]


Epoch 4/20, Loss: 3.0190702609685593


Epoch 5/20: 100%|██████████| 231/231 [00:36<00:00,  6.41it/s, loss=2.58]


Epoch 5/20, Loss: 2.583978627151225


Epoch 6/20: 100%|██████████| 231/231 [00:36<00:00,  6.40it/s, loss=2.18]


Epoch 6/20, Loss: 2.178943733116249


Epoch 7/20: 100%|██████████| 231/231 [00:36<00:00,  6.39it/s, loss=1.79]


Epoch 7/20, Loss: 1.7854753820411056


Epoch 8/20: 100%|██████████| 231/231 [00:36<00:00,  6.41it/s, loss=1.44]


Epoch 8/20, Loss: 1.436268154160801


Epoch 9/20: 100%|██████████| 231/231 [00:36<00:00,  6.40it/s, loss=1.11]


Epoch 9/20, Loss: 1.111081700046341


Epoch 10/20: 100%|██████████| 231/231 [00:36<00:00,  6.40it/s, loss=0.836]


Epoch 10/20, Loss: 0.8362588332845019


Epoch 11/20: 100%|██████████| 231/231 [00:36<00:00,  6.39it/s, loss=0.609]


Epoch 11/20, Loss: 0.6085210788301575


Epoch 12/20: 100%|██████████| 231/231 [00:36<00:00,  6.40it/s, loss=0.448]


Epoch 12/20, Loss: 0.44843552664760905


Epoch 13/20: 100%|██████████| 231/231 [00:36<00:00,  6.39it/s, loss=0.33]


Epoch 13/20, Loss: 0.3295167518771572


Epoch 14/20: 100%|██████████| 231/231 [00:36<00:00,  6.39it/s, loss=0.25]


Epoch 14/20, Loss: 0.25008180273043645


Epoch 15/20: 100%|██████████| 231/231 [00:36<00:00,  6.41it/s, loss=0.195]


Epoch 15/20, Loss: 0.19481092262448688


Epoch 16/20: 100%|██████████| 231/231 [00:36<00:00,  6.37it/s, loss=0.164]


Epoch 16/20, Loss: 0.16387022038300833


Epoch 17/20: 100%|██████████| 231/231 [00:36<00:00,  6.40it/s, loss=0.131]


Epoch 17/20, Loss: 0.13113447562569663


Epoch 18/20: 100%|██████████| 231/231 [00:36<00:00,  6.41it/s, loss=0.112]


Epoch 18/20, Loss: 0.11218131556139364


Epoch 19/20: 100%|██████████| 231/231 [00:36<00:00,  6.40it/s, loss=0.113]


Epoch 19/20, Loss: 0.1131838307881252


Epoch 20/20: 100%|██████████| 231/231 [00:36<00:00,  6.39it/s, loss=0.0933]

Epoch 20/20, Loss: 0.09328456190757421





In [43]:
def translate_smiles(model, smiles, src_vocab, tgt_vocab, max_len):
    model.eval()

    # Tokenize the input SMILES string
    src_seq = sentence_to_ids(smiles, src_vocab, tokenize_smiles, add_special_tokens=False)
    src_seq = torch.tensor(src_seq).unsqueeze(0).to(device)

    # Prepare the input tensor with padding if necessary
    if src_seq.size(1) < max_len:
        src_seq = torch.cat([src_seq, torch.tensor([src_vocab['<pad>']] * (max_len - src_seq.size(1))).unsqueeze(0).to(device)], dim=1)
    else:
        src_seq = src_seq[:, :max_len]

    # Prepare the source mask
    src_mask = (src_seq != src_vocab['<pad>']).unsqueeze(1).unsqueeze(2)

    # Pass through the encoder
    with torch.no_grad():
        memory = model.encoder_embedding(src_seq)
        memory = model.positional_encoding(memory)
        memory = model.encoder(memory, src_mask)

    # Prepare the initial decoder input
    tgt_seq = torch.tensor([tgt_vocab['<sos>']]).unsqueeze(0).to(device)

    generated_tokens = []

    for _ in range(max_len):
        # Prepare the target mask
        tgt_mask = (tgt_seq != tgt_vocab['<pad>']).unsqueeze(1).unsqueeze(2)

        # Pass through the decoder
        with torch.no_grad():
            tgt_emb = model.decoder_embedding(tgt_seq)
            tgt_emb = model.positional_encoding(tgt_emb)
            tgt_emb = model.decoder(tgt_emb, memory, src_mask, tgt_mask)
            output = model.fc(tgt_emb)

        # Get the next token
        next_token = output.argmax(dim=-1)[:, -1]
        generated_tokens.append(next_token.item())

        # Update the target sequence
        tgt_seq = torch.cat([tgt_seq, next_token.unsqueeze(0)], dim=1)

        # Stop if the end of sequence token is generated
        if next_token.item() == tgt_vocab['<eos>']:
            break

    # Convert token IDs to tokens
    translated_tokens = [list(tgt_vocab.keys())[list(tgt_vocab.values()).index(token)] for token in generated_tokens]

    return ' '.join(translated_tokens)

# Example usage:
# Assuming the model and vocabularies are properly defined and trained
translated_iupac = translate_smiles(model, df['canonical_smiles'].iloc[1], smiles_vocab, iupac_vocab, max_len)
print(f"Translated IUPAC name: {translated_iupac}")


Translated IUPAC name: <eos>
