In [7]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import csv
import os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
with open('../Data/train.csv', 'r') as f:
    reader = csv.reader(f)
    train_data = list(reader)[1:]
  
train_data

In [9]:
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.tokenizer import GPT4Tokenizer
tokenizer = GPT4Tokenizer()
tokenizer.load_vocab('vocab.json')


In [10]:
print(tokenizer.encode("NC(=O)c1ccc(Cl)cc1)[C@H]1[C@@H]2C[C@@H](n3cnc4cccc(F)c43)C[C@@H]21CNC[C@H](c1ccc2ccccc2c1)[C@H](O)c1cncc(OC)c1COc1cnc(NCCNCCCF)cc1[C@@H]1c2[nH]c3ccccc3c2C[C@@H](C)N1CC(F)(F)FC[C@@]1(c2cc(NC(=O)c3ccc(C#N)cn3)ccc2F)Cn2cc(C#N)n"))

[375, 269, 335, 324, 487, 438, 347, 413, 386, 256, 270, 288, 356, 488, 272, 289, 278, 352, 360, 325, 278, 302, 338, 256, 412, 41, 308, 79, 338, 313, 257, 78, 390, 498, 407, 485, 358, 400, 408, 457, 371, 336, 49, 385, 397, 284, 280, 454, 273, 361, 439, 67, 420, 259, 344, 296]


In [None]:
encoded_data = []
for row in train_data:
    text = row[0]  # SMILES string
    label = int(row[1])  # Label
    tokens = tokenizer.encode(text)
    encoded_data.append((torch.tensor(tokens), label))

(tensor([ 67, 289, 278, 352, 465, 325, 279, 302, 320, 353, 258]), 0)

In [13]:
vocab = list(set(''.join([data[0] for data in encoded_data])))
vocab_size = len(vocab)
print(vocab)
print(vocab_size)

TypeError: sequence item 0: expected str instance, Tensor found

In [None]:
def get_batch(block_size, batch_size):
    

In [12]:
class Attention(nn.Module):
  def __init__(self, emb_dim, n_heads, hidden_dim, dropout):
    super(Attention, self).__init__()
    self.emb_dim = emb_dim
    self.n_heads = n_heads
    self.hidden_dim = hidden_dim
    self.dropout = dropout
    self.att = nn.MultiheadAttention(emb_dim, n_heads, dropout=dropout)
    
    self.q = nn.Linear(emb_dim, hidden_dim)
    self.k = nn.Linear(emb_dim, hidden_dim)
    self.v = nn.Linear(emb_dim, hidden_dim)
    
  def forward(self, x):
    q = self.q(x)
    k = self.k(x)
    v = self.v(x)
    
    return self.att(q, k, v)[0]

In [13]:
class FeedForward(nn.Sequential):
  def __init__(self, emb_dim, hidden_dim, dropout):
    super().__init__(
      nn.Linear(emb_dim, hidden_dim),
      nn.GELU(inplace=False),
      nn.Dropout(dropout),
      nn.Linear(hidden_dim, emb_dim),
      nn.Dropout(dropout)
    )

In [11]:
class TransformerBlock(nn.Module):
  def __init__(self, emb_dim, n_heads, hidden_dim, dropout):
    super(TransformerBlock, self).__init__()
    
    self.attention = Attention(emb_dim, n_heads, hidden_dim, dropout)
    self.norm1 = nn.LayerNorm(emb_dim)
    self.ff = FeedForward(emb_dim, hidden_dim, dropout)
    self.norm2 = nn.LayerNorm(emb_dim)
    
    def forward(self, x):
      x = x + self.attention(self.norm1(x))
      x = x + self.ff(self.norm2(x))
      return x

In [None]:
class BioTransformer(nn.Module):
  def __init__(self, emb_dim=32, hidden_dim=128, out_dim=2, n_heads=2, n_layers=6,vocab_size=512, block_size=240, dropout=0.):
  
    super(self).__init__()
    self.embedding_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.out_dim = out_dim
    self.heads = n_heads
    self.num_layers = n_layers
    self.dropout = dropout
    
    self.token_embedding = nn.Embedding(vocab_size, emb_dim)
    self.l_head = nn.Linear(emb_dim, out_dim)
    self.pos_embedding = nn.Embedding(block_size, emb_dim)
    self.blocks = nn.Sequential(*[TransformerBlock(emb_dim, n_heads, hidden_dim, dropout) for _ in range(n_layers)])
    
    
  def forward(self, x, targets=None):
    b, t = x.size()
    positions = torch.arange(t, device=x.device).expand(b, t)
    x = self.token_embedding(x) + self.pos_embedding(positions)
    
    x = self.blocks(x)
      
    x = self.l_head(x.mean(dim=1))
    
    if targets is not None:
      
      B, T, C = x.size()
      x = x.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(x, targets)
      
    else:
      loss = None
      
    return x, loss
  
  
  def predict(self, x):
    return F.softmax(self.forward(x), dim=1)
    

In [None]:
model = BioTransformer().to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

for iter in range(10):
  
    