In [1]:
import sqlite3
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
connection = sqlite3.connect("/content/drive/My Drive/drugs.db")

In [4]:
df = pd.read_sql_query("SELECT * from drugs_action_proc", connection)

In [5]:
df.head()

Unnamed: 0,index,cid,target,ki,ic50,kd,min,mean
0,0,3081361,HIV-1 Protease,0.24,0.0,0.0,0.24,0.08
1,1,5327236,HIV-1 Protease,0.25,0.0,0.0,0.25,0.083333
2,2,5327235,HIV-1 Protease,0.41,0.0,0.0,0.41,0.136667
3,3,5327234,HIV-1 Protease,0.8,0.0,0.0,0.8,0.266667
4,4,3009319,HIV-1 Protease,0.99,0.0,0.0,0.99,0.33


In [6]:
df = df[["cid", "ki", "ic50", "kd", "min", "mean", "target"]]

In [7]:
df.head()

Unnamed: 0,cid,ki,ic50,kd,min,mean,target
0,3081361,0.24,0.0,0.0,0.24,0.08,HIV-1 Protease
1,5327236,0.25,0.0,0.0,0.25,0.083333,HIV-1 Protease
2,5327235,0.41,0.0,0.0,0.41,0.136667,HIV-1 Protease
3,5327234,0.8,0.0,0.0,0.8,0.266667,HIV-1 Protease
4,3009319,0.99,0.0,0.0,0.99,0.33,HIV-1 Protease


In [8]:
train = df[["cid", "target"]].copy()

In [9]:
train["cid"] = train["cid"].values.astype(str)
train["target"] = train["target"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [10]:
train.head()

Unnamed: 0,cid,target
0,3081361,hiv-1 protease
1,5327236,hiv-1 protease
2,5327235,hiv-1 protease
3,5327234,hiv-1 protease
4,3009319,hiv-1 protease


In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [28]:
cid = train["cid"].values
corpus = cid
target = train["target"].values

In [13]:
corpus = np.hstack((cid, target)); corpus

array(['3081361', '5327236', '5327235', ...,
       'leucyl-cystinyl aminopeptidase', 'leucyl-cystinyl aminopeptidase',
       'leucyl-cystinyl aminopeptidase'], dtype=object)

In [29]:
i2w = dict(enumerate(set(corpus)))

In [30]:
vocab = {v: k for k, v in i2w.items()}

In [31]:
vocab_size = len(vocab) + 1
embed_size = 300
target_size = len(set(target))

In [32]:
vocab_size, target_size

(786217, 6350)

In [33]:
corp = corpus[:, None]; corp

array([['3081361'],
       ['5327236'],
       ['5327235'],
       ...,
       ['91898558'],
       ['91898559'],
       ['91898560']], dtype=object)

In [34]:
targets_index = dict(enumerate(sorted(set(target))))
targets = {v: k for k, v in targets_index.items()}

In [35]:
class DrugData(Dataset):
    def __init__(self,df,vocab,targets, train=True, valid_pct=0.1):
        self.data = df.values
        index = int(self.data.shape[0] * valid_pct)
        if train:
            self.data = self.data[index:, :]
        else:
            self.data = self.data[:index, :]
        self.targets = targets
        self.vocab = vocab
        self.samples = [()] * self.data.shape[0]
        
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        x, y = self.data[idx, 0], self.data[idx, 1]
        x, y = self.vocab.get(x, -1), self.targets.get(y, -1)
        x, y = map(torch.tensor, ([x], [y]))
        self.samples[idx] = (x, y)
        return self.samples[idx]

In [36]:
train_dataset = DrugData(train, vocab, targets, train=True)
val_dataset = DrugData(train, vocab, targets, train=False)

In [37]:
len(train_dataset), len(val_dataset)

(1631529, 181280)

In [38]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, num_workers=8)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=64, num_workers=8)

In [46]:
class Model(nn.Module):
    def __init__(self, vocab_size, embed_size, target_size):
        super(Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(embed_size, target_size)
        )
        
    def forward(self, x):
        embed_out = self.embed(x).squeeze()
        out = self.fc(embed_out)
        return F.log_softmax(out, dim=1)

In [47]:
model = Model(vocab_size, embed_size, target_size)

In [48]:
cuda = torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); device

device(type='cuda')

In [49]:
model = model.to(device)

In [50]:
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-4)

In [51]:
from tqdm.notebook import tqdm_notebook as tqdm

In [None]:
EPOCHS = 10

print("epoch\t train_loss\t val_loss")
for epoch in range(1, EPOCHS+1):
    train_loop = tqdm(train_loader, total=len(train_loader), leave=False)
    val_loop = tqdm(val_loader, total=len(val_loader), leave=False)
    train_loss = 0.0
    val_loss = 0.0
    model.train()
    for x, y in train_loop:
        optimizer.zero_grad()
        x, y = x.to(device), y.to(device)
        yhat = model(x)
        # print(yhat.shape, y.shape)
        loss = criterion(yhat, y.squeeze())
        loss.backward()
        train_loss += loss.item()
        optimizer.step()

        train_loop.set_description(f"Epoch [{epoch}/{EPOCHS}] - ")
        train_loop.set_postfix(epoch=epoch, loss=loss.item())
    model.eval()
    with torch.no_grad():
        for x, y in val_loop:
            x, y = x.to(device), y.to(device)
            yhat = model(x)
            loss = criterion(yhat, y.squeeze())
            val_loss += loss.item()

    print(f"{epoch}\t {round(train_loss/len(train_loader), 5)}\t {round(val_loss/len(val_loader), 5)}")
    torch.save(model.state_dict(), f"./models/stage-{epoch}.pt")

epoch	 train_loss	 val_loss


HBox(children=(FloatProgress(value=0.0, max=25493.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2833.0), HTML(value='')))

1	 7.76832	 8.95305


HBox(children=(FloatProgress(value=0.0, max=25493.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2833.0), HTML(value='')))

2	 6.9627	 10.05651


HBox(children=(FloatProgress(value=0.0, max=25493.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2833.0), HTML(value='')))

3	 6.79548	 10.80045


HBox(children=(FloatProgress(value=0.0, max=25493.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2833.0), HTML(value='')))