In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error

from rdkit import DataStructs
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

from typing import List, Union

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
def morgan_binary_features_generator(mol: Union[str, Chem.Mol], plot_img = False,
                                     radius: int = 6,
                                     num_bits: int = 4096) -> np.ndarray:
    
    mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol
    if plot_img:
        display(mol)
    
    features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)
    features = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

In [3]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta : float=-1.0):
        self.patience = patience  # number of times to allow for no improvement before stopping the execution
        self.min_delta = min_delta  # the minimum change to be counted as improvement
        self.counter = 0  # count the number of times the validation accuracy not improving
        self.min_validation_loss = np.inf

    # return True when encountering _patience_ times decrease in validation loss 
    def __call__(self, validation_loss, verbose=False):
        if ((validation_loss+self.min_delta) < self.min_validation_loss):
            self.min_validation_loss = validation_loss
            self.counter = 0  # reset the counter if validation loss decreased at least by min_delta
        elif ((validation_loss+self.min_delta) > self.min_validation_loss):
            self.counter += 1 # increase the counter if validation loss is not decreased by the min_delta
            if verbose:
                print(f"  >> now{validation_loss:.3f} > best{self.min_validation_loss:.3f}")
            if self.counter >= self.patience:
                return True
        return False

In [4]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

train_fps = pd.DataFrame(train_df["SMILES"].apply(morgan_binary_features_generator).tolist())
test_fps = pd.DataFrame(test_df["SMILES"].apply(morgan_binary_features_generator).tolist())

train_fps.rename(columns=lambda x: "FPS_" + str(x), inplace=True)
test_fps.rename(columns=lambda x: "FPS_" + str(x), inplace=True)

fps_inputsize = train_fps.shape[1]
fps_inputsize

4096

In [5]:
class FPsDataset(Dataset):
    def __init__(self, fps_df, scaler=None):
        if scaler is not None:
            self.fps_df = scaler.fit_transform(fps_df)
        else:
            self.fps_df = fps_df.values

    def __getitem__(self, index):
        feature = self.fps_df[index]
        return torch.tensor(feature).float()
    
    def __len__(self):
        return len(self.fps_df)
    
FPs_dataset = FPsDataset(train_fps)
train_FPs_dataset, valid_FPs_dataset = train_test_split(FPs_dataset, test_size=0.2, random_state=42)

train_FPs_loader = DataLoader(dataset=train_FPs_dataset, batch_size=256, shuffle=True)
valid_FPs_loader = DataLoader(dataset=valid_FPs_dataset, batch_size=256, shuffle=True)

In [6]:
class FpsAutoEncoder(nn.Module):
    def __init__(self, input_size, output_size):
        super(FpsAutoEncoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, output_size)
        )

        self.decoder = nn.Sequential(
            nn.Linear(output_size, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, input_size)
        )

    def forward(self, x):
        out = self.encoder(x)
        out = self.decoder(out)

        return out
    
    def get_codes(self, x):
        return self.encoder(x)
    
model_auto = FpsAutoEncoder(fps_inputsize, 32).to("cuda")
print(model_auto)

criterion_auto = nn.MSELoss()
optimizer_auto = torch.optim.Adam(model_auto.parameters(), lr=0.0001)
scheduler_auto = torch.optim.lr_scheduler.LambdaLR(optimizer = optimizer_auto, lr_lambda= lambda epoch : 0.95**(epoch))


FpsAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=4096, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=256, out_features=32, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=4096, bias=True)
  )
)


In [18]:

def train_autoencoder(model, train_fps_loader, valid_fps_loader, criterion, optimizer, scheduler, epochs=5000):

    earlyStopper = EarlyStopping(patience=5, min_delta=-0.1)
    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for inputs in train_fps_loader:
            optimizer.zero_grad()

            output = model(inputs.to("cuda"))
            loss = criterion(output, inputs.to("cuda"))
            loss.backward()

            optimizer.step()

            running_loss += loss.item()


        if epoch % 100 == 0:
            model.eval()
            valid_loss = 0
            with torch.no_grad():
                for inputs in valid_fps_loader:
                    output = model(inputs.to("cuda"))
                    loss = criterion(output, inputs.to("cuda"))

                    valid_loss += loss.item()
            

            print(f"Epoch: {epoch:4d}/{epochs}, lr: {scheduler.get_last_lr()[0]:.7f}, Train Loss: {np.sqrt(running_loss/len(train_fps_loader))}, Valid Loss: {np.sqrt(valid_loss/len(valid_fps_loader))}")
            scheduler.step()

            if earlyStopper(valid_loss, True):
                break

    return model

train_autoencoder(model_auto, train_FPs_loader, valid_FPs_loader, criterion_auto, optimizer_auto, scheduler_auto)

Epoch:    0/5000, lr: 0.0000017, Train Loss: 0.12449290529529422, Valid Loss: 0.13595647387159018
Epoch:  100/5000, lr: 0.0000016, Train Loss: 0.12446421534738933, Valid Loss: 0.13593135660912675
Epoch:  200/5000, lr: 0.0000015, Train Loss: 0.12452852574300592, Valid Loss: 0.1362083398855532
Epoch:  300/5000, lr: 0.0000014, Train Loss: 0.12445314290880831, Valid Loss: 0.13605875840633033
Epoch:  400/5000, lr: 0.0000013, Train Loss: 0.1244362883125623, Valid Loss: 0.13595841930097147
Epoch:  500/5000, lr: 0.0000013, Train Loss: 0.12441203160763148, Valid Loss: 0.1360053911436378
Epoch:  600/5000, lr: 0.0000012, Train Loss: 0.1243931829974885, Valid Loss: 0.13623808198025708
Epoch:  700/5000, lr: 0.0000012, Train Loss: 0.12452265203551856, Valid Loss: 0.13599368794019034
Epoch:  800/5000, lr: 0.0000011, Train Loss: 0.12449416276508528, Valid Loss: 0.1360067606744744
Epoch:  900/5000, lr: 0.0000010, Train Loss: 0.12436739807392537, Valid Loss: 0.13587015509909275
Epoch: 1000/5000, lr: 0.0

FpsAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=4096, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=256, out_features=32, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=4096, bias=True)
  )
)

In [19]:
torch.save(model_auto.state_dict(), "../archive_model/autoEncoder.pt")

In [20]:
model = FpsAutoEncoder(fps_inputsize, 32).to("cuda")
model.load_state_dict(torch.load('../archive_model/autoEncoder.pt', map_location="cuda:0"))

<All keys matched successfully>