In [None]:
!pip install rdkit --q

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
# Module-level Morgan fingerprint generator (radius 2, 2048-bit output),
# created once and reused by generate_morgan_fingerprint for every molecule.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

# Utils
from tqdm import tqdm
import requests
import os

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Data Handling
from torch.utils.data import TensorDataset, DataLoader, Dataset,random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

# Transformer  # // https://arxiv.org/pdf/2010.09885 // #
from transformers import AutoTokenizer, AutoModel

# Select the compute device: CUDA when available, otherwise CPU.
# Tensors and the ChemBERTa encoder are moved to this device further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Analysis
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import gdown
# Configure pandas display: no cap on the number of columns shown, and do not
# wrap wide frames across several lines.
pd.set_option('display.max_columns', None)  # None = show all columns
pd.set_option('display.expand_frame_repr', False)

# Utilities for ML
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split,GridSearchCV



# Metrics
from sklearn.metrics import (
    classification_report,f1_score,matthews_corrcoef,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
    roc_curve,
    average_precision_score,
    precision_recall_curve,
    auc
)


In [None]:
# Colab only: mount Google Drive at /content/drive so the saved classifier
# checkpoint stored under "My Drive" can be read later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [None]:
def generate_mol(smile):
    """Parse a SMILES string into an RDKit molecule.

    Args:
        smile: SMILES string to parse.

    Returns:
        Whatever ``Chem.MolFromSmiles`` returns for ``smile``; ``None`` when
        the parser returns ``None`` or raises.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
    except Exception as e:
        # Best-effort parsing: report the offending input and the cause, then
        # signal failure with None instead of aborting the notebook.
        print(f"Error generating molecule from SMILES {smile!r}: {e}")
        return None

    if mol is None:
        # The parser signals an unparsable string by returning None.
        print(f"Error generating molecule from SMILES {smile!r}")
    return mol

def generate_morgan_fingerprint(mol):
    """Return the Morgan fingerprint of ``mol`` as a NumPy integer vector.

    The fingerprint comes from the module-level ``morgan_generator`` and is
    unpacked into an array that starts out as 2048 zeros.
    """
    bit_vector = morgan_generator.GetFingerprint(mol)

    # Destination buffer, filled in place by the RDKit conversion helper.
    bits = np.zeros(shape=(2048,), dtype=int)
    DataStructs.ConvertToNumpyArray(bit_vector, bits)
    return bits

def get_maccs(mol):
    """Build a descriptor dict from the MACCS keys fingerprint of ``mol``.

    Returns:
        dict with key ``'MACCSFP'`` holding the 167-element integer array,
        followed by one ``'MACCSFP_bit_<i>'`` entry per array element.
    """
    fingerprint = AllChem.GetMACCSKeysFingerprint(mol)
    bits = np.zeros((167,), dtype=int)
    DataStructs.ConvertToNumpyArray(fingerprint, bits)

    # Full vector first, then every bit exposed as its own descriptor.
    descriptors = {'MACCSFP': bits}
    descriptors.update(
        (f'MACCSFP_bit_{idx}', value) for idx, value in enumerate(bits)
    )
    return descriptors

# ChemBERTa tokenizer and encoder, used by embed_smiles to turn SMILES into embeddings.
# NOTE: `model` is the ChemBERTa encoder; the permeability classifier lives in `cnn_model`.

tokenizer = AutoTokenizer.from_pretrained('seyonec/ChemBERTa_zinc250k_v2_40k')
model = AutoModel.from_pretrained('seyonec/ChemBERTa_zinc250k_v2_40k')
model = model.to(device)
model.eval()

# Token-level ChemBERTa embeddings for a SMILES input
def embed_smiles(smiles):
    """Tokenize ``smiles``, run the ChemBERTa encoder, and return its last hidden state.

    Returns:
        NumPy array copied from ``last_hidden_state`` (token-level, not pooled).
    """
    encoded = tokenizer(smiles, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Move every tokenizer output onto the same device as the encoder.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state

    # Pooling (e.g. the mean over tokens) is left to the caller.
    return hidden_states.cpu().numpy()

# Collapse token embeddings into one vector by averaging
def mean_pooling(embeddings):
    """Average the token embeddings of a single-item batch.

    Args:
        embeddings: array whose leading (batch) axis has length 1,
            e.g. (1, seq, 768).

    Returns:
        The mean over the token axis, e.g. shape (768,).
    """
    # Drop the batch axis, then average across tokens.
    per_token = np.squeeze(embeddings, axis=0)
    return per_token.mean(axis=0)

# Generate all Features for Classification
def extract_features_for_class(smiles):
    """Compute every feature the classifier needs for one SMILES string.

    Args:
        smiles: SMILES string of the molecule to featurize.

    Returns:
        ``(mol, maccs, morgan, embeds)`` where ``mol`` is the parsed molecule
        and the other three are float32 tensors on ``device``. When the SMILES
        cannot be parsed, returns ``(None, None, None, None)`` so callers can
        test ``mol is None``.
    """
    # Parse the function argument; the original read the notebook-level
    # variable `smile`, which ignored the input (or raised NameError).
    mol = generate_mol(smiles)
    if mol is None:
        # The fingerprint helpers cannot handle a missing molecule.
        return None, None, None, None

    maccs_dict = get_maccs(mol)
    maccs = torch.tensor(maccs_dict['MACCSFP'].tolist(), dtype=torch.float32).to(device)
    morgan = torch.tensor(generate_morgan_fingerprint(mol), dtype=torch.float32).to(device)
    embeds = torch.tensor(mean_pooling(embed_smiles(smiles)), dtype=torch.float32).to(device)
    return mol, maccs, morgan, embeds

def classify_permeability(smiles):
    """Run the CNN classifier on one SMILES string.

    Args:
        smiles: SMILES string of the molecule to classify.

    Returns:
        ``(output, mol)`` where ``output`` is the sigmoid of the classifier
        output for a batch of one and ``mol`` is the parsed molecule.
        Returns ``(np.nan, np.nan)`` when no molecule could be built.
    """
    mol, maccs, morgan, embeds = extract_features_for_class(smiles)

    if mol is None:
        # `np.Nan` is not a NumPy attribute; `np.nan` is the NaN constant.
        return np.nan, np.nan

    # Concatenate the 1-D feature vectors into (total_dim,) and add the batch
    # dimension -> (1, total_dim). The inputs are already tensors, so no
    # torch.tensor(...) re-wrap (which copies and emits a UserWarning).
    features_tensor = torch.cat([maccs, morgan, embeds], dim=0).to(torch.float32).unsqueeze(0)

    # The features are built on `device`, but the classifier may live elsewhere
    # (it is loaded with map_location='cpu'); follow the classifier's weights.
    first_param = next(cnn_model.parameters(), None)
    if first_param is not None:
        features_tensor = features_tensor.to(first_param.device)

    cnn_model.eval()
    with torch.no_grad():
        output = torch.sigmoid(cnn_model(features_tensor))

    return output, mol

# Importing Model

In [None]:
class ConvNet(nn.Module):
    """1-D convolutional network that maps a flat feature vector to one output value.

    The input of shape (batch_size, input_dim) is treated as a single-channel
    sequence, passed through two Conv1d/BatchNorm/PReLU/AvgPool/Dropout stages,
    flattened, and fed to a three-layer fully connected head with one output.

    Args:
        input_dim: length of the flat feature vector.
        out_channels: base channel count; the first conv stage uses
            ``out_channels * 4`` channels and the second ``out_channels * 2``.
        classification_layer_dim: width of the first fully connected layer
            (the second uses half of it).
    """

    def __init__(self, input_dim, out_channels, classification_layer_dim = 64 ):
        super().__init__()

        self.input_dim = input_dim
        self.out_channels = out_channels
        self.class_dim =  classification_layer_dim

        # Convolutional layers
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=self.out_channels * 4, kernel_size=3, stride=1, padding=2),
            nn.BatchNorm1d(self.out_channels * 4),
            nn.PReLU(self.out_channels*4),
            nn.AvgPool1d(kernel_size=3, stride=2),
            nn.Dropout(0.2),
            nn.Conv1d(in_channels= self.out_channels * 4 , out_channels=self.out_channels *2 , kernel_size=5, stride=2, padding=2),
            nn.BatchNorm1d(self.out_channels * 2),
            nn.PReLU(self.out_channels*2),
            nn.AvgPool1d(kernel_size=5, stride=2),
            nn.Dropout(0.2),
        )

        # Flattened size of the conv output; it fixes the first Linear's input.
        self._conv_output_size = self._get_conv_output()

        self.fc = nn.Sequential(
            nn.Linear(self._conv_output_size, self.class_dim  ),
            nn.BatchNorm1d(self.class_dim),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(self.class_dim, self.class_dim //2 ),
            nn.BatchNorm1d(self.class_dim//2),
            nn.PReLU(self.class_dim//2),
            nn.Dropout(0.25),
            nn.Linear(self.class_dim//2 , 1)
        )

    def _get_conv_output(self):
        """
        Computes the size of the output of the convolutional layers
         to define the input size of the first linear layer.

        The probe runs in eval mode without gradients so the dummy batch does
        not update the BatchNorm running statistics; the previous training
        flag is restored afterwards.
        """
        # Create a dummy input tensor with batch size 1
        dummy_input = torch.zeros(1, 1, self.input_dim)
        was_training = self.conv.training
        self.conv.eval()
        try:
            with torch.no_grad():
                output_feat = self.conv(dummy_input)
        finally:
            self.conv.train(was_training)
        return output_feat.numel()

    def forward(self, x):
        """Map ``x`` of shape (batch_size, input_dim) to shape (batch_size, 1)."""
        x = x.unsqueeze(1)  # Add channel dimension: (batch_size, 1, input_dim)
        x = self.conv(x)
        x = x.view(x.size(0), -1)  # Flatten: (batch_size, conv_output_size)
        return self.fc(x)


In [None]:
# Load the fully pickled classifier (the ConvNet class must already be defined).
# weights_only=False is stated explicitly because the file stores a whole
# nn.Module rather than a state_dict, which needs the full unpickler.
# SECURITY: unpickling can execute arbitrary code; only load trusted files.
cnn_model = torch.load(
    '/content/drive/My Drive/cnn_bbb_entire_model.pth',
    map_location=torch.device('cpu'),
    weights_only=False,
)

# Move the classifier to the device and set evaluation mode. This must target
# `cnn_model`; `model` is the ChemBERTa encoder.
cnn_model.to(device)
cnn_model.eval()
cnn_model

  cnn_model = torch.load('/content/drive/My Drive/cnn_bbb_entire_model.pth', map_location=torch.device('cpu'))


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(52000, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout)

In [None]:
# Example input SMILES; the '.' separates two disconnected components in one string.
smile = 'CC(C)C1=C(C(=CC=C1)C(C)C)O.CNC1(CCCCC1=O)C2=CC=CC=C2Cl'