In [None]:
!pip install 'transformers[torch]'
!pip install rdkit-pypi
!pip install py3Dmol

<a target="_blank" href="https://colab.research.google.com/github/RodrigoAVargasHdz/CHEM-4PB3/blob/w2024/Course_Notes/Week%2012/transformers.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
import tqdm
import numpy as np
import pandas as pd
# RDkit
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix

# Pytorch
import torch
import torch
from torch import nn
import torch.functional as F
from torch.utils.data import Dataset, DataLoader

import py3Dmol
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from rdkit.Chem.Draw import IPythonConsole

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# Notebook-wide RDKit rendering defaults: 300x300 depictions, atom indices
# drawn on every atom, SVG output enabled.
IPythonConsole.molSize = (300, 300)
IPythonConsole.drawOptions.addAtomIndices = True
IPythonConsole.ipython_useSVG = True

## Introduction to Hugging Face ##

Hugging Face is a leading company in the field of artificial intelligence, particularly renowned for its contributions to natural language processing (NLP). It has developed and maintains the Transformers library, which is an extensive collection of pre-trained models designed for a variety of NLP tasks such as text classification, question answering, and more. The company's platform and tools are widely used by researchers, developers, and companies to implement state-of-the-art AI models easily and efficiently. Hugging Face also fosters a vibrant community where AI enthusiasts and professionals share models, collaborate, and advance the field of machine learning. Beyond just providing tools and libraries, Hugging Face actively participates in AI research, pushing the boundaries of what's possible with machine learning technologies.

[Hugging Face](https://huggingface.co/docs/transformers/quicktour)



## fill-mask ##
Adapting the "fill-mask" task to SMILES (Simplified Molecular Input Line Entry System) notation represents an innovative approach in cheminformatics. SMILES is a notation that encodes the structure of a chemical compound in the form of a linear text string. Applying the "fill-mask" concept to SMILES involves predicting missing parts of a molecule's structure, given a partially masked SMILES string. This task could significantly aid in drug discovery and molecular design by enabling models to predict possible chemical structures or functional groups that fit within a given molecular scaffold.

For instance, in a masked SMILES string where certain atoms or bonds are replaced with a mask token (e.g., `CN1C=NC2=C1C(=O)N(C(<mask>)N2C)C`, the masked string used in the code below), a model trained on a large dataset of chemical structures could predict the missing elements, potentially proposing novel compounds or identifying key functional groups for biological activity. This requires training language models specifically on chemical structure datasets, allowing them to learn the syntax and semantics of SMILES notation. By leveraging models adept at "fill-mask" tasks, researchers can explore chemical space more efficiently, predicting novel molecules with desired properties or optimizing existing compounds for better efficacy or safety.

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Masked-token predictor built on the `mrm8488/chEMBL_smiles_v1` checkpoint.
fill_mask_checkpoint = "mrm8488/chEMBL_smiles_v1"
pipe = pipeline(task="fill-mask", model=fill_mask_checkpoint)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer and masked-LM weights for the same checkpoint, loaded without the
# pipeline wrapper.
checkpoint_name = "mrm8488/chEMBL_smiles_v1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForMaskedLM.from_pretrained(checkpoint_name)

In [None]:
# SMILES string with one position hidden behind the mask token.
smile1 = 'CN1C=NC2=C1C(=O)N(C(<mask>)N2C)C'
z = pipe(smile1)

# Show every candidate the pipeline proposes (one dict per candidate).
for zi in z:
  print(zi)

# Keep the highest-scoring candidate; on ties `max` returns the first one,
# which is what the strict `>` comparison in a manual scan would also keep.
best = max(z, key=lambda candidate: candidate['score'])
max_score = best['score']
token_str = best['token_str']
smile1_new = best['sequence']
print('suggested molecule: ', smile1_new)
print('Token, ', token_str)
print('Max score = ', max_score)

# MolFromSmiles returns None when the string cannot be parsed; say so
# explicitly, otherwise the cell would just render an empty output.
mol = Chem.MolFromSmiles(smile1_new)
if mol is None:
  print('RDKit could not parse the suggested SMILES: ', smile1_new)
mol

## Feature extraction ##

Feature extraction is a fundamental process in machine learning and data analysis where raw data is processed and transformed into a set of numerical features that can be used to train machine learning models. This process is crucial because raw data, such as text, images, or complex sensor data, often contains a lot of information that may not be directly relevant or usable for analysis. Feature extraction helps in distilling this raw data into a more manageable form, highlighting important patterns or characteristics that are useful for making predictions or classifications.



In [None]:
# `pipe` is rebound here: from this cell onward it is a feature-extraction
# pipeline rather than the fill-mask pipeline.
embedding_checkpoint = "mrm8488/chEMBL_smiles_v1"
pipe = pipeline(task="feature-extraction", model=embedding_checkpoint)

# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(embedding_checkpoint)

In [None]:
# Complete SMILES: the final aromatic ring opened by `c1` is closed with `cc1`
# so the string describes a valid molecule rather than a truncated one.
smile1 = "CC(C)CN(CC(OP(=O)(O)O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)cc1"
z = pipe(smile1)
# The cells below treat this as (1, n_tokens, n_features); the printed shape
# lets the reader confirm that.
print(np.asarray(z).shape)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Drop the leading size-one axis of the pipeline output to get a 2-D matrix
# whose rows are tokens and whose columns are embedding features.
matrix = np.array(z)[0]
print(matrix.shape)

# imshow draws rows along the y axis and columns along the x axis, so tokens
# go on y and the embedding dimension on x.
fig, ax = plt.subplots(figsize=(10, 10))
image = ax.imshow(matrix, cmap='viridis', interpolation='none', aspect='auto')
fig.colorbar(image, ax=ax, label='Value')  # maps colours back to activation values
ax.set_title('Encoding Visualization')
ax.set_xlabel('Embedding dimension')
ax.set_ylabel('Tokens')
plt.show()

In [None]:
data_url = "https://github.com/RodrigoAVargasHdz/CHEM-4PB3/raw/main/Course_Notes/data/qm9.csv"
data = pd.read_csv(data_url)
# Keep only the SMILES string and the `gap` target, then draw a 10,000-row
# subsample. The seed makes the subsample (and everything trained on it)
# identical after Restart & Run All; the split below was already seeded.
data = data[['smiles', 'gap']].sample(10000, random_state=42)

df_train, df_test = train_test_split(data, test_size=0.5, random_state=42)

In [None]:
class CustomDataset(Dataset):
    """Dataset that turns SMILES strings into fixed-size embedding tensors.

    Each item is featurized on the fly, then cropped or zero-padded along the
    token axis so that every sample has exactly ``max_len`` tokens.

    Args:
        smiles_all: Indexable sequence of SMILES strings.
        labels_all: Indexable sequence of numeric targets, same length as
            ``smiles_all``.
        flatten, one_hot, cnn: Accepted for backward compatibility; unused.
        max_len: Number of token positions every returned tensor has.
        featurizer: Callable mapping one SMILES string to a nested
            list/array; it is assumed to be shaped
            ``(1, n_tokens, n_features)`` since dim 1 is treated as the token
            axis. Defaults to the notebook-level ``pipe`` object, captured
            when the dataset is constructed so that later rebinding of
            ``pipe`` does not silently change what this dataset returns.

    Raises:
        ValueError: If the inputs differ in length or ``max_len`` < 1.
    """

    def __init__(self, smiles_all, labels_all, flatten=False, one_hot=False, cnn=False,
                 max_len=20, featurizer=None):
        if len(smiles_all) != len(labels_all):
            raise ValueError(
                "smiles_all and labels_all must have the same length "
                f"(got {len(smiles_all)} and {len(labels_all)})"
            )
        if max_len < 1:
            raise ValueError(f"max_len must be a positive integer (got {max_len})")

        self.labels_all = labels_all
        self.smiles_all = smiles_all
        self.max_len = max_len
        # Bind once: `pipe` is only looked up when no featurizer is supplied.
        self.featurizer = pipe if featurizer is None else featurizer

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels_all)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for sample ``idx``.

        ``features`` is float32 with ``max_len`` entries along dim 1;
        ``target`` is a float32 scalar tensor.
        """
        smi = self.smiles_all[idx]
        label = self.labels_all[idx]

        z = torch.tensor(self.featurizer(smi), dtype=torch.float32)

        # Crop sequences longer than max_len, then zero-pad shorter ones at
        # the end of the token axis. The pad tuple is (last-dim left/right,
        # token-dim left/right), so only the token axis grows.
        z = z[:, :self.max_len]
        pad_size = (0, 0, 0, self.max_len - z.shape[1])
        z = torch.nn.functional.pad(z, pad=pad_size, mode='constant', value=0)

        y = torch.tensor(label, dtype=torch.float32)
        return z, y

# smiles_all= data['smiles'].to_list()
# labels_all = data['gap'].to_list()

# Wrap the training split as a Dataset and expose it through a shuffled
# loader that yields 32-sample batches.
train_smiles = df_train['smiles'].to_list()
train_gaps = df_train['gap'].to_list()
data_tr = CustomDataset(train_smiles, train_gaps)
train_dataloader = DataLoader(dataset=data_tr, batch_size=32, shuffle=True)



In [None]:
class CNN(nn.Module):
    """Two-stage convolutional regressor producing one scalar per sample.

    Inputs are single-channel "images" of shape
    ``(batch, 1, input_shape[0], input_shape[1])``.

    Args:
        input_shape: ``(height, width)`` of one input sample. The default
            ``(20, 768)`` yields 18528 flattened features, the size the final
            linear layer was previously hard-coded to.
    """

    def __init__(self, input_shape=(20, 768)):
        super(CNN, self).__init__()
        # 1 -> 32 channels; padding=2 with kernel 5 keeps the spatial size,
        # the pooling then halves it.
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        # 32 -> 16 channels; padding=2 with kernel 3 grows each spatial dim
        # by 2 before pooling halves it.
        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 16, 3, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        # Derive the flattened feature count from a dummy forward pass so the
        # head always matches the convolutional stack and the input size.
        with torch.no_grad():
            probe = torch.zeros(1, 1, *input_shape)
            n_flat = self.conv2(self.conv1(probe)).flatten(1).shape[1]
        # Fully connected regression head: one output value per sample.
        self.out = nn.Linear(n_flat, 1)

    def forward(self, x):
        """Map ``(batch, 1, H, W)`` inputs to ``(batch, 1)`` predictions."""
        x = self.conv1(x)
        x = self.conv2(x)
        # Flatten everything except the batch axis for the linear head.
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output


In [None]:
def train(model, training_data, training_epochs=60, device='cuda',
          batch_size=64, lr=1e-4, weight_decay=1e-8):
    """Fit ``model`` to ``training_data`` with Adam and an MSE loss.

    Args:
        model: ``nn.Module`` whose output has shape ``(batch, 1)``; it must
            already live on ``device``.
        training_data: Dataset yielding ``(inputs, target)`` pairs.
        training_epochs: Number of passes over ``training_data``.
        device: Device the batches are moved to before the forward pass.
        batch_size: Mini-batch size of the internal shuffled DataLoader.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        A list with one entry per epoch; each entry is the list of per-batch
        loss values (Python floats) recorded during that epoch.
    """
    # Imported here so the progress bar does not depend on `tqdm.notebook`
    # having been loaded as a side effect of some other import; `tqdm.auto`
    # picks the notebook widget or the console bar as appropriate.
    from tqdm.auto import tqdm as progress_bar

    # Define the loss function and optimizer
    loss_function = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    trainloader = torch.utils.data.DataLoader(
        training_data, batch_size=batch_size, shuffle=True)

    # Make sure layers with train/eval-dependent behaviour are in training mode.
    model.train()

    iterator = progress_bar(range(training_epochs))

    # Run the training loop (epochs)
    loss_trajectory = []
    for _ in iterator:
        current_loss = []
        for inputs, targets in trainloader:
            inputs, targets = inputs.to(device), targets.to(device)  # move batch to the target device

            optimizer.zero_grad()
            outputs = model(inputs)
            # outputs is (batch, 1); take column 0 so it lines up with the 1-D targets.
            loss = loss_function(outputs[:, 0], targets.float())
            loss.backward()
            optimizer.step()

            current_loss.append(loss.item())

        # Report a plain float (a tensor would render as "tensor(...)");
        # an epoch without batches is reported as NaN.
        mean_loss = sum(current_loss) / len(current_loss) if current_loss else float('nan')
        iterator.set_postfix(loss=mean_loss)
        loss_trajectory.append(current_loss)
    return loss_trajectory

In [None]:
# Pull one batch to check the tensor shape the network will receive.
z, y = next(iter(train_dataloader))
print(z.shape)

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs end-to-end on CPU-only runtimes.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = CNN().to(device)
# NOTE(review): the compiled wrapper is created but the call below trains the
# eager `model`; pass `compiled_model` to `train` if compilation is intended.
compiled_model = torch.compile(model)

# Single epoch as a smoke test of the whole pipeline.
loss_trj = train(model, data_tr, 1, device)