In [None]:
import os
import sys
import threading
import random

import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger

from fastai import *
from fastai.text import *
from fastai.vision import *
from fastai.imports import *

import torch
import torchvision
import torch.nn.functional as F

# Disable RDKit log messages for every logger matching 'rdApp.*'
RDLogger.DisableLog('rdApp.*')

# Add custom path for imports. The path is relative, so it is resolved against
# the working directory printed below.
sys.path.append('./fastai1/')
# NOTE(review): presumably `utils` lives in ./fastai1/, which is why this import
# comes after the path tweak — confirm. The star import hides which names it provides.
from utils import *

# Print the current working directory (the relative paths in this notebook depend on it)
current_path = os.getcwd()
print(f"Current working directory: {current_path}")


In [None]:
# Define a custom tokenizer

# Do not use fastai's default special tokens; keep only the padding token.
# All nine token names are defined here, but only PAD is used in this cell.
BOS,EOS,FLD,UNK,PAD = 'xxbos','xxeos','xxfld','xxunk','xxpad'
TK_MAJ,TK_UP,TK_REP,TK_WREP = 'xxmaj','xxup','xxrep','xxwrep'
defaults.text_spec_tok = [PAD]



# Bracket atoms that get their own vocabulary entry; MolTokenizer maps any other
# bracket atom to '[UNK]'.
# NOTE: every entry must be followed by a comma. Python silently concatenates
# adjacent string literals, which previously fused '[n-]' with '[NH+]' and
# '[S@+]' with '[K+]', so none of those four atoms was ever recognised.
# Vocabularies built before this fix therefore do not contain them.
special_tokens = ['[BOS]', '[C@H]', '[C@@H]', '[C@]', '[C@@]', '[C-]', '[C+]', '[c-]', '[c+]', '[cH-]',
                  '[nH]', '[N+]', '[N-]', '[n+]', '[n-]', '[NH+]', '[NH2+]', '[O-]', '[S+]', '[s+]',
                  '[S-]', '[O+]', '[SH]', '[B-]', '[BH2-]', '[BH3-]', '[b-]', '[PH]', '[P+]', '[I+]',
                  '[Si]', '[SiH2]', '[Se]', '[SeH]', '[se]', '[Se+]', '[se+]', '[te]', '[te+]', '[Te]',
                  '[Pd]', '[Ag]', '[Cs]', '[Li]', '[K]', '[Na]', '[N@]', '[N@@]', '[S@+]', '[K+]', '[Ni+2]',
                  '[Mg]', '[Li+]', '[Cl-]', '[Ni]', '[Cs+]', '[Cu+2]', '[Zn+2]', '[Al]', '[Cu]']



class MolTokenizer(BaseTokenizer):
    """Character-level SMILES tokenizer that keeps bracket atoms, 'Br' and 'Cl' whole.

    Args:
        lang: Stored on the instance; not otherwise used by this class.
        special_tokens: Bracket atoms (e.g. '[C@H]') that keep their own token.
            When non-empty, every other bracket atom becomes '[UNK]'. When empty
            or None, all bracket atoms are kept verbatim.
    """

    def __init__(self, lang = 'en', special_tokens = special_tokens):
        self.lang = lang
        self.special_tokens = special_tokens

    def tokenizer(self, smiles):
        """Split a SMILES string into a list of tokens that starts with '[BOS]'.

        Args:
            smiles: The SMILES string to tokenize.

        Returns:
            A list of string tokens.
        """
        # Prepend the '[BOS]' marker that represents the start of a SMILES string
        smiles = '[BOS]' + smiles
        # Raw string: '\[' is not a valid escape sequence in a normal literal.
        # The capture group makes re.split keep the bracket atoms in its output.
        regex = r'(\[[^\[\]]{1,10}\])'

        tokens = []
        for chunk in re.split(regex, smiles):
            if chunk.startswith('['):
                # Use the vocabulary given to this instance, not the module-level list
                if self.special_tokens and chunk not in self.special_tokens:
                    tokens.append('[UNK]')
                else:
                    tokens.append(chunk)
            else:
                # Everything outside brackets is split into single characters
                tokens.extend(chunk)

        # Re-join 'Br' and 'Cl', which the per-character split broke in two.
        # Building a new list avoids mutating `tokens` while iterating over it
        # and never looks at a negative index.
        merged = []
        for tok in tokens:
            previous = merged[-1] if merged else None
            if (previous == 'B' and tok == 'r') or (previous == 'C' and tok == 'l'):
                merged[-1] = previous + tok
            else:
                merged.append(tok)
        return merged

    def add_special_cases(self, toks):
        """Intentionally does nothing with ``toks``."""
        # NOTE(review): presumably present because fastai's Tokenizer calls this
        # hook on its tokenizer objects — confirm against BaseTokenizer.
        pass

In [None]:
import os
import torch
import pandas as pd
from pathlib import Path



# Restrict this process to physical GPU 4. This is what `%env CUDA_VISIBLE_DEVICES=4`
# did, written as plain Python so the cell also runs outside IPython. The mask only
# has an effect if CUDA has not been initialised in this kernel yet.
os.environ['CUDA_VISIBLE_DEVICES'] = '4'

# Define the device (CUDA if available, otherwise CPU).
# When the mask above is honoured, physical GPU 4 is the only visible device and
# is exposed as ordinal 0, so 'cuda:4' would be an invalid ordinal. Ordinal 4 is
# used only if the mask was not applied and at least five devices are visible.
if torch.cuda.is_available():
    gpu_ordinal = 4 if torch.cuda.device_count() > 4 else 0
    device = torch.device(f'cuda:{gpu_ordinal}')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Define result paths: outputs of this run go under ./results/<name>
result_path = Path('./results')
name = 'C-H-activation'  # run name; later cells also use it as a prefix for saved files
path = result_path / name
# parents=True also creates ./results itself when it does not exist yet
path.mkdir(exist_ok=True, parents=True)

# Create directory for models
mdl_path = path / 'models'
mdl_path.mkdir(exist_ok=True)

# Load dataset. Stop immediately when the file is missing: the following cells
# all need `df`, so carrying on would only fail later with an unrelated NameError.
csv_file = 'pretrain-4a-merged.csv'
if not Path(csv_file).exists():
    raise FileNotFoundError(f"Error: File '{csv_file}' not found.")
df = pd.read_csv(csv_file)
print("Dataset loaded successfully:")
print(df.head())


In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem

# Shuffle the dataset with a fixed seed so that a fresh run (Restart & Run All)
# always produces the same train/validation split
SPLIT_SEED = 42
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Split the dataset into training and validation sets: the last VALID_SIZE
# shuffled rows are held out for validation
VALID_SIZE = 23653
if len(df) <= VALID_SIZE:
    # Without this check the slices below would silently give an empty training set
    raise ValueError(
        f"Dataset has {len(df)} rows; it needs more than {VALID_SIZE} to leave any training data."
    )
train = df.iloc[:-VALID_SIZE]
valid = df.iloc[-VALID_SIZE:]
print(f"Train shape: {train.shape}, Validation shape: {valid.shape}")

# Function to randomize SMILES strings
def randomize_smiles(smiles):
    """Return a randomized, non-canonical SMILES string for the same molecule.

    The molecule's atoms are renumbered in a random order drawn from NumPy's
    global random state, and the SMILES is written with ``canonical=False``.

    Args:
        smiles: SMILES string to randomize.

    Returns:
        The SMILES string written from the randomly renumbered molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # A None result means the SMILES could not be parsed. Name the offending
        # string instead of failing below with an opaque AttributeError.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    atom_order = list(range(mol.GetNumAtoms()))
    np.random.shuffle(atom_order)
    renumbered = Chem.RenumberAtoms(mol, atom_order)
    return Chem.MolToSmiles(renumbered, canonical=False, isomericSmiles=True, kekuleSmiles=False)


# Function to randomize reaction SMILES
def randomize_rxn(rxn):
    """Randomize every molecule in a ``precursors>>product`` reaction SMILES.

    The precursor side is split on '.' and each precursor is randomized on its
    own; the product side is randomized as a single SMILES string. The pieces
    are then joined back with '.' and '>>'.
    """
    lhs, rhs = rxn.split('>>')
    # Precursors are processed before the product, one by one, left to right
    shuffled_lhs = '.'.join(map(randomize_smiles, lhs.split('.')))
    shuffled_rhs = randomize_smiles(rhs)
    return shuffled_lhs + '>>' + shuffled_rhs

# Function to augment SMILES data
def smiles_augmentation(df, N_rounds, smiles_col="smiles"):
    """Add randomized variants of every SMILES string to a DataFrame.

    For each row, ``N_rounds`` randomized SMILES are generated with
    ``randomize_smiles``. The new rows copy all other columns of their source
    row, are appended to the original rows, shuffled, and de-duplicated on
    ``smiles_col``.

    Args:
        df: Input frame containing ``smiles_col``. It is not modified.
        N_rounds: Number of randomized SMILES to generate per input row.
            Values of zero or less add no rows.
        smiles_col: Name of the column that holds the SMILES strings.

    Returns:
        A new DataFrame in shuffled order with duplicate SMILES removed. The
        index holds the shuffled positions and is not reset.
    """
    n_rounds = max(N_rounds, 0)

    # One randomized SMILES per (row, round), generated row by row
    augmented_smiles = [
        randomize_smiles(smiles)
        for smiles in df[smiles_col]
        for _ in range(n_rounds)
    ]

    # Repeat every source row n_rounds times so the remaining columns travel
    # with their augmented SMILES. Leaving those columns empty made the frame
    # construction fail for any input with more than one column.
    row_positions = np.repeat(np.arange(len(df)), n_rounds)
    df_aug = df.iloc[row_positions].copy()
    df_aug[smiles_col] = augmented_smiles

    # Merge with the original rows, then shuffle
    combined = pd.concat([df, df_aug], sort=False).reset_index(drop=True)
    combined = combined.reindex(np.random.permutation(combined.index))
    return combined.drop_duplicates(smiles_col)


In [None]:
# Augment the two splits separately, training first, with 4 randomized SMILES per row
train_aug, valid_aug = (smiles_augmentation(split, 4) for split in (train, valid))

# Display the shapes of the augmented datasets
print(
    f"Augmented Train Shape: {train_aug.shape}",
    f"Augmented Validation Shape: {valid_aug.shape}",
    sep="\n",
)

In [None]:
from fastai.text import *
from functools import partial

# Initialize the Tokenizer with a custom molecular tokenizer and special tokens.
# pre_rules and post_rules are passed as empty lists, so no extra text rules are supplied.
tok = Tokenizer(partial(MolTokenizer, special_tokens=special_tokens), n_cpus=6, pre_rules=[], post_rules=[])

# Set batch size
bs = 128

# Prepare the data for training and validation using the augmented datasets.
# NOTE(review): text_cols=0 selects the first column, while smiles_augmentation
# defaults to a column named 'smiles' — confirm these are the same column.
data = TextLMDataBunch.from_df(
    path, train_aug, valid_aug, bs=bs, tokenizer=tok, 
    chunksize=50000, text_cols=0, max_vocab=60000, include_bos=False
)

# Show a batch of data for inspection
data.show_batch()

# Save the DataBunch to disk
data.save(f'{name}_databunch')

# Output the size of vocabulary and number of training examples
print(f"Vocabulary size: {len(data.vocab.itos)}")
print(f"Training dataset size: {len(data.train_ds)}")

# Load the saved DataBunch for further processing (same file name as saved above)
data_lm = load_data(path, f'{name}_databunch', bs=bs)

# Create a language model learner with the AWD_LSTM architecture.
# pretrained=False: no pretrained weights are requested for this learner.
learner = language_model_learner(data_lm, AWD_LSTM, drop_mult=1, pretrained=False)

# Display model architecture
print(learner.model)

# Set learning rate and scale it based on batch size
lr = 3e-3
# With bs = 128 this gives 3e-3 * 128 / 48 = 8e-3.
# NOTE(review): 48 is presumably the batch size the base rate was chosen for — confirm.
lr *= bs / 48  # Scale learning rate by batch size

# Unfreeze the model and train it for 100 epochs using one cycle learning rate policy
learner.unfreeze()
learner.fit_one_cycle(100, lr, moms=(0.8, 0.7))

# Define filenames for saving the model and vocabulary
lm_fns = [f'{name}_100_wt', f'{name}_100_vocab']

# Save the model weights (excluding optimizer) and vocabulary.
# The vocabulary is written as a .pkl file inside mdl_path.
learner.save(lm_fns[0], with_opt=False)
learner.data.vocab.save(mdl_path / (lm_fns[1] + '.pkl'))
