In [1]:
import os
import re
import numpy as np 
import pandas as pd 

from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

#### Dataset: coordination number classification

Import raw data

In [2]:
# Load every raw CSV export in data/raw_data/ and stack them into one frame.
# The listing is sorted because os.listdir returns entries in arbitrary order
# and the later drop_duplicates step keeps the first occurrence: without a
# fixed file order the surviving row could differ between runs or machines.
raw_data_dir = 'data/raw_data/'
raw_columns = ['Name', 'LigandSmiles', 'Metal', 'bondorder']
raw_frames = [
    pd.read_csv(os.path.join(raw_data_dir, file), usecols=raw_columns)
    for file in sorted(os.listdir(raw_data_dir))
    if file.endswith('.csv')
]
# Concatenate once instead of growing the frame inside the loop (which
# re-copies all accumulated rows on every iteration). When the directory
# holds no CSV files, fall back to an empty frame as before.
coordnumber = pd.concat(raw_frames) if raw_frames else pd.DataFrame()

Canonicalize SMILES and remove duplicates

In [3]:
def canonicalize_smiles(smiles):
    """Return the RDKit-canonical form of a SMILES string, or NaN on failure.

    The string is parsed with ``sanitize=False`` and written back with
    ``Chem.MolToSmiles``.

    Parameters
    ----------
    smiles : str
        SMILES string to canonicalize. Any non-string value (e.g. a missing
        value read from CSV) yields NaN.

    Returns
    -------
    str or float
        The canonical SMILES, or ``np.nan`` when the input is not a string,
        cannot be parsed, or RDKit raises while processing it.
    """
    if not isinstance(smiles, str):
        return np.nan
    try:
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        # MolFromSmiles signals a parse failure by returning None.
        if mol is None:
            return np.nan
        return Chem.MolToSmiles(mol)
    except Exception:
        # Best-effort: any RDKit error marks the entry as unusable. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        return np.nan

# Canonicalize every ligand SMILES, then discard rows with a missing value in
# any column (canonicalize_smiles returns NaN on failure) and keep a single
# row per (LigandSmiles, Metal) pair, renumbering the index from zero.
canonical_ligands = coordnumber['LigandSmiles'].apply(canonicalize_smiles)
coordnumber = (
    coordnumber
    .assign(LigandSmiles=canonical_ligands)
    .dropna()
    .drop_duplicates(subset=['LigandSmiles', 'Metal'])
    .reset_index(drop=True)
)

Replace atomic numbers with element symbols and keep only the selected metals

In [4]:
# Element symbols for atomic numbers 13 (Al) through 100 (Fm), listed in
# increasing atomic-number order; each string below covers one period.
_symbols_from_al = (
    'Al Si P S Cl Ar '
    'K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr '
    'Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe '
    'Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu '
    'Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn '
    'Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm'
).split()

# Lookup table: atomic number -> element symbol (first symbol gets key 13).
atom_symbols = dict(enumerate(_symbols_from_al, start=13))

# Metals retained for the classification datasets.
target_metals = ['Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi']

# Translate atomic numbers to element symbols. ``.get`` returns None for a
# number that is not in atom_symbols instead of raising KeyError, so such
# rows are simply removed by the whitelist filter below rather than aborting
# the whole cell.
metal_symbols = coordnumber['Metal'].apply(lambda number: atom_symbols.get(int(number)))
coordnumber = coordnumber.assign(Metal=metal_symbols)

# Keep only rows whose metal is in the whitelist and renumber the index.
coordnumber = coordnumber[coordnumber['Metal'].isin(target_metals)]
coordnumber = coordnumber.reset_index(drop=True)

#### Dataset: oxidation state classification

Import raw data

In [10]:
# Header-less, comma-separated text file; the two columns are named
# 'smiles' and 'Name' on load.
oxidationstate = pd.read_csv(
    'data/sample_new_46_carbenes.txt',
    sep=',',
    header=None,
    names=['smiles', 'Name'],
)

Merge the two data frames on the `Name` column to attach the LigandSmiles, Metal and bondorder columns; entries without a match are dropped

In [11]:
# Left-join the coordination-number columns onto each oxidation-state entry
# via the shared 'Name' key, then drop every row with a missing value in any
# column (which includes entries whose 'Name' has no match in coordnumber).
coordnumber_columns = coordnumber[['Name', 'LigandSmiles', 'Metal', 'bondorder']]
oxidationstate = oxidationstate.merge(coordnumber_columns, on='Name', how='left').dropna()

Extract the oxidation state of the metal from the charge written on its atom in the SMILES string

In [12]:
def extract_oxidation_state(row):
    """Read the charge written on the metal's bracket atom in a SMILES string.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['smiles']`` (the SMILES string to search) and
        ``row['Metal']`` (the element symbol of the central atom).

    Returns
    -------
    str
        The charge of the first ``[<Metal><charge>]`` atom found in the
        string: ``'0'`` when no charge is written, ``'+1'`` / ``'-1'`` for a
        single sign, ``'+2'`` / ``'-2'`` etc. for repeated signs (``++``,
        ``--``), otherwise the charge text exactly as written (e.g. ``'+3'``).
        Returns the sentinel ``'999'`` when no such bracket atom is present;
        callers filter on that value.
    """
    input_string = row['smiles']
    central_atom = row['Metal']

    # re.escape keeps the symbol literal inside the pattern. The charge is
    # either a run of identical signs ('+', '++', '--', ...) or an optionally
    # signed number ('+2', '-3', '2'); mixed forms such as '+-' do not match.
    pattern = (
        rf"\[{re.escape(str(central_atom))}"
        r"(?:(?P<repeated>\++|-+)|(?P<numeric>[+-]?\d+))?\]"
    )
    match = re.search(pattern, input_string)
    if match is None:
        return '999'

    repeated = match.group('repeated')
    if repeated is not None:
        # '+' -> '+1', '++' -> '+2', '-' -> '-1', '--' -> '-2', ...
        return f"{repeated[0]}{len(repeated)}"

    numeric = match.group('numeric')
    # No charge text at all (e.g. '[Fe]') means a neutral atom.
    return numeric if numeric is not None else '0'

# Label every row with the metal's oxidation state, then drop the rows that
# carry the '999' sentinel returned when no matching bracket atom was found.
oxidation_labels = oxidationstate.apply(extract_oxidation_state, axis=1)
oxidationstate = oxidationstate.assign(oxidation_states=oxidation_labels)
has_label = oxidationstate['oxidation_states'] != '999'
oxidationstate = oxidationstate[has_label]

Remove oxidation-state classes with 1000 or fewer samples, then encode the remaining classes as integer labels

In [13]:
# Keep only the oxidation-state classes that occur more than 1000 times, then
# order the rows by class and renumber the index.
min_class_size = 1000
oxidationstate = (
    oxidationstate
    .groupby('oxidation_states')
    .filter(lambda group: len(group) > min_class_size)
    .sort_values('oxidation_states')
    .reset_index(drop=True)
)

# Integer class labels, numbered in order of first appearance in the sorted
# column (pd.factorize returns the codes as its first element).
class_codes = pd.factorize(oxidationstate['oxidation_states'])[0]
oxidationstate['oxidation_states_classification'] = class_codes

Export to csv

In [14]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('datasets', exist_ok=True)
oxidationstate.to_csv('datasets/oxidationstate_46k.csv', index=False)

#### Continue with the coordination number classification dataset

Remove coordination-number classes with 1000 or fewer samples

In [15]:
# Keep only the 'bondorder' classes that occur more than 1000 times and
# renumber the index.
min_class_size = 1000
coordnumber = (
    coordnumber
    .groupby('bondorder')
    .filter(lambda group: len(group) > min_class_size)
    .reset_index(drop=True)
)

Remove duplicates

In [16]:
# Keep the first row for each (LigandSmiles, Metal) pair and renumber the index.
coordnumber = (
    coordnumber
    .drop_duplicates(subset=['LigandSmiles', 'Metal'])
    .reset_index(drop=True)
)

Export to csv

In [17]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('datasets', exist_ok=True)
coordnumber.to_csv('datasets/coordnumber.csv', index=False)

Read generated csv file - required to process the next datasets

In [18]:
# Reload the exported coordination-number dataset from disk so the following
# steps start from the file contents rather than the in-memory frame.
coordnumber = pd.read_csv(
    filepath_or_buffer='datasets/coordnumber.csv',
)