In [1]:
import numpy as np 
import pandas as pd 

from electrum import calculate_fingerprints
import electrum_os

from sklearn.model_selection import  KFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

#### Preliminary function definitions

Calculate metrics

In [2]:
def calculate_metrics(y_true, y_pred, y_true_onehot, y_onehot):
    """Compute a fixed set of multi-class classification metrics.

    Parameters
    ----------
    y_true, y_pred
        True and predicted class labels; passed to the accuracy, precision,
        recall and F1 scorers.
    y_true_onehot, y_onehot
        Binarized true labels and the scores compared against them; passed to
        ``roc_auc_score`` with ``multi_class='ovr'``.

    Returns
    -------
    (columns, metrics)
        Two parallel lists: the metric names and their values, in the order
        ROC AUC (macro, weighted), accuracy, then precision, recall and F1
        (each macro, then weighted).
    """
    averaging_modes = ('macro', 'weighted')
    named_scores = []

    # ROC AUC is the only metric computed from the one-hot inputs
    for mode in averaging_modes:
        auc_value = roc_auc_score(y_true_onehot, y_onehot, average=mode, multi_class='ovr')
        named_scores.append((f'roc_auc_ovr_{mode}', auc_value))

    named_scores.append(('accuracy', accuracy_score(y_true, y_pred)))

    # Label-based metrics, each reported with macro and weighted averaging
    label_scorers = (('precision', precision_score), ('recall', recall_score), ('f1', f1_score))
    for prefix, scorer in label_scorers:
        for mode in averaging_modes:
            named_scores.append((f'{prefix}_{mode}', scorer(y_true, y_pred, average=mode)))

    columns = [name for name, _ in named_scores]
    metrics = [value for _, value in named_scores]

    return columns, metrics

#### Oxidation States

Import dataset

In [3]:
# Load the oxidation-state dataset, reading 'oxidation_states' as strings,
# and keep only the first row for each distinct 'smiles' value.
oxidationstate = (
    pd.read_csv('datasets/oxidationstate_46k.csv', dtype={'oxidation_states': str})
    .drop_duplicates(subset='smiles')
)

Calculate fingerprints and prepare for training

In [None]:
# Seed NumPy's global RNG so the label permutation below is reproducible
np.random.seed(42)

# Features: fingerprints computed from the ligand SMILES and metal columns
ligand_smiles = oxidationstate['LigandSmiles']
metals = oxidationstate['Metal']
X = np.array(calculate_fingerprints(ligand_smiles, metals, radius=2, n_bits=512))

# Targets: the oxidation-state class labels, plus a randomly permuted copy
# used for the scrambled-label cross-validation run
y = np.array(oxidationstate['oxidation_states_classification'])
y_scrambled = np.random.permutation(y)

5-fold cross-validation: scrambled labels

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(512, 256, 128, 64, 32), max_iter=1000, random_state=42)

# Collect one list of metric values per fold and build the DataFrame once after
# the loop, instead of re-copying every earlier row with pd.concat on each fold.
columns = None
fold_metrics = []
for train, test in kf.split(X):

    # Fit model on the scrambled labels of the training split
    mlp.fit(X[train], y_scrambled[train])

    # Predict
    y_pred = mlp.predict(X[test])

    # Reshape y_true and y_pred to onehot (binarizer is fitted on the training labels)
    label_binarizer = LabelBinarizer().fit(y_scrambled[train])
    y_true_onehot = label_binarizer.transform(y_scrambled[test])
    # NOTE(review): these are one-hot *hard* predictions, so the ROC AUC values are
    # not computed from class probabilities (e.g. mlp.predict_proba) - confirm intended.
    y_pred_onehot = label_binarizer.transform(y_pred)

    # Calculate metrics
    columns, metrics = calculate_metrics(y_scrambled[test], y_pred, y_true_onehot, y_pred_onehot)
    fold_metrics.append(metrics)

# One row per fold, one column per metric
os_scrambled = pd.DataFrame(fold_metrics, columns=columns)

In [None]:
# Summarise from the per-fold rows only. Dropping any existing 'Mean'/'Std' rows
# makes this cell safe to re-run: otherwise a second run would average over the
# previous summary rows and then fail on the index-length mismatch.
fold_scores = os_scrambled.drop(index=['Mean', 'Std'], errors='ignore')
mean_row = fold_scores.mean()
std_row = fold_scores.std()

os_scrambled = pd.concat([fold_scores, mean_row.to_frame().T, std_row.to_frame().T], ignore_index=True)
# Derive the fold labels from the number of fold rows rather than hard-coding five
os_scrambled.index = [f'Fold {i}' for i in range(1, len(fold_scores) + 1)] + ['Mean', 'Std']
# NOTE(review): index=False means the row labels assigned above are not written to the CSV.
os_scrambled.to_csv('results/oxidationstate_46k_mlp_scrambled.csv', index=False)

5-fold cross-validation: true labels

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(512, 256, 128, 64, 32), max_iter=1000, random_state=42)

# Collect one list of metric values per fold and build the DataFrame once after
# the loop, instead of re-copying every earlier row with pd.concat on each fold.
columns = None
fold_metrics = []
for train, test in kf.split(X):

    # Fit model on the true labels of the training split
    mlp.fit(X[train], y[train])

    # Predict
    y_pred = mlp.predict(X[test])

    # Reshape y_true and y_pred to onehot (binarizer is fitted on the training labels)
    label_binarizer = LabelBinarizer().fit(y[train])
    y_true_onehot = label_binarizer.transform(y[test])
    # NOTE(review): these are one-hot *hard* predictions, so the ROC AUC values are
    # not computed from class probabilities (e.g. mlp.predict_proba) - confirm intended.
    y_pred_onehot = label_binarizer.transform(y_pred)

    # Calculate metrics
    columns, metrics = calculate_metrics(y[test], y_pred, y_true_onehot, y_pred_onehot)
    fold_metrics.append(metrics)

# One row per fold, one column per metric
os_true = pd.DataFrame(fold_metrics, columns=columns)

In [None]:
# Summarise from the per-fold rows only. Dropping any existing 'Mean'/'Std' rows
# makes this cell safe to re-run: otherwise a second run would average over the
# previous summary rows and then fail on the index-length mismatch.
fold_scores = os_true.drop(index=['Mean', 'Std'], errors='ignore')
mean_row = fold_scores.mean()
std_row = fold_scores.std()

os_true = pd.concat([fold_scores, mean_row.to_frame().T, std_row.to_frame().T], ignore_index=True)
# Derive the fold labels from the number of fold rows rather than hard-coding five
os_true.index = [f'Fold {i}' for i in range(1, len(fold_scores) + 1)] + ['Mean', 'Std']
# NOTE(review): index=False means the row labels assigned above are not written to the CSV.
os_true.to_csv('results/oxidationstate_46k_mlp_true.csv', index=False)

#### Coordination numbers on Oxidation State Dataset

Calculate fingerprints and prepare for training

In [27]:
np.random.seed(42)

# Bond-order classes retained for training: 2 through 13 inclusive
classes_to_keep = list(range(2, 14))

# Fingerprints and bond-order labels for every row of the oxidation-state dataset
ligand_smiles = oxidationstate['LigandSmiles']
metals = oxidationstate['Metal']
all_fingerprints = np.array(calculate_fingerprints(ligand_smiles, metals, radius=2, n_bits=512))
all_bondorders = np.array(oxidationstate['bondorder'].values.tolist())

# Keep only the samples whose label is one of the retained classes
mask = np.isin(all_bondorders, classes_to_keep)
X, y = all_fingerprints[mask], all_bondorders[mask]

5-fold cross-validation: true labels

In [28]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(512, 256, 128, 64, 32), max_iter=1000, random_state=42)

# Collect one list of metric values per fold and build the DataFrame once after
# the loop, instead of re-copying every earlier row with pd.concat on each fold.
columns = None
fold_metrics = []
for train, test in kf.split(X):

    # Fit model on the true labels of the training split
    mlp.fit(X[train], y[train])

    # Predict
    y_pred = mlp.predict(X[test])

    # Reshape y_true and y_pred to onehot (binarizer is fitted on the training labels)
    label_binarizer = LabelBinarizer().fit(y[train])
    y_true_onehot = label_binarizer.transform(y[test])
    # NOTE(review): these are one-hot *hard* predictions, so the ROC AUC values are
    # not computed from class probabilities (e.g. mlp.predict_proba) - confirm intended.
    y_pred_onehot = label_binarizer.transform(y_pred)

    # Calculate metrics
    columns, metrics = calculate_metrics(y[test], y_pred, y_true_onehot, y_pred_onehot)
    fold_metrics.append(metrics)

# One row per fold, one column per metric
bo_true = pd.DataFrame(fold_metrics, columns=columns)

In [30]:
# Summarise from the per-fold rows only. Dropping any existing 'Mean'/'Std' rows
# makes this cell safe to re-run: otherwise a second run would average over the
# previous summary rows and then fail on the index-length mismatch.
fold_scores = bo_true.drop(index=['Mean', 'Std'], errors='ignore')
mean_row = fold_scores.mean()
std_row = fold_scores.std()

bo_true = pd.concat([fold_scores, mean_row.to_frame().T, std_row.to_frame().T], ignore_index=True)
# Derive the fold labels from the number of fold rows rather than hard-coding five
bo_true.index = [f'Fold {i}' for i in range(1, len(fold_scores) + 1)] + ['Mean', 'Std']
# NOTE(review): index=False means the row labels assigned above are not written to the CSV.
bo_true.to_csv('results/oxidationstate_bo_normalelectrum_true.csv', index=False)

Modified ELECTRUM with oxidation states

In [31]:
# Integer copy of the string-typed 'oxidation_states' column, converted element by element
oxidationstate['OS'] = oxidationstate['oxidation_states'].apply(int)

In [34]:
np.random.seed(42)

classes_to_keep = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

# Walk the three required columns in lockstep instead of using iterrows(),
# which builds a full Series for every row and yields an index that was unused.
X = np.array([
    electrum_os.calculate_fingerprint(smiles, metal, ox_state, radius=2, n_bits=512)
    for smiles, metal, ox_state in zip(
        oxidationstate['LigandSmiles'], oxidationstate['Metal'], oxidationstate['OS']
    )
])
y = np.array(oxidationstate['bondorder'])

# Keep only the samples whose bond-order label is one of the retained classes
mask = np.isin(y, classes_to_keep)
X = X[mask]
y = y[mask]

5-fold cross-validation: true labels and OS-augmented ELECTRUM

In [35]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(512, 256, 128, 64, 32), max_iter=1000, random_state=42)

# Collect one list of metric values per fold and build the DataFrame once after
# the loop, instead of re-copying every earlier row with pd.concat on each fold.
columns = None
fold_metrics = []
for train, test in kf.split(X):

    # Fit model on the true labels of the training split
    mlp.fit(X[train], y[train])

    # Predict
    y_pred = mlp.predict(X[test])

    # Reshape y_true and y_pred to onehot (binarizer is fitted on the training labels)
    label_binarizer = LabelBinarizer().fit(y[train])
    y_true_onehot = label_binarizer.transform(y[test])
    # NOTE(review): these are one-hot *hard* predictions, so the ROC AUC values are
    # not computed from class probabilities (e.g. mlp.predict_proba) - confirm intended.
    y_pred_onehot = label_binarizer.transform(y_pred)

    # Calculate metrics
    columns, metrics = calculate_metrics(y[test], y_pred, y_true_onehot, y_pred_onehot)
    fold_metrics.append(metrics)

# One row per fold, one column per metric.
# NOTE(review): 'os_true' is also the name used for the oxidation-state results
# earlier in this notebook; this assignment replaces that frame.
os_true = pd.DataFrame(fold_metrics, columns=columns)

In [36]:
# Summarise from the per-fold rows only. Dropping any existing 'Mean'/'Std' rows
# makes this cell safe to re-run: otherwise a second run would average over the
# previous summary rows and then fail on the index-length mismatch.
fold_scores = os_true.drop(index=['Mean', 'Std'], errors='ignore')
mean_row = fold_scores.mean()
std_row = fold_scores.std()

os_true = pd.concat([fold_scores, mean_row.to_frame().T, std_row.to_frame().T], ignore_index=True)
# Derive the fold labels from the number of fold rows rather than hard-coding five
os_true.index = [f'Fold {i}' for i in range(1, len(fold_scores) + 1)] + ['Mean', 'Std']
# NOTE(review): index=False means the row labels assigned above are not written to the CSV.
os_true.to_csv('results/oxidationstate_bo_oselectrum_true.csv', index=False)

#### Coordination numbers

Import dataset

In [None]:
# Load the coordination-number dataset with pandas' default parsing options
coordnumber = pd.read_csv(filepath_or_buffer='datasets/coordnumber.csv')

Calculate fingerprints and prepare for training

In [None]:
# Seed NumPy's global RNG so the label permutation below is reproducible
np.random.seed(42)

# Features: fingerprints computed from the ligand SMILES and metal columns
ligand_smiles = coordnumber['LigandSmiles']
metals = coordnumber['Metal']
X = np.array(calculate_fingerprints(ligand_smiles, metals, radius=2, n_bits=512))

# Targets: the 'bondorder' labels, plus a randomly permuted copy used for the
# scrambled-label cross-validation run
y = np.array(coordnumber['bondorder'])
y_scrambled = np.random.permutation(y)

5-fold cross-validation: scrambled labels

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(512, 256, 128, 64, 32), max_iter=1000, random_state=42)

# Collect one list of metric values per fold and build the DataFrame once after
# the loop, instead of re-copying every earlier row with pd.concat on each fold.
columns = None
fold_metrics = []
for train, test in kf.split(X):

    # Fit model on the scrambled labels of the training split
    mlp.fit(X[train], y_scrambled[train])

    # Predict
    y_pred = mlp.predict(X[test])

    # Reshape y_true and y_pred to onehot (binarizer is fitted on the training labels)
    label_binarizer = LabelBinarizer().fit(y_scrambled[train])
    y_true_onehot = label_binarizer.transform(y_scrambled[test])
    # NOTE(review): these are one-hot *hard* predictions, so the ROC AUC values are
    # not computed from class probabilities (e.g. mlp.predict_proba) - confirm intended.
    y_pred_onehot = label_binarizer.transform(y_pred)

    # Calculate metrics
    columns, metrics = calculate_metrics(y_scrambled[test], y_pred, y_true_onehot, y_pred_onehot)
    fold_metrics.append(metrics)

# One row per fold, one column per metric
bo_scrambled = pd.DataFrame(fold_metrics, columns=columns)

In [None]:
# Summarise from the per-fold rows only. Dropping any existing 'Mean'/'Std' rows
# makes this cell safe to re-run: otherwise a second run would average over the
# previous summary rows and then fail on the index-length mismatch.
fold_scores = bo_scrambled.drop(index=['Mean', 'Std'], errors='ignore')
mean_row = fold_scores.mean()
std_row = fold_scores.std()

bo_scrambled = pd.concat([fold_scores, mean_row.to_frame().T, std_row.to_frame().T], ignore_index=True)
# Derive the fold labels from the number of fold rows rather than hard-coding five
bo_scrambled.index = [f'Fold {i}' for i in range(1, len(fold_scores) + 1)] + ['Mean', 'Std']
# NOTE(review): index=False means the row labels assigned above are not written to the CSV.
bo_scrambled.to_csv('results/coordnumber_mlp_scrambled.csv', index=False)

5-fold cross-validation: true labels

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(512, 256, 128, 64, 32), max_iter=1000, random_state=42)

# Collect one list of metric values per fold and build the DataFrame once after
# the loop, instead of re-copying every earlier row with pd.concat on each fold.
columns = None
fold_metrics = []
for train, test in kf.split(X):

    # Fit model on the true labels of the training split
    mlp.fit(X[train], y[train])

    # Predict
    y_pred = mlp.predict(X[test])

    # Reshape y_true and y_pred to onehot (binarizer is fitted on the training labels)
    label_binarizer = LabelBinarizer().fit(y[train])
    y_true_onehot = label_binarizer.transform(y[test])
    # NOTE(review): these are one-hot *hard* predictions, so the ROC AUC values are
    # not computed from class probabilities (e.g. mlp.predict_proba) - confirm intended.
    y_pred_onehot = label_binarizer.transform(y_pred)

    # Calculate metrics
    columns, metrics = calculate_metrics(y[test], y_pred, y_true_onehot, y_pred_onehot)
    fold_metrics.append(metrics)

# One row per fold, one column per metric
bo_true = pd.DataFrame(fold_metrics, columns=columns)

In [None]:
# Summarise from the per-fold rows only. Dropping any existing 'Mean'/'Std' rows
# makes this cell safe to re-run: otherwise a second run would average over the
# previous summary rows and then fail on the index-length mismatch.
fold_scores = bo_true.drop(index=['Mean', 'Std'], errors='ignore')
mean_row = fold_scores.mean()
std_row = fold_scores.std()

bo_true = pd.concat([fold_scores, mean_row.to_frame().T, std_row.to_frame().T], ignore_index=True)
# Derive the fold labels from the number of fold rows rather than hard-coding five
bo_true.index = [f'Fold {i}' for i in range(1, len(fold_scores) + 1)] + ['Mean', 'Std']
# NOTE(review): index=False means the row labels assigned above are not written to the CSV.
bo_true.to_csv('results/coordnumber_mlp_true.csv', index=False)