# Clustering on External Ligand Databases

This notebook applies the clustering workflow to two published ligand datasets, both of which describe ligand properties in a general (non–problem-specific) context:

- **Fey (LKB‑PP):** J. Jover, N. Fey, J. N. Harvey, G. C. Lloyd-Jones, A. G. Orpen, G. J. J. Owen-Smith, P. Murray, D. R. J. Hose, R. Osborne, M. Purdie, *Organometallics* **2012**, *31*, 5302–5306.  
- **Sigman:** J. J. Dotson, L. van Dijk, J. C. Timmerman, S. Grosslight, R. C. Walroth, F. Gosselin, K. Püntener, K. A. Mack, M. S. Sigman, *J. Am. Chem. Soc.* **2023**, *145*, 110–121.

For each dataset, clustering is performed both **with** and **without** feature selection to assess the extent to which general descriptors can differentiate ligands based on their experimentally observed speciation behavior.


## Fey Dataset (LKB-PP)

Configuration:

In [None]:
from pathlib import Path

# Fey (LKB-PP) dataset: CSV file read by the workflow cells below
DATA = Path("projects/ni(I)-co2/other/fey.csv")

# Experimentally tested ligands, identified by their dataset ID (stored as strings):
#   positives -> 208 = i-Pr-Xantphos, 210 = t-Bu-Xantphos, 289 = dtbpf
#   negatives -> 212 = Xantphos,      291 = dppf,          207 = Me-Xantphos
positives = ["208", "210", "289"]
negatives = ["212", "291", "207"]

# Reference pair handed to the feature-selection step (one positive, one negative ligand)
POSITIVE = "210"  # t-Bu-Xantphos
NEGATIVE = "212"  # Xantphos

Workflow without feature selection:

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from aixchem.dataset import Dataset
from aixchem.transform.preprocess import CorrelationAnalyzer, Scaler
from aixchem.model.cluster import Clusterer
from aixchem.model.optimization import Optimization

# Fey workflow without feature selection:
# load -> correlation analysis -> scaling -> sweep over k -> final KMeans fit -> look up tested ligands.

# Load data (semicolon-separated file, indexed by the "ID" column; `data.raw` is used further down)
data = Dataset(DATA, index="ID", store_raw=True, sep=";")

# Perform correlation analysis (Pearson, threshold 0.8)
# NOTE(review): presumably this drops features correlated above the threshold - confirm against aixchem
corr = CorrelationAnalyzer(method="pearson", threshold=0.8)
data = corr.fit(data).transform(data)

# Scale dataset with scikit-learn's StandardScaler
scaler = Scaler(StandardScaler)
data = scaler.fit(data).transform(data)

# KMeans settings shared by the k-sweep and the final fit; named once so the two cannot drift apart
seed = 42
n_init = 5000

# Set optimization parameters (k is scanned from 2 to 15 inclusive)
params = {
    "model": [KMeans],
    "random_state": [seed],
    "n_clusters": list(range(2, 16)),
    "n_init": [n_init],
    }

# Run optimization
opt = Optimization(obj=Clusterer, params=params)
optimization = opt.run(data, njobs=-1)  # Table of optimization metrics at different ks

# Run the clustering with optimized parameters
# NOTE(review): k is hard-coded here and is not read programmatically from `optimization`
n_clusters = 4
kmeans = Clusterer(KMeans, n_clusters=n_clusters, random_state=seed, n_init=n_init)
kmeans.fit(data)

# Get clustering results and display clusters of the experimentally tested ligands.
# Rows and column are selected in a single .loc call rather than by chained indexing.
data.raw["Cluster"] = kmeans.predict(data)
data.raw.loc[positives + negatives, "Cluster"]

ID
208    0
210    3
289    0
212    0
291    0
207    0
Name: Cluster, dtype: int32

Workflow with feature selection:

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from aixchem.dataset import Dataset
from aixchem.transform.preprocess import CorrelationAnalyzer, Scaler
from aixchem.transform.fselect import FeatureSeparation
from aixchem.model.cluster import Clusterer
from aixchem.model.optimization import Optimization

# Fey workflow with feature selection:
# load -> correlation analysis -> feature selection -> scaling -> sweep over k -> final KMeans fit -> look up tested ligands.

# Load data (semicolon-separated file, indexed by the "ID" column; `data.raw` is used further down)
data = Dataset(DATA, index="ID", store_raw=True, sep=";")

# Perform correlation analysis (Pearson, threshold 0.8)
# NOTE(review): presumably this drops features correlated above the threshold - confirm against aixchem
corr = CorrelationAnalyzer(method="pearson", threshold=0.8)
data = corr.fit(data).transform(data)

# Perform feature selection, fitted on the reference pair POSITIVE / NEGATIVE
# NOTE(review): presumably keeps the features that separate the two reference ligands by more
# than `threshold` - confirm against aixchem
fselect = FeatureSeparation()
data = fselect.fit(data, idx=POSITIVE, idy=NEGATIVE).transform(data, threshold=0.2)  # threshold corresponds to approx. mean across all features

# Scale dataset with scikit-learn's StandardScaler
scaler = Scaler(StandardScaler)
data = scaler.fit(data).transform(data)

# KMeans settings shared by the k-sweep and the final fit; named once so the two cannot drift apart
seed = 42
n_init = 5000

# Set optimization parameters (k is scanned from 2 to 15 inclusive)
params = {
    "model": [KMeans],
    "random_state": [seed],
    "n_clusters": list(range(2, 16)),
    "n_init": [n_init],
    }

# Run optimization
opt = Optimization(obj=Clusterer, params=params)
optimization = opt.run(data, njobs=-1)  # Table of optimization metrics at different ks

# Run the clustering with optimized parameters
# NOTE(review): k is hard-coded here and is not read programmatically from `optimization`
n_clusters = 4
kmeans = Clusterer(KMeans, n_clusters=n_clusters, random_state=seed, n_init=n_init)
kmeans.fit(data)

# Get clustering results and display clusters of the experimentally tested ligands.
# Rows and column are selected in a single .loc call rather than by chained indexing.
data.raw["Cluster"] = kmeans.predict(data)
data.raw.loc[positives + negatives, "Cluster"]

ID
208    0
210    0
289    0
212    0
291    1
207    1
Name: Cluster, dtype: int32

## Sigman Dataset

Configuration:

In [4]:
from pathlib import Path

# Sigman dataset: CSV file read by the workflow cells below
DATA = Path("projects/ni(I)-co2/other/sigman.csv")

# Experimentally tested ligands, identified by their dataset ID (stored as integers):
#   positives -> 441 = i-Pr-Xantphos, 468 = dtbpf
#   negatives -> 719 = Xantphos,      175 = dppf
positives = [441, 468]
negatives = [719, 175]

# Reference pair handed to the feature-selection step (one positive, one negative ligand).
# i-Pr-Xantphos is used retrospectively as the positive reference because
# t-Bu-Xantphos is not present in this dataset.
POSITIVE = 441  # i-Pr-Xantphos
NEGATIVE = 719  # Xantphos

Workflow without feature selection:

In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from aixchem.dataset import Dataset
from aixchem.transform.preprocess import CorrelationAnalyzer, Scaler
from aixchem.model.cluster import Clusterer
from aixchem.model.optimization import Optimization

# Sigman workflow without a feature-selection step:
# load -> correlation analysis -> scaling -> sweep over k -> final KMeans fit -> look up tested ligands.

# Load data (semicolon-separated file, indexed by the "ID" column; `data.raw` is used further down)
data = Dataset(DATA, index="ID", store_raw=True, sep=";")
# NOTE(review): the return value is discarded, so the "Ligand" column is only removed
# if Dataset.drop mutates in place - confirm against aixchem
data.drop(columns=["Ligand"])

# Perform correlation analysis (Pearson, threshold 0.8)
# NOTE(review): presumably this drops features correlated above the threshold - confirm against aixchem
corr = CorrelationAnalyzer(method="pearson", threshold=0.8)
data = corr.fit(data).transform(data)

# Scale dataset with scikit-learn's StandardScaler
scaler = Scaler(StandardScaler)
data = scaler.fit(data).transform(data)

# KMeans settings shared by the k-sweep and the final fit; named once so the two cannot drift apart
seed = 42
n_init = 500

# Set optimization parameters (k is scanned from 2 to 15 inclusive)
params = {
    "model": [KMeans],
    "random_state": [seed],
    "n_clusters": list(range(2, 16)),
    "n_init": [n_init],
    }

# Run optimization
opt = Optimization(obj=Clusterer, params=params)
optimization = opt.run(data, njobs=-1)  # Table of optimization metrics at different ks

# Run the clustering with optimized parameters
# NOTE(review): k is hard-coded here and is not read programmatically from `optimization`
n_clusters = 6
kmeans = Clusterer(KMeans, n_clusters=n_clusters, random_state=seed, n_init=n_init)
kmeans.fit(data)

# Get clustering results and display clusters of the experimentally tested ligands.
# Rows and column are selected in a single .loc call rather than by chained indexing.
data.raw["Cluster"] = kmeans.predict(data)
data.raw.loc[positives + negatives, "Cluster"]

ID
441    2
468    2
719    2
175    4
Name: Cluster, dtype: int32

In [6]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from aixchem.dataset import Dataset
from aixchem.transform.preprocess import CorrelationAnalyzer, Scaler
from aixchem.transform.fselect import FeatureSeparation
from aixchem.model.cluster import Clusterer
from aixchem.model.optimization import Optimization

# Sigman workflow with feature selection:
# load -> correlation analysis -> feature selection -> scaling -> sweep over k -> final KMeans fit -> look up tested ligands.

# Load data (semicolon-separated file, indexed by the "ID" column; `data.raw` is used further down)
data = Dataset(DATA, index="ID", store_raw=True, sep=";")
# NOTE(review): the return value is discarded, so the "Ligand" column is only removed
# if Dataset.drop mutates in place - confirm against aixchem
data.drop(columns=["Ligand"])

# Perform correlation analysis (Pearson, threshold 0.8)
# NOTE(review): presumably this drops features correlated above the threshold - confirm against aixchem
corr = CorrelationAnalyzer(method="pearson", threshold=0.8)
data = corr.fit(data).transform(data)

# Perform feature selection, fitted on the reference pair POSITIVE / NEGATIVE
# NOTE(review): presumably keeps the features that separate the two reference ligands by more
# than `threshold` - confirm against aixchem
fselect = FeatureSeparation()
data = fselect.fit(data, idx=POSITIVE, idy=NEGATIVE).transform(data, threshold=0.15)  # threshold corresponds to approx. mean across all features

# Scale dataset with scikit-learn's StandardScaler
scaler = Scaler(StandardScaler)
data = scaler.fit(data).transform(data)

# KMeans settings shared by the k-sweep and the final fit; named once so the two cannot drift apart
seed = 42
n_init = 500

# Set optimization parameters (k is scanned from 2 to 15 inclusive)
params = {
    "model": [KMeans],
    "random_state": [seed],
    "n_clusters": list(range(2, 16)),
    "n_init": [n_init],
    }

# Run optimization
opt = Optimization(obj=Clusterer, params=params)
optimization = opt.run(data, njobs=-1)  # Table of optimization metrics at different ks

# Run the clustering with optimized parameters
# NOTE(review): k is hard-coded here and is not read programmatically from `optimization`
n_clusters = 4
kmeans = Clusterer(KMeans, n_clusters=n_clusters, random_state=seed, n_init=n_init)
kmeans.fit(data)

# Get clustering results and display clusters of the experimentally tested ligands.
# Rows and column are selected in a single .loc call rather than by chained indexing.
data.raw["Cluster"] = kmeans.predict(data)
data.raw.loc[positives + negatives, "Cluster"]

ID
441    2
468    2
719    2
175    1
Name: Cluster, dtype: int32