# Trialkylphosphine Ni<sup>(I)</sup> dimers



Set important configuration variables:

In [1]:
import pandas as pd
from pathlib import Path

# Project folder that holds both the input data and the generated results
_PROJECT_DIR = Path("projects") / "ni(I)-dimer"

# Input: .csv file with the descriptors
DATA = _PROJECT_DIR / "descriptors.csv"

# Output: folder for all result files. It is created up front (together with
# any missing parent folders) and left alone if it already exists.
RESULTS = _PROJECT_DIR / "results"
RESULTS.mkdir(exist_ok=True, parents=True)

# IDs of the two reference ligands
POSITIVE = 16  # positive reference: P(t-Bu)3
NEGATIVE = 19  # negative reference: P(Np)3

## Dataset

Load the .csv file and perform basic cleaning:

In [2]:
from aixchem.dataset import Dataset

# Column of the .csv file that is used as the index of the dataset
index_column = "LKB-P ID"
# Columns that are removed during cleaning
cols_to_drop = ["SMILES"]

# NOTE(review): store_raw=True presumably keeps an unmodified copy of the
# loaded table (a later cell reads `data.raw`) — confirm against aixchem.Dataset
data = Dataset(DATA, index=index_column, store_raw=True)
# Cleaning step: dropna(axis=0) followed by dropping the columns listed above.
# NOTE(review): the return value is discarded, so this line only has an effect
# if these methods modify `data` in place — confirm against aixchem.Dataset
data.dropna(axis=0).drop(columns=cols_to_drop)

# Preview the first rows of the feature table
data.X.head()

Unnamed: 0_level_0,L - Dipole,L - HOMO,L - LUMO,L - NMR(P),L - Q(P),L2Ni - %V_Bur(Ni),L2Ni - AMS_0(Ni),L2Ni - AMS_0(P2Ni),"L2Ni - Angle(P,Ni,P)",L2Ni - Dipole,...,L2NiBr2 - Q(P),"L2NiBr2 - R(Ni,Br)","L2NiBr2 - R(Ni,P)","L2NiBr2 - Sterimol_B1(Ni,P)","L2NiBr2 - Sterimol_B5(Ni,P)","L2NiBr2 - Sterimol_L(Ni,P)","LNiBr2 - BO(Ni,Br)",LNiBr2 - NMR(Br),LNiBr2 - Q(Br),"LNiBr2 - R(Ni,Br)"
LKB-P ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15,2.1888,-0.2132,-0.00839,253.4359,0.81185,80.885674,18.933435,33.087927,179.882688,0.0093,...,1.17127,2.373452,2.314027,3.913067,5.626779,7.705016,0.6422,3136.8225,-0.448595,2.293036
16,1.8797,-0.21399,-0.00941,226.2353,0.84285,89.475587,19.31442,31.697026,179.908397,0.011,...,1.131775,2.40147,2.447209,3.918521,4.918677,6.882165,0.648,3268.03915,-0.42827,2.297956
19,1.505,-0.22119,-0.01007,355.3232,0.76497,96.575475,12.661542,30.335862,179.974377,0.0022,...,1.07671,2.37849,2.348787,4.447298,6.137511,7.209989,0.64135,3025.9104,-0.438475,2.298392
36,2.031,-0.21555,-0.00809,262.2426,0.80504,80.469667,19.375653,33.971564,179.87766,0.0813,...,1.162135,2.375078,2.314604,3.584775,5.018594,7.697304,0.6488,3149.61295,-0.44464,2.290767
37,2.1266,-0.22092,-0.00891,256.7924,0.81329,83.654692,19.205343,33.056008,177.179049,0.1983,...,1.17073,2.377878,2.364396,3.375833,4.867783,6.755406,0.6505,3289.55745,-0.42926,2.285924


### Correlation Analysis

Perform correlation analysis and remove features whose pairwise Pearson correlation exceeds a threshold of 0.9:

In [3]:
from aixchem.transform.preprocess import CorrelationAnalyzer

# Configure a Pearson correlation analysis with a threshold of 0.9, fit it on
# the dataset and continue with the transformed dataset (according to the
# notebook text, features exceeding the threshold are removed)
corr = CorrelationAnalyzer(method="pearson", threshold=0.9, sort=False)
data = corr.fit(data).transform(data)

# Save the full correlation matrix
corr.matrix.to_csv(RESULTS / "correlation.csv")

# Optional: also save the correlation matrix after dropping highly correlated (> 0.9) features
# corr.matrix_after.to_csv(RESULTS / "corr_filter.csv")

### Feature Selection

Perform two-sample feature selection and select the 33 features that cause the widest separation between positive and negative reference ligands:

In [4]:
from aixchem.transform.fselect import FeatureSeparation

# Fit the feature separation on the two reference ligands (idx = positive
# reference, idy = negative reference) and keep only the 33 best features
fselect = FeatureSeparation()
data = fselect.fit(data, idx=POSITIVE, idy=NEGATIVE).transform(data, n_best=33)

# Save the ranking of the features
fselect.ranking.to_csv(RESULTS / "feature_ranking.csv")

### Feature Scaling

Scale the features using scikit-learn's `StandardScaler`:

In [5]:
from sklearn.preprocessing import StandardScaler
from aixchem.transform.preprocess import Scaler

# Wrap scikit-learn's StandardScaler (the class itself is passed, not an
# instance), fit it on the dataset and continue with the transformed dataset
scaler = Scaler(StandardScaler)
data = scaler.fit(data).transform(data)

## Principal Component Analysis

Perform principal component analysis on the preprocessed dataset:

In [6]:
from aixchem.transform.decomposition import PCA

# Fit a PCA with 4 components on the preprocessed dataset and keep the
# transformed data in `pc_data`
pca = PCA(n_components=4)
pc_data = pca.fit_transform(data)

pca.summary.to_csv(RESULTS / "pca_summary.csv")  # Save the PCA summary
# NOTE(review): `loadings` presumably holds the per-feature loadings of each
# component rather than eigenvalues — confirm against aixchem's PCA
pca.loadings.to_csv(RESULTS / "pca_loadings.csv")  # Save the PCA loadings

## k-Means Clustering

Optimization of cluster number k:

In [7]:
from sklearn.cluster import KMeans
from aixchem.model.cluster import Clusterer
from aixchem.model.optimization import Optimization

# Search space for the optimization: the number of clusters k ranges from
# 2 to 10, every other setting has a single value
params = dict(
    model=[KMeans],
    random_state=[42],
    n_clusters=[*range(2, 11)],
    n_init=[5000],
)

opt = Optimization(obj=Clusterer, params=params)
opt.run(data, njobs=-1)  # njobs=-1: run in parallel

# Note: To assess per-sample silhouette scores, run the optimization sequentially:
# opt.run(data, njobs=1)
# for model in opt.grid:
#     print(model.params, model.silhouettes)

# Save the optimization results
opt.results.to_csv(RESULTS / "optimization.csv")

Run k-means clustering with k=5 clusters (determined from the optimization):

In [8]:
from sklearn.cluster import KMeans
from aixchem.model.cluster import Clusterer

# Settings of the final k-means model: k=5, with the same seed and n_init
# that were used in the optimization
_kmeans_kwargs = {"n_clusters": 5, "random_state": 42, "n_init": 5000}

kmeans = Clusterer(KMeans, **_kmeans_kwargs)
kmeans.fit(data)

# Cluster assignment for the same dataset the model was fitted on
clusters = kmeans.predict(data)

Create results dataframe:

In [9]:
import pandas as pd

# Place `data.raw` and the PCA-transformed data side by side (column-wise),
# then add the cluster assignment as the "Cluster" column
results = pd.concat([data.raw, pc_data.X], axis="columns").assign(Cluster=clusters)
results.to_csv(RESULTS / "results.csv")

# Display ligands that are stored in the cluster of the positive reference:
# results.loc[results["Cluster"] == results.loc[POSITIVE, "Cluster"], "SMILES"]

### Statistical considerations

Perform a statistical evaluation of the clustering (assess cluster robustness) by rerunning the algorithm 1001 times with different random seeds (0–1000) and display the resulting scores (number of times that a ligand was clustered with the references divided by the total number of clusterings performed):

In [10]:
from aixchem.model.cluster import ClusterRobustness

# Random seeds 0..1000 inclusive, i.e. 1001 seeds in total
seeds = list(range(1001))

stats = ClusterRobustness(kmeans, random_states=seeds)
stats.run(data, njobs=-1)

# Evaluate the candidates against the positive reference and save the result
stats_data = stats.check_candidates(POSITIVE)
stats_data.to_csv(RESULTS / "robustness.csv")