# Trialkylphosphine Ni<sup>(I)</sup> dimers



Path settings:

In [None]:
import sys
from pathlib import Path

# Make the directory two levels above the working directory importable,
# so that the custom `aixchem` package can be found.
sys.path.append(str(Path.cwd().parent.parent))

# Descriptor table (.csv) expected in the current working directory
CSV = Path.cwd().joinpath("descriptors.csv")

Define the IDs of the positive and negative reference ligands:

In [None]:
# ID of the positive reference ligand: P(t-Bu)3
POSITIVE = 16
# ID of the negative reference ligand: P(Np)3
NEGATIVE = 19

## Dataset

Load .csv-file and drop unnecessary columns:

In [None]:
from aixchem.data.handler import TabularDataHandler

# Build the data handler from the .csv file. `index=0` and `sep=","` are
# forwarded to the handler (presumably: first column as index, comma as
# delimiter — TODO confirm against TabularDataHandler).
# NOTE: a second assignment via `TabularDataHandler.from_yaml(...)` used to
# follow here; it overwrote this handler and pointed a YAML loader at the
# .csv path, so it has been removed.
data = TabularDataHandler(data=CSV, index=0, sep=",")

# Remove the SMILES column from the working data; it is still read from
# `data.raw` later in the notebook.
data.drop(cols=["SMILES"])

### Correlation Analysis

Perform correlation analysis and remove features exceeding a correlation threshold of 90%:

In [None]:
from aixchem.analysis.correlation import CorrelationAnalysis

# Set up the correlation analysis on the current feature table.
correlation = CorrelationAnalysis(df=data.X)

# Drop every column returned by the filter at a threshold of 0.9.
data.drop(
    cols=correlation.filter(threshold=0.9),
)

# Last expression of the cell: displays the `matrix` attribute.
correlation.matrix

### Feature Selection

Perform two-sample feature selection and select the 33 features that cause the widest separation between positive and negative reference ligands:

In [None]:
from aixchem.data.fselect import TwoSampleFeatureSelection

# Two-sample feature selection between the positive and negative reference
# ligands, requesting the 33 best features.
fselect = TwoSampleFeatureSelection(
    df=data.X,
    idx=POSITIVE,
    idy=NEGATIVE,
    n_best=33,
    quantiles=(0.01, 0.99),
)

# Keep only the selected features: every other column is dropped.
data.drop(cols=[name for name in data.X.columns if name not in fselect.selection])

# Last expression of the cell: displays the selection.
fselect.selection

### Feature Scaling

Scale the features using scikit-learn's `StandardScaler` (`sklearn.preprocessing.StandardScaler`):

In [None]:
# Scale the features through the data handler and keep the returned object
# (presumably the fitted scaler, with `data.X` scaled in place — TODO confirm
# against TabularDataHandler.scale).
scaler = data.scale()

## Principal Component Analysis

Perform principal component analysis on the preprocessed dataset:

In [None]:
from aixchem.analysis.pca import PrincipalComponentAnalysis

# Run the PCA handler on the current feature table, requesting 4 components.
pca = PrincipalComponentAnalysis(df=data.X, n_components=4)

Display PCA summary:

In [None]:
# Cell output: the `summary` attribute of the PCA handler.
pca.summary

Display resulting principal components:

In [None]:
# Cell output: the `components` attribute of the PCA handler.
pca.components

## k-Means Clustering

Initialize k-means clustering handler:

In [None]:
from aixchem.analysis.kmeans import KMeansClustering

# Value passed as `n_init` to every k-means call in this notebook
# (optimize, run and statistics).
N_INIT = 5000

# Handler object that is reused for optimization, the final run and the
# statistics.
kmeans = KMeansClustering()

### Optimization

Perform optimization of the cluster number k:

In [None]:
# Evaluate the candidate cluster numbers k = 2..10 (range(2, 11)), passing
# N_INIT as n_init. The return value is kept so it can be displayed in a
# later cell.
per_sample_silhouette_scores = kmeans.optimize(df=data.X, ks=range(2, 11), n_init=N_INIT)

Display global optimization metrics:

In [None]:
# Cell output: the `metrics` attribute of the k-means handler.
kmeans.metrics

Display per-sample silhouette scores:

In [None]:
# Cell output: the value returned by `kmeans.optimize(...)`.
per_sample_silhouette_scores

### Results

Run k-means clustering with a cluster number of k=5 (determined from the optimization):

In [None]:
# Final clustering with k=5 on the preprocessed features, passing N_INIT as
# n_init.
kmeans.run(df=data.X, k=5, n_init=N_INIT)

Display the resulting clusters:

In [None]:
# Cell output: the `clusters` attribute of the k-means handler.
kmeans.clusters

Display the ligands that are stored in the same cluster as the positive reference:

In [None]:
# Select the entries of `kmeans.clusters` whose value equals the one stored
# for POSITIVE, then use their index to look up the "SMILES" column in
# `data.raw`.
# Assumes `kmeans.clusters` and `data.raw` share the same ligand-ID index —
# TODO confirm.
data.raw.loc[kmeans.clusters[kmeans.clusters == kmeans.clusters[POSITIVE]].index]["SMILES"]

### Statistical considerations

Perform a statistical evaluation of the clustering by rerunning the algorithm 1000 times with different random seeds, and display the resulting scores (number of times that a ligand was clustered with the positive reference divided by the total number of clusterings performed):

In [None]:
# Repeat the k=5 clustering n=1000 times with POSITIVE as the reference,
# passing N_INIT as n_init.
stats = kmeans.statistics(
    df=data.X,
    k=5,
    n=1000,
    refs=POSITIVE,
    n_init=N_INIT,
)

# Last expression of the cell: displays the returned statistics.
stats