In [44]:
!python -m pip install dscribe nglview


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [99]:
from functools import partial

import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import cloudpickle
import json

# plotting
import matplotlib.pyplot as plt

import importlib as imp
import sklearn_utils
imp.reload(sklearn_utils)
from sklearn_utils import run_regressor_nested_cv

import ase
import ase.io

import dscribe

In [100]:
data = pd.read_excel("data/dataset_pp.2022-12-09.xlsx", index_col=0)


def _folders_missing_poscar(paths):
    """Return, as a list, every folder in `paths` that has no entry named POSCAR."""
    missing = []
    for candidate in paths:
        if not os.path.exists(os.path.join(candidate, "POSCAR")):
            missing.append(candidate)
    return missing


# Folders without a POSCAR get their POSCAR.vasp renamed to POSCAR.
for folder in _folders_missing_poscar(data["path"]):
    source_file = os.path.join(folder, "POSCAR.vasp")
    target_file = os.path.join(folder, "POSCAR")
    os.rename(source_file, target_file)

# Displayed as the cell result: folders that still lack a POSCAR after the renames.
_folders_missing_poscar(data["path"])

[]

In [101]:
# Reload the dataset and attach one ASE structure per row, read from the POSCAR
# file inside the folder named in the "path" column.
data = pd.read_excel("data/dataset_pp.2022-12-09.xlsx", index_col=0)
data["structure"] = [ase.io.read(os.path.join(p, "POSCAR")) for p in data["path"]]

# The "System" label is split on "-" into four fields. expand=True yields the
# columns directly and keeps the index of `data`; going through
# dict(zip(index, values)) instead would silently collapse rows that share an
# index label and leave the class labels misaligned with the fingerprints.
extra_features = data["System"].str.split("-")
extra_df = data["System"].str.split("-", expand=True)
if extra_df.shape[1] != 4 or extra_df.isna().any().any():
    raise ValueError('every "System" label must have exactly four "-"-separated fields')
extra_df.columns = ["...", "sys", "pos", "ads"]
# Drop the first field; only "sys", "pos" and "ads" are kept.
extra_df = extra_df.iloc[:, 1:]
# Adsorbate name per row, and the same information as integer category codes.
ads_names = extra_df["ads"]
ads_cats = ads_names.astype("category").cat.codes


from dscribe.descriptors import SOAP
# Descriptor applied to the Mg-only substrate (small cutoff and basis).
soap_sub = SOAP(species=["Mg"], periodic=True, r_cut=2, n_max=2, l_max=2,
                average="outer")

# Descriptor applied around the adsorbate; covers C, H, O and Mg with a larger
# cutoff and basis than soap_sub.
soap_ads = SOAP(species=["C", "H", "O", "Mg"], periodic=True, r_cut=4.0, n_max=4, l_max=3,
                average="outer")


In [102]:
allfp_sub, allfp_ads = [], []

for s in data["structure"].to_list():
    # Mg atoms form the substrate; every other atom belongs to the adsorbate.
    substrate_index = np.array(s.get_chemical_symbols()) == "Mg"
    substrate = s[substrate_index]
    adsorbant = s[~substrate_index]

    # Smallest z coordinate among the substrate atoms.
    # NOTE(review): this is the lowest Mg atom, not the topmost layer -- confirm
    # that min() (rather than max()) is the intended reference height.
    sub_z = substrate.positions[:, 2].min()

    # Position of the first carbon atom (atomic number 6) of the adsorbate.
    carbon_mask = adsorbant.numbers == 6
    ads_C = adsorbant.positions[carbon_mask][0]
    ads_C_z = ads_C[2]

    # Reference point: x/y of that carbon, z lowered by (carbon z - sub_z).
    ads_ref = ads_C.copy()
    ads_ref[2] -= ads_C_z - sub_z

    # Adsorbate plus a single Mg atom placed at the reference point.
    ads_ref_atom = ase.Atoms("Mg", positions=[ads_ref])
    ads_fp_sys = adsorbant + ads_ref_atom

    # Fingerprint of the substrate alone.
    fp_sub = soap_sub.create([substrate])
    # Alternative kept for reference: full structure fingerprint around the adsorbate
    #fp_sub = soap_ads.create([s], [[ads_ref,]])
    # Fingerprint of the adsorbate + reference atom system, centred on the reference point.
    fp_ads = soap_ads.create([ads_fp_sys], [[ads_ref]])

    allfp_sub.append(fp_sub)
    allfp_ads.append(fp_ads)

allfp_sub = np.array(allfp_sub)
allfp_ads = np.array(allfp_ads)

# Substrate and adsorbate fingerprints side by side, one row per structure.
total_fp = np.hstack((allfp_sub, allfp_ads))
target = data["E_ads"]
print()
from ase.visualize import view
# Sanity check: carbon position and reference point of the last structure, plus array shapes.
print(ads_C, ads_ref, allfp_sub.shape, allfp_ads.shape, total_fp.shape)


[ 3.00807222  7.48893484 14.17360878] [ 3.00807222  7.48893484 10.87250042] (116, 9) (116, 544) (116, 553)


In [111]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

from sklearn.decomposition import PCA

# 20-component PCA projection of the combined fingerprints.
# random_state is pinned so the projection is reproducible across kernel
# restarts: PCA's default svd_solver="auto" may select the randomized solver,
# whose result depends on the seed.
# NOTE(review): total_fp_pca is not used by the call below, which receives the
# raw total_fp. If it is ever passed to the regressor, fit the PCA inside the
# cross-validation folds; fitting on all samples here would leak test data.
total_fp_pca = PCA(n_components=20, random_state=0).fit_transform(total_fp)

# Kernel ridge regression with an RBF kernel. alpha and gamma are each searched
# over 20 log-spaced values between 1e-3 and 1e4. run_regressor_nested_cv comes
# from the local sklearn_utils module; the meaning of its keyword arguments is
# defined there.
res = run_regressor_nested_cv(total_fp, target, 
                        partial(KernelRidge), dict(
                         alpha = np.logspace(-3, 4, num=20),
                         kernel= ["rbf"],
                         gamma = np.logspace(-3, 4, num=20),
                        ),
                        sample_class=ads_cats,
                        view_class=ads_names,
                        test_split=0.2,
                        scaler=None, name="SOAP_KRR",
                        pp_kws = dict(
                            min_max = (-0.8, 0.8))
                       )
print(res)

{'train_mae': ('0.1013', '0.0061'), 'test_mae': ('0.1330', '0.0297'), 'train_mse': ('0.0342', '0.0051'), 'test_mse': ('0.0551', '0.0181'), 'train_r2': ('0.4841', '0.0649'), 'test_r2': ('0.2476', '0.2390')}


<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

In [112]:

from sklearn.ensemble import RandomForestRegressor

# 20-component PCA projection of the combined fingerprints.
# random_state is pinned so the projection is reproducible across kernel
# restarts: PCA's default svd_solver="auto" may select the randomized solver,
# whose result depends on the seed.
# NOTE(review): total_fp_pca is not used by the call below, which receives the
# raw total_fp. If it is ever passed to the regressor, fit the PCA inside the
# cross-validation folds; fitting on all samples here would leak test data.
total_fp_pca = PCA(n_components=20, random_state=0).fit_transform(total_fp)

# Random forest (seeded with random_state=0). The grid covers five log-spaced
# integer tree counts from 10 up to about 3162, four depth limits and three
# min_samples_split values.
# NOTE(review): sample_class is commented out here, unlike the other model
# cells in this notebook -- confirm that this difference is intentional.
res = run_regressor_nested_cv(total_fp, target, 
                        partial(RandomForestRegressor, random_state=0), dict(
                            n_estimators = np.logspace(1, 3.5, num=5).astype(int),
                            max_depth = [None, 5, 10, 20],
                            min_samples_split = [2, 5, 10]
                        ),
                        #sample_class=ads_cats,
                        view_class=ads_names,
                        test_split=0.2,
                        scaler=None, name="SOAP_RF",
                        pp_kws = dict(
                            min_max = (-0.8, 0.8))
                       )
print(res)

{'train_mae': ('0.0702', '0.0060'), 'test_mae': ('0.1061', '0.0368'), 'train_mse': ('0.0187', '0.0026'), 'test_mse': ('0.0461', '0.0329'), 'train_r2': ('0.7043', '0.0748'), 'test_r2': ('0.3726', '0.1916')}


<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

In [113]:
import sklearn_utils
imp.reload(sklearn_utils)
from sklearn_utils import run_regressor_nested_cv, run_regressor_manual
from sklearn.ensemble import GradientBoostingRegressor

## FULL DATA GBR
# Hyper-parameter grid for the gradient boosting model: five log-spaced integer
# tree counts from 10 up to about 316, three depths and three
# min_samples_split values.
gbr_param_grid = {
    "n_estimators": np.logspace(1, 2.5, num=5).astype(int),
    "max_depth": [1, 5, 6],
    "min_samples_split": [2, 3, 4],
}

# Seeded gradient boosting on the combined fingerprints, with StandardScaler
# handed to the helper as the scaler. run_regressor_nested_cv comes from the
# local sklearn_utils module, reloaded above to pick up edits.
res = run_regressor_nested_cv(
    total_fp,
    target,
    partial(GradientBoostingRegressor, random_state=0),
    gbr_param_grid,
    sample_class=ads_cats,
    view_class=ads_names,
    scaler=StandardScaler,
    name="SOAP_GB_base",
    test_split=0.2,
    pp_kws={"min_max": (-0.8, 0.8)},
)
print(res)

{'train_mae': ('0.0738', '0.0100'), 'test_mae': ('0.1497', '0.0324'), 'train_mse': ('0.0175', '0.0046'), 'test_mse': ('0.0706', '0.0252'), 'train_r2': ('0.7358', '0.0635'), 'test_r2': ('0.0521', '0.2450')}


<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>