In [1]:

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Single seed reused for the train/test split, the model constructors and the
# randomized hyperparameter searches, so that runs are repeatable.
random_state = 42


In [2]:
# Load the feature matrix and the target. Both CSV files are assumed to be
# already preprocessed; their first column is used as the row index.
X = pd.read_csv('X_formation_energy_union_features.csv', index_col=0)
# Keep only the first data column so that y is a 1D Series, not a DataFrame.
y = pd.read_csv('y_formation_energy.csv', index_col=0).iloc[:, 0]

# train_test_split pairs the rows of X and y purely by position, so a target
# file ordered differently from the feature file would silently attach the
# wrong label to every sample. Fail loudly instead.
if not X.index.equals(y.index):
    raise ValueError(
        'X and y must share the same index in the same order '
        '(got %d feature rows and %d target rows)' % (len(X), len(y))
    )

# Hold out 20% of the samples as a test set; the remaining 80% are used for
# cross-validated hyperparameter tuning and final training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

In [3]:
# Display the feature matrix (indexed by the first CSV column).
X

Unnamed: 0_level_0,mean neighbor distance variation,MagpieData mode NsValence,MagpieData maximum SpaceGroupNumber,MagpieData maximum MendeleevNumber,avg_dev local difference in CovalentRadius,compound possible,MagpieData range NdValence,MagpieData mean Column,mean local difference in Electronegativity,minimum local difference in GSbandgap,...,range local difference in SpaceGroupNumber,maximum CN_VoronoiNN,MagpieData range Column,MagpieData mean MeltingT,MagpieData maximum MeltingT,MagpieData mode NUnfilled,frac p valence electrons,MagpieData minimum GSbandgap,minimum local difference in Column,MagpieData mode NpUnfilled
jid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
JVASP-90856,0.064699,1.0,227.0,84.0,9.730249,False,10.0,11.000000,0.316376,0.000000,...,23.850694,10.623333,11.0,1518.942500,1941.0,1.0,0.147059,0.0,2.606183,0.0
JVASP-86097,0.200279,2.0,194.0,72.0,19.976881,False,0.0,11.571429,0.278830,0.353776,...,21.148365,24.598746,10.0,2253.285714,2348.0,5.0,0.200000,0.0,2.321363,5.0
JVASP-64906,0.011565,2.0,194.0,67.0,0.500000,False,7.0,5.000000,0.408989,0.000000,...,0.000000,11.982697,6.0,2258.250000,3306.0,0.0,0.000000,0.0,3.895137,0.0
JVASP-98225,0.071601,1.0,229.0,86.0,1.528927,False,10.0,8.000000,0.827830,0.000000,...,21.640334,12.260669,14.0,440.465000,544.4,1.0,0.100000,0.0,8.792858,0.0
JVASP-10,0.119720,2.0,229.0,89.0,7.657251,False,7.0,12.333333,0.487429,0.284273,...,112.248335,11.168806,11.0,1057.000000,2183.0,2.0,0.216216,0.0,3.913647,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
JVASP-156020,0.112164,1.0,225.0,81.0,14.515769,False,9.0,9.800000,0.531222,0.000000,...,0.000000,18.191019,11.0,1399.644000,2237.0,3.0,0.051948,0.0,4.447399,0.0
JVASP-156398,0.046326,2.0,194.0,76.0,9.002206,False,10.0,9.333333,0.357266,0.000000,...,0.000000,9.971808,10.0,824.560000,1204.0,0.0,0.022727,0.0,5.000000,0.0
JVASP-156099,0.109214,2.0,229.0,86.0,15.511391,False,10.0,11.600000,0.522216,0.000000,...,76.048553,16.143290,13.0,589.660000,1000.0,3.0,0.093023,0.0,3.533003,3.0
JVASP-156007,0.100676,2.0,227.0,78.0,9.547456,False,5.0,9.000000,0.272436,0.406256,...,10.277844,17.295015,11.0,2010.400000,2430.0,4.0,0.108108,0.0,4.674021,0.0


In [4]:
# Display the target Series (first data column of the target CSV).
y

jid
JVASP-90856    -0.42762
JVASP-86097    -0.41596
JVASP-64906     0.04847
JVASP-98225    -0.44140
JVASP-10       -0.71026
                 ...   
JVASP-156020   -0.30652
JVASP-156398   -0.34112
JVASP-156099   -0.39352
JVASP-156007   -0.54853
JVASP-156008   -0.28045
Name: formation_energy_peratom, Length: 71571, dtype: float64

# Random forest regressor (RFR)

In [None]:
# Hyperparameter space sampled by the randomized search.
param_grid = {
    'n_estimators': [300, 500, 700, 900],
    'max_depth': [100, 200, 300],
    'max_features': [0.4],
    'bootstrap': [False],
    'min_samples_split': [3, 5, 7, 9],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}

# Base model whose hyperparameters are tuned.
# NOTE(review): both the forest and the search request n_jobs=-1; the nested
# parallelism may oversubscribe the CPU -- consider n_jobs=1 on one of them.
rf = RandomForestRegressor(random_state=random_state, n_jobs=-1)

# Randomized search: 100 sampled configurations, each scored by 5-fold
# cross-validation on the training set only (negative MSE, higher is better).
search = RandomizedSearchCV(
    rf,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=random_state,
    n_iter=100,
)
search.fit(X_train, y_train)

# Persist the full cross-validation table for later inspection.
results = pd.DataFrame(search.cv_results_)
results.to_csv('rf_hyperparameter_tuning.csv')

# Summarize the best score and configuration.
print("Best: %f using %s" % (search.best_score_, search.best_params_))

# With the default refit=True the search has already refitted the best
# configuration on the whole training set, so best_estimator_ is ready to use
# and a second (expensive) call to fit() is unnecessary.
best_rf = search.best_estimator_

# Evaluate on the held-out test set using MAE, RMSE and R2.
y_pred_rf = best_rf.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_pred_rf))
# RMSE is computed as sqrt(MSE): the squared=False keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
print('RMSE:', mean_squared_error(y_test, y_pred_rf) ** 0.5)
print('R2:', r2_score(y_test, y_pred_rf))

# Save true vs. predicted values (indexed like y_test) for later plotting.
df_y_rf = pd.DataFrame({'y_test': y_test, 'y_pred_rf': y_pred_rf})
df_y_rf.to_csv('rf_y_test_y_pred.csv')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# Persist the tuned random forest so it can be reloaded without re-running
# the hyperparameter search.
joblib.dump(best_rf, 'rf_formation_energy_model.pkl')

# Only the last expression of a cell is rendered, so the estimator is placed
# last to actually display its configuration.
best_rf


# XGBoost regressor (XGB)

In [8]:
# Hyperparameter space sampled by the randomized search.
param_grid = {
    'n_estimators': [500, 600, 700],
    'max_depth': [10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.9],
    'colsample_bytree': [0.5, 0.7, 0.9],
}

# Number of distinct configurations in the grid. Asking RandomizedSearchCV for
# more iterations than that only triggers a warning and an exhaustive run, so
# n_iter is capped to the grid size.
n_candidates = 1
for candidate_values in param_grid.values():
    n_candidates *= len(candidate_values)
n_iter = min(100, n_candidates)

# Base model whose hyperparameters are tuned.
# NOTE(review): both the booster and the search request n_jobs=-1; the nested
# parallelism may oversubscribe the CPU -- consider n_jobs=1 on one of them.
xgb = XGBRegressor(
    random_state=random_state,
    n_jobs=-1,
)

# Randomized search: each sampled configuration is scored by 5-fold
# cross-validation on the training set only (negative MSE, higher is better).
search = RandomizedSearchCV(
    xgb,
    param_distributions=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
    n_iter=n_iter,  # number of parameter settings sampled
    verbose=3,
    random_state=random_state,
)
search.fit(X_train, y_train)

# Persist the full cross-validation table for later inspection.
results = pd.DataFrame(search.cv_results_)
results.to_csv('xgb_hyperparameter_tuning.csv')

# Summarize the best score and configuration.
print("Best: %f using %s" % (search.best_score_, search.best_params_))

# With the default refit=True the search has already refitted the best
# configuration on the whole training set, so best_estimator_ is ready to use
# and a second call to fit() is unnecessary.
best_xgb = search.best_estimator_

# Evaluate on the held-out test set using MAE, RMSE and R2.
y_pred_xgb = best_xgb.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_pred_xgb))
# RMSE is computed as sqrt(MSE): the squared=False keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
print('RMSE:', mean_squared_error(y_test, y_pred_xgb) ** 0.5)
print('R2:', r2_score(y_test, y_pred_xgb))

# Save true vs. predicted values (indexed like y_test) for later plotting.
df_y_xgb = pd.DataFrame({'y_test': y_test, 'y_pred_xgb': y_pred_xgb})
df_y_xgb.to_csv('xgb_y_test_y_pred.csv')


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.039 total time=  13.7s
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.038 total time=  13.8s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.032 total time=  14.0s
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.039 total time=  15.9s
[CV 5/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.039 total time=  17.3s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=10, n_estimators=600, subsample=0.9;, score=-0.030 total time=  21.1s
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=10, n_estimators=600, subsample=0.9;, score=-0.037 total time=  21.

python(42114) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 5/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=50, n_estimators=500, subsample=0.7;, score=-0.048 total time=11.7min


python(42115) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 4/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=500, n_estimators=200, subsample=0.7;, score=-0.039 total time= 5.9min


python(42118) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 3/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=50, n_estimators=500, subsample=0.7;, score=-0.045 total time=13.6min


python(42120) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 4/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=50, n_estimators=500, subsample=0.7;, score=-0.040 total time=13.7min
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=None, n_estimators=600, subsample=0.3;, score=-0.048 total time=   4.5s


python(42122) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 3/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=50, n_estimators=300, subsample=0.3;, score=-0.039 total time= 2.0min


python(42123) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 2/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=None, n_estimators=600, subsample=0.3;, score=-0.047 total time=   4.5s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=None, n_estimators=600, subsample=0.3;, score=-0.046 total time=   4.5s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=None, n_estimators=600, subsample=0.3;, score=-0.042 total time=   4.7s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=None, n_estimators=600, subsample=0.3;, score=-0.047 total time=   4.8s
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=500, n_estimators=200, subsample=0.7;, score=-0.047 total time= 4.1min


python(42159) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 4/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=50, n_estimators=300, subsample=0.3;, score=-0.033 total time= 2.0min
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=100, n_estimators=300, subsample=0.1;, score=-0.046 total time=  51.5s
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=100, n_estimators=300, subsample=0.1;, score=-0.046 total time=  50.9s
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=100, n_estimators=300, subsample=0.1;, score=-0.042 total time=  51.3s
[CV 5/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=100, n_estimators=300, subsample=0.1;, score=-0.042 total time=  50.9s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=100, n_estimators=300, subsample=0.1;, score=-0.039 total time=  51.3s
[CV 5/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=50, n_estimators=300, subsample=0.3;, score=-0.040 total time= 2.1min
[CV 1/5] END colsample_bytree=0.3, learning_rate=0

python(42313) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42314) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 4/5] END colsample_bytree=0.3, learning_rate=0.2, max_depth=5, n_estimators=300, subsample=1;, score=-0.046 total time=   2.2s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.2, max_depth=5, n_estimators=300, subsample=1;, score=-0.051 total time=   2.3s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=500, n_estimators=600, subsample=0.1;, score=-0.047 total time= 1.3min
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=500, n_estimators=600, subsample=0.1;, score=-0.046 total time= 1.3min
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=500, n_estimators=300, subsample=0.7;, score=-0.041 total time=10.4min
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.2, max_depth=100, n_estimators=200, subsample=0.7;, score=-0.052 total time= 1.6min
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=500, n_estimators=300, subsample=0.7;, score=-0.043 total time=10.5min
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.05, max_

python(42440) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 2/5] END colsample_bytree=0.5, learning_rate=0.2, max_depth=None, n_estimators=100, subsample=0.9;, score=-0.060 total time=   1.2s
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.2, max_depth=None, n_estimators=100, subsample=0.9;, score=-0.056 total time=   1.3s
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.2, max_depth=None, n_estimators=100, subsample=0.9;, score=-0.052 total time=   1.2s
[CV 2/5] END colsample_bytree=0.5, learning_rate=0.05, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.038 total time=  14.0s
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.2, max_depth=None, n_estimators=100, subsample=0.9;, score=-0.056 total time=   1.2s


python(42442) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 1/5] END colsample_bytree=0.5, learning_rate=0.05, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.038 total time=  15.7s
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.05, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.037 total time=  14.7s
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.05, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.031 total time=  15.3s
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.05, max_depth=10, n_estimators=500, subsample=0.5;, score=-0.038 total time=  16.6s
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=100, n_estimators=600, subsample=0.7;, score=-0.043 total time= 8.4min
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=100, n_estimators=600, subsample=0.7;, score=-0.042 total time= 7.9min
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.05, max_depth=100, n_estimators=600, subsample=0.7;, score=-0.041 total time= 7.6min
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.1

python(42637) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 4/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=None, n_estimators=500, subsample=1;, score=-0.043 total time=   4.6s
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=50, n_estimators=500, subsample=1;, score=-0.045 total time= 3.4min
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=50, n_estimators=500, subsample=1;, score=-0.047 total time= 3.3min
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=None, n_estimators=500, subsample=1;, score=-0.048 total time=   3.9s
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=50, n_estimators=500, subsample=1;, score=-0.040 total time= 3.4min
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.05, max_depth=100, n_estimators=300, subsample=0.7;, score=-0.036 total time= 8.9min
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=500, n_estimators=500, subsample=0.1;, score=-0.053 total time= 1.3min
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.05, max_dept



In [11]:
# Persist the tuned XGBoost model so it can be reloaded without re-running
# the hyperparameter search.
joblib.dump(best_xgb, 'xgb_formation_energy_model.pkl')

# Only the last expression of a cell is rendered, so the estimator is placed
# last to actually display its configuration.
best_xgb


# K-nearest neighbors regressor (KNN)

In [None]:
# TODO: pick up from here next time

# Parameter grid for the KNN step of the pipeline; the 'regressor__' prefix
# routes each entry to the pipeline step named 'regressor'.
# NOTE(review): 'algorithm' and 'leaf_size' should only change how neighbors
# are looked up (speed/memory), not which neighbors are found, so they make the
# grid ~16x larger without producing different models -- confirm and consider
# dropping them from the search.
param_grid = {
    'regressor__n_neighbors': [3, 5, 7, 9],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'regressor__leaf_size': [20, 30, 40, 50],
    'regressor__p': [1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

# Standardize the features before the KNN regressor. Because the scaler lives
# inside the pipeline it is re-fitted on the training part of every CV fold.
# NOTE(review): both the regressor and the search request n_jobs=-1; the nested
# parallelism may oversubscribe the CPU -- consider n_jobs=1 on one of them.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', KNeighborsRegressor(n_jobs=-1)),
])

# Exhaustive grid search, each configuration scored by 5-fold cross-validation
# on the training set only (negative MSE, higher is better).
search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
    verbose=3,
)
search.fit(X_train, y_train)

# Persist the full cross-validation table for later inspection.
results = pd.DataFrame(search.cv_results_)
results.to_csv('knn_hyperparameter_tuning.csv')

# Summarize the best score and configuration.
print("Best: %f using %s" % (search.best_score_, search.best_params_))

# With the default refit=True the search has already refitted the best
# pipeline on the whole training set, so best_estimator_ is ready to use and
# a second call to fit() is unnecessary.
best_knn = search.best_estimator_

# Evaluate on the held-out test set using MAE, RMSE and R2.
y_pred_knn = best_knn.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_pred_knn))
# RMSE is computed as sqrt(MSE): the squared=False keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
print('RMSE:', mean_squared_error(y_test, y_pred_knn) ** 0.5)
print('R2:', r2_score(y_test, y_pred_knn))

# Save true vs. predicted values (indexed like y_test) for later plotting.
df_y_knn = pd.DataFrame({'y_test': y_test, 'y_pred_knn': y_pred_knn})
df_y_knn.to_csv('knn_y_test_y_pred.csv')

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


python(42917) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42918) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42919) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42920) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42921) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42922) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42923) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42924) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42925) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42926) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(42927) Malloc

[CV 1/5] END regressor__algorithm=auto, regressor__leaf_size=20, regressor__n_neighbors=3, regressor__p=2, regressor__weights=uniform;, score=-0.082 total time=   2.9s
[CV 2/5] END regressor__algorithm=auto, regressor__leaf_size=20, regressor__n_neighbors=3, regressor__p=2, regressor__weights=uniform;, score=-0.082 total time=   3.0s
[CV 4/5] END regressor__algorithm=auto, regressor__leaf_size=20, regressor__n_neighbors=3, regressor__p=2, regressor__weights=uniform;, score=-0.074 total time=   2.8s
[CV 3/5] END regressor__algorithm=auto, regressor__leaf_size=20, regressor__n_neighbors=3, regressor__p=2, regressor__weights=uniform;, score=-0.077 total time=   2.9s
[CV 1/5] END regressor__algorithm=auto, regressor__leaf_size=20, regressor__n_neighbors=3, regressor__p=2, regressor__weights=distance;, score=-0.075 total time=   2.8s
[CV 5/5] END regressor__algorithm=auto, regressor__leaf_size=20, regressor__n_neighbors=3, regressor__p=2, regressor__weights=uniform;, score=-0.090 total time



In [16]:
# Display the best pipeline (scaler + KNN regressor) found by the grid search.
best_knn