In [23]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import numpy as np
import pickle
import os

# Custom scripts
from utils.dataset_utils import create_preprocessed_datasets
from inputs_perov_data_v2.dataset_inputs import list_of_nn_datasets_dict
from utils.custom_utils import set_global_random_seed

global_seed = 0
set_global_random_seed(0)

Global seed set to 0


## Load the dataset

In [24]:
nn_save_dir = '../runs/perovskite_multiscale_dataset_v2'
if os.path.exists(nn_save_dir + '/datasets'):
    print('Skipping creating dataset directory.')
else:
    nn_datasets_dict = list_of_nn_datasets_dict[0]
    test_split = None
    mode = 'train'
    kfolds = 5
    create_preprocessed_datasets(nn_save_dir, 
                                    nn_datasets_dict,
                                    global_seed,
                                    test_split=test_split,
                                    mode=mode,
                                    kfolds=kfolds)

descriptors = list(list_of_nn_datasets_dict[0]['train']['PSC_eff_v2']['variables'].keys())
target = descriptors[-1]
descriptors.pop(-1)

Skipping creating dataset directory.


'PCE'

## LASSO

In [5]:
lasso_results = []
lasso_train_cod = []
lasso_test_cod = []
# Concatenate all the dictionary values with the key specified as the descriptor
for i in range(kfolds):
    train_dataset = torch.load(f'{nn_save_dir}/datasets/train_dataset_fold_{i}.pt')
    val_dataset = torch.load(f'{nn_save_dir}/datasets/val_dataset_fold_{i}.pt')

    X_train = torch.cat([train_dataset[:][descriptor] for descriptor in descriptors], dim=1).detach().numpy()
    y_train = train_dataset[:][target].detach().numpy().ravel()
    X_test = torch.cat([val_dataset[:][descriptor] for descriptor in descriptors], dim=1).detach().numpy()
    y_test = val_dataset[:][target].detach().numpy().ravel()

    lasso = Lasso(random_state=global_seed)
    search = GridSearchCV(lasso,{'alpha':np.arange(0.001,0.1,0.001)}, scoring="neg_mean_absolute_error")

    search.fit(X_train, y_train)
    lasso_parameters = search.best_params_
    coefficients = search.best_estimator_.coef_

    lasso_results.append(search)
    print(lasso_parameters)
    print(coefficients)
    lasso_train_cod.append(search.score(X_train, y_train))
    lasso_test_cod.append(search.score(X_test, y_test))
    print(f'Fold {i}')
    print(f'Train R^2 : {lasso_train_cod[-1]}')
    print(f'Test R^2 : {lasso_test_cod[-1]}')
    
# Save search to pickle
pickle.dump(lasso_results, open(f'{nn_save_dir}/lasso_results.pkl', 'wb'))
pickle.dump(lasso_train_cod, open(f'{nn_save_dir}/lasso_train_cod.pkl', 'wb'))
pickle.dump(lasso_test_cod, open(f'{nn_save_dir}/lasso_test_cod.pkl', 'wb'))

{'alpha': 0.017}
[ 1.61717631e+00  3.32099175e-02  0.00000000e+00 -5.31447559e-02
 -0.00000000e+00  0.00000000e+00  2.21938199e-01 -2.21568666e+00
 -0.00000000e+00  6.17198810e-02  2.13807178e+00  3.81043781e-05
 -0.00000000e+00 -4.43501243e-04 -0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.36399251e+00
  0.00000000e+00  6.09655303e-01  1.94417583e+00 -0.00000000e+00
  0.00000000e+00 -1.85304640e-01  1.18441418e+00  2.18728477e+00
 -2.18009688e-01 -3.28641855e-01 -1.11336919e+00 -2.63491451e+00
  3.90516275e-04  4.04922582e-01]
Train R^2 : -12.45765265577068
Test R^2 : -11.614273449360475
{'alpha': 0.012}
[ 1.01713116e+00  0.00000000e+00  0.00000000e+00 -6.46406174e-01
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -2.46046739e+00
  0.00000000e+00  7.03672087e-02  2.03185365e+00  1.33107017e-04
 -0.00000000e+00 -8.71222690e-04 -0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.16070061e+00
  0.00000000e+00  7.75670

  model = cd_fast.enet_coordinate_descent(


{'alpha': 0.014000000000000002}
[ 1.31934815e+00  1.55580010e-01  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00 -2.40348424e+00
  2.51375603e-03  7.00197438e-02  2.11721600e+00  0.00000000e+00
 -0.00000000e+00 -4.90059708e-04 -0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.92479918e+00
 -0.00000000e+00  5.38541816e-01  2.22415642e+00 -4.80398126e-01
  0.00000000e+00 -0.00000000e+00  1.42044565e+00  2.25651265e+00
 -4.83281944e-01 -1.77597243e-01 -2.08151101e+00 -2.89393091e+00
  8.10250639e-02  2.27788432e-01]
Train R^2 : -11.742055479473951
Test R^2 : -13.889126751710364
{'alpha': 0.014000000000000002}
[ 1.24544457e+00  1.54505619e-01  0.00000000e+00 -5.58792260e-02
 -0.00000000e+00  0.00000000e+00  1.99635416e-01 -2.07565589e+00
 -0.00000000e+00  5.95049626e-02  2.09845300e+00  4.30555638e-05
 -0.00000000e+00 -6.44501544e-04 -0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.7889174

In [16]:
gbregressor_results = []
gbregressor_train_cod = []
gbregressor_test_cod = []
for i in range(kfolds):
    train_dataset = torch.load(f'{nn_save_dir}/datasets/train_dataset_fold_{i}.pt')
    val_dataset = torch.load(f'{nn_save_dir}/datasets/val_dataset_fold_{i}.pt')

    X_train = torch.cat([train_dataset[:][descriptor] for descriptor in descriptors], dim=1).detach().numpy()
    y_train = train_dataset[:][target].detach().numpy().ravel()
    X_test = torch.cat([val_dataset[:][descriptor] for descriptor in descriptors], dim=1).detach().numpy()
    y_test = val_dataset[:][target].detach().numpy().ravel()

    gbregressor = GradientBoostingRegressor(random_state=global_seed, n_estimators=200, learning_rate=0.01, max_depth=20, subsample=0.5, loss='absolute_error')
    gbregressor.fit(X_train, y_train)

    gbregressor_results.append(gbregressor)
    gbregressor_train_cod.append(gbregressor.score(X_train, y_train))
    gbregressor_test_cod.append(gbregressor.score(X_test, y_test))
    print(f'Fold {i}')
    print(f'Train R^2 : {gbregressor_train_cod[-1]}')
    print(f'Test R^2 : {gbregressor_test_cod[-1]}')
    print(f'Train Loss : {gbregressor.train_score_[-1]}')
    print(f'Test Loss : {gbregressor.loss_(y_test, gbregressor.predict(X_test))}')
    print('\n')

# Save search to pickle
pickle.dump(gbregressor_results, open(f'{nn_save_dir}/gbregressor_results.pkl', 'wb'))
pickle.dump(gbregressor_train_cod, open(f'{nn_save_dir}/gbregressor_train_cod.pkl', 'wb'))
pickle.dump(gbregressor_test_cod, open(f'{nn_save_dir}/gbregressor_test_cod.pkl', 'wb'))

Fold 0
Train R^2 : 0.8113484588667679
Test R^2 : 0.5636766075954437
Train Loss : 1.4152355185354097
Test Loss : 2.2245287252821777






Fold 1
Train R^2 : 0.7958265124058295
Test R^2 : 0.6102684810675147
Train Loss : 1.4859802795263821
Test Loss : 2.6921415598171397






Fold 2
Train R^2 : 0.799960167145499
Test R^2 : 0.5399171933392033
Train Loss : 1.3713412793015893
Test Loss : 2.4637450316965452






Fold 3
Train R^2 : 0.7968480786950437
Test R^2 : 0.6274673249957221
Train Loss : 1.5659464599253106
Test Loss : 2.2675800268635204






Fold 4
Train R^2 : 0.7958182205413917
Test R^2 : 0.5436314756311713
Train Loss : 1.420663238707411
Test Loss : 2.8351282842667263






In [22]:
svr_results = []
svr_train_cod = []
svr_test_cod = []

for i in range(kfolds):
    train_dataset = torch.load(f'{nn_save_dir}/datasets/train_dataset_fold_{i}.pt')
    val_dataset = torch.load(f'{nn_save_dir}/datasets/val_dataset_fold_{i}.pt')

    X_train = torch.cat([train_dataset[:][descriptor] for descriptor in descriptors], dim=1).detach().numpy()
    y_train = train_dataset[:][target].detach().numpy().ravel()
    X_test = torch.cat([val_dataset[:][descriptor] for descriptor in descriptors], dim=1).detach().numpy()
    y_test = val_dataset[:][target].detach().numpy().ravel()

    svr = SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1)
    svr.fit(X_train, y_train)

    svr_results.append(svr)
    svr_train_cod.append(svr.score(X_train, y_train))
    svr_test_cod.append(svr.score(X_test, y_test))
    print(f'Fold {i}')
    print(f'Train R^2 : {svr_train_cod[-1]}')
    print(f'Test R^2 : {svr_test_cod[-1]}')
    print(f'Train Loss : {mean_absolute_error(y_train, svr.predict(X_train))}')
    print(f'Test Loss : {mean_absolute_error(y_test, svr.predict(X_test))}')
    print('\n')

# Save search to pickle
pickle.dump(svr_results, open(f'{nn_save_dir}/svr_results.pkl', 'wb'))
pickle.dump(svr_train_cod, open(f'{nn_save_dir}/svr_train_cod.pkl', 'wb'))
pickle.dump(svr_test_cod, open(f'{nn_save_dir}/svr_test_cod.pkl', 'wb'))

Fold 0
Train R^2 : 0.5754964097034966
Test R^2 : 0.4732361949694095
Train Loss : 2.4476897533866713
Test Loss : 2.3208716261330355


Fold 1
Train R^2 : 0.5765299746761042
Test R^2 : 0.5877961706661152
Train Loss : 2.3324877670448836
Test Loss : 2.7645098891761033


Fold 2
Train R^2 : 0.5879285966830929
Test R^2 : 0.5005570609479452
Train Loss : 2.354083686978085
Test Loss : 2.5604169334972706


Fold 3
Train R^2 : 0.5907766553442368
Test R^2 : 0.5499663185503176
Train Loss : 2.3734653750412993
Test Loss : 2.4413497878293984


Fold 4
Train R^2 : 0.5724630396864618
Test R^2 : 0.4830237154156276
Train Loss : 2.340109636681176
Test Loss : 2.9641579471351416


