# SVR training

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import sklearn as sk
import sklearn.metrics
import sklearn.model_selection 
import sklearn.svm

In [2]:
# constants
# Number of parallel workers; passed as n_jobs to the hyperparameter grid search.
PARA_JOBS = 8
# Number of log-spaced candidate values generated for each searched
# hyperparameter (C and epsilon), so the full grid has NUM_SEARCH_PARAMS**2 points.
NUM_SEARCH_PARAMS = 10

In [3]:
def custom_huber_loss(y_true, y_pred, delta=1.0): # as implemented in https://github.com/astroML/astroML
    """Return the negated, normalised Huber loss of a set of predictions.

    Each absolute residual ``t = |y_true - y_pred|`` contributes
    ``0.5 * t**2`` when ``t <= delta`` and ``delta * (t - 0.5 * delta)``
    otherwise.  The summed contributions are divided by
    ``sum(0.5 * (y_true - mean(y_true))**2)`` and the sign is flipped, so the
    result is 0 for a perfect prediction and more negative for worse ones
    (i.e. larger is better).

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.
    delta : float, default 1.0
        Residual size at which the loss switches from quadratic to linear.

    Returns
    -------
    float
        The negated normalised loss.  If every entry of ``y_true`` is equal,
        the denominator is zero and the result is not finite.
    """
    # Coerce to arrays so plain Python lists (and pandas objects) are accepted.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    residual = np.abs(y_true - y_pred)
    is_large = residual > delta
    per_sample = np.where(is_large,
                          delta * (residual - 0.5 * delta),
                          0.5 * residual * residual)

    deviation = y_true - np.mean(y_true)
    return -(np.sum(per_sample) / np.sum(0.5 * deviation * deviation))

In [4]:
# read training and test dataset
# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# file must come from a trusted source.
# The with-block guarantees the file handle is closed even if loading fails.
with open('./dataset/raw_data.pickle', 'rb') as raw_data_file:
    X_trainvalid, X_test, y_trainvalid, y_test = pickle.load(raw_data_file)

In [5]:
# read full dataset
# Open read-only (the default mode 'a' would create/modify the file) and use a
# context manager so the store is closed even if the 'df_all' key is missing.
with pd.HDFStore('./dataset/data_train.h5', mode='r') as store:
    df_dataset = store['df_all']

In [None]:
# train model
# custom_huber_loss already returns the negated loss (larger is better), so the
# scorer is built with greater_is_better=True and must not flip the sign again.
huber_score = sklearn.metrics.make_scorer(custom_huber_loss, greater_is_better=True)

# NUM_SEARCH_PARAMS log-spaced candidates for each of the two SVR hyperparameters.
params = {
    'C': np.logspace(-2, 3, NUM_SEARCH_PARAMS),
    'epsilon': np.logspace(-3, 2, NUM_SEARCH_PARAMS),
}

# Exhaustive 5-fold cross-validated search over the C x epsilon grid.
grid_search = sk.model_selection.GridSearchCV(sk.svm.SVR(), params, cv=5, scoring=huber_score,
                                              refit=True, return_train_score=True,
                                              n_jobs=PARA_JOBS, verbose=0)
grid_search.fit(X_trainvalid, y_trainvalid)

# With refit=True the search has already retrained an SVR with the best
# hyperparameters on the full training/validation set; reuse it instead of
# fitting an identical model a second time.
model = grid_search.best_estimator_
print('best hyperparameters')
print(grid_search.best_params_)

In [None]:
# predict
# Run the fitted model on the training/validation features first, then on the
# held-out test features.
pred_trainvalid, pred_test = (
    model.predict(features) for features in (X_trainvalid, X_test)
)

In [None]:
# create DataFrame for analyses
# Test-set predictions, labelled with the index of X_test
# (assumes X_test is a pandas object carrying an index -- TODO confirm).
df_temp = pd.DataFrame(pred_test, index=X_test.index, columns=['pred_svr'])

# Attach the full-dataset columns for exactly the rows in df_temp.
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the other
# frame onto df_temp's index is the documented replacement.
df_dataset_pred_test = pd.concat([df_temp, df_dataset.reindex(df_temp.index)], axis=1)

# Signed error: prediction minus the reference column.
df_dataset_pred_test['ml_error'] = (
    df_dataset_pred_test['pred_svr'] - df_dataset_pred_test['PG18PI17_e_electronic_ave']
)

In [None]:
# Fig 1
# Scatter of predicted vs. reference values on a square plot with identical
# fixed ranges on both axes.
ax = plt.gca()
ax.set_xlim([0, 22])
ax.set_ylim([0, 22])
ax.set_aspect('equal', adjustable='box')
plt.scatter(
    df_dataset_pred_test['PG18PI17_e_electronic_ave'],  # x: reference value
    df_dataset_pred_test['pred_svr'],                   # y: SVR prediction
    s=10,
)

In [None]:
# Table II
# Regression metrics (rows) evaluated on the training and test sets (columns).
def _rmse(y1, y2):
    """Return the square root of the (absolute) mean squared error."""
    return np.sqrt(abs(sk.metrics.mean_squared_error(y1, y2)))

fs = [
    sk.metrics.r2_score,
    sk.metrics.mean_absolute_error,
    sk.metrics.mean_squared_error,
    _rmse,
]
metric_labels = ['$r^2$', 'MAE', 'MSE', 'RMSE']

# (true values, predictions) for each column, in column order.
split_labels = ['training', 'test']
split_pairs = [(y_trainvalid, pred_trainvalid), (y_test, pred_test)]

pd.DataFrame(
    [[metric(truth, prediction) for truth, prediction in split_pairs] for metric in fs],
    index=metric_labels,
    columns=split_labels,
)

In [None]:
# Table III
# The ten test rows with the largest absolute prediction error, with columns
# renamed to display labels.
# Source column -> display label.  Raw strings are used where the label
# contains a backslash so that '\D' is not parsed as an (invalid) escape.
table3_labels = {
    'MP_pretty_formula': 'formula',
    'pred_svr': '$ε_{SVR}$',
    'PG18PI17_e_electronic_ave': '$ε_{Dataset}$',
    'ml_error': r'$\Deltaε_{Dataset}$',
    'MP_e_above_hull': r'$\Delta E_{hull}$ (meV)',
}

# Positional order of rows by decreasing |ml_error|.  Sorting the raw ndarray
# keeps every position valid for .iloc (Series.argsort marks NaN rows as -1).
worst_first = np.argsort(-df_dataset_pred_test['ml_error'].abs().values)

# rename(columns=...) replaces set_axis(..., inplace=False): the `inplace`
# keyword was removed from set_axis in pandas 2.0, while rename behaves the
# same across pandas versions.
(df_dataset_pred_test.iloc[worst_first][list(table3_labels)]
 .head(10)
 .rename(columns=table3_labels)
 .reset_index(drop=True))