## Testing for individual systems.
In the following code cells we start from the CSV file of a particular system ('Ag2O'), which contains the 
compound name of each entry. We then shuffle the rows and split them into an 80/20 train/test dataset. We 
get the parameters of the best-performing Kernel Ridge Regression (KRR) model; different kernels can be used 
(we used the Laplacian kernel in our case). We then reshuffle to get a new 80/20 train/test split and use that 
best-performing model to test on the new split. We repeat that step 25 times (the loop count used in `best_fit` 
below). We first start by importing the required modules. If you don't have some or all of the modules below, 
try "pip install pandas" (and so on for each missing module) in the terminal.
We end up with train- and test-set MAEs corresponding to various test sets across the same system. 

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
#
import tarfile
import re
import sys
import random 
import operator 
import os
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
def mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    The two arguments are subtracted with ``-``, so they must support
    element-wise subtraction (e.g. NumPy arrays or plain numbers).
    """
    abs_errors = np.abs(y_true - y_pred)
    return np.mean(abs_errors)

def root_mean_squared_error(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The two arguments are subtracted with ``-``, so they must support
    element-wise subtraction (e.g. NumPy arrays or plain numbers).
    """
    residual = y_true - y_pred
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

def string_number_separator(string):
    """Split the leading ``<letters><digits>`` token off ``string``.

    Only the start of ``string`` is examined; anything after the first run
    of digits is ignored.

    Args:
        string: text expected to begin with one or more ASCII letters
            immediately followed by one or more digits, e.g. ``"Ag2O1"``.

    Returns:
        A ``(letters, digits)`` tuple of strings, e.g. ``("Ag", "2")``.

    Raises:
        ValueError: if ``string`` does not start with letters followed by
            digits (previously this surfaced as an ``AttributeError`` on
            ``None``).
    """
    match = re.match(r"([a-zA-Z]+)([0-9]+)", string)
    if match is None:
        raise ValueError(
            "expected letters followed by digits at the start of %r" % (string,)
        )
    return match.groups()

def list_of_atoms_and_numbers(string):
    """Break a formula such as ``"Ag2O1"`` into symbols and counts.

    The formula is consumed left to right, one ``<letters><digits>`` token
    at a time, using ``string_number_separator`` on the unread remainder.

    Returns:
        A pair ``(atoms, atoms_number)``: the list of letter groups (as
        ``str``) and the list of the matching counts (as ``int``), in the
        order they appear in ``string``.
    """
    formula = str(string)
    symbols, counts = [], []
    consumed = 0  # number of leading characters already parsed
    while consumed < len(string):
        symbol, count = string_number_separator(formula[consumed:])
        consumed += len(str(symbol)) + len(str(count))
        symbols.append(str(symbol))
        counts.append(int(count))
    return symbols, counts

def input_column_creater(atom_list,prop):
    """Build the list of feature-column names for a set of atoms.

    For every entry of ``atom_list`` (in order) one name per orbital label
    is generated as ``<orbital><atom>`` (e.g. ``"1sAg"``); the entries of
    ``prop`` are then appended, converted to ``str``.

    Returns:
        A flat list of column-name strings.
    """
    orbitals = ('1s', '2s', '2p', '3s', '3p', '3d', '4s',
                '4p', '4d', '5s', '5p', '5d', '6s', '6p')
    columns = [orbital + str(atom) for atom in atom_list for orbital in orbitals]
    columns.extend(str(extra) for extra in prop)
    return columns

def train_model(x_train, y_train, x_test,y_test, nfold, nthread, in_alpha, in_gamma, kernel=None, rseed=None):
    """Grid-search a Kernel Ridge Regression model and report its MAE.

    A ``GridSearchCV`` over ``alpha`` and ``gamma`` is fitted on the training
    data, scored with the negated RMSE, and then used to predict both the
    training and the test set.

    Args:
        x_train, y_train: training features and targets.
        x_test, y_test: held-out features and targets.
        nfold: passed to ``GridSearchCV`` as ``cv``.
        nthread: passed to ``GridSearchCV`` as ``n_jobs``.
        in_alpha: candidate values for the ``alpha`` hyper-parameter.
        in_gamma: candidate values for the ``gamma`` hyper-parameter.
        kernel: forwarded unchanged to ``KernelRidge(kernel=...)``.
        rseed: optional seed; when not ``None`` it seeds both ``random`` and
            ``np.random`` before the search.

    Returns:
        ``(clf, train_err, test_err)``: the fitted ``GridSearchCV`` object
        and the mean absolute errors on the training and test sets.
    """
    # `is not None` rather than truthiness so that a seed of 0 is honoured.
    if rseed is not None:
        random.seed(rseed)
        np.random.seed(rseed)

    # RMSE is an error, so it is wrapped with greater_is_better=False.
    neg_root_mean_squared_error = make_scorer(root_mean_squared_error, greater_is_better=False)
    clf = GridSearchCV(
        KernelRidge(kernel=kernel),
        cv=nfold,
        n_jobs=nthread,
        verbose=1,
        scoring=neg_root_mean_squared_error,
        param_grid={"alpha": in_alpha, "gamma": in_gamma},
    )
    clf.fit(x_train, y_train)

    y_train_pred = clf.predict(x_train)
    y_test_pred = clf.predict(x_test)
    train_err = mae(y_train_pred, y_train)
    test_err = mae(y_test_pred, y_test)
    return clf,train_err,test_err

def predict_value(clf,x_test,y_test):
    """Return the mean absolute error of ``clf``'s predictions on a test set.

    ``clf`` only needs a ``predict`` method; the error is computed with the
    module's ``mae`` helper against ``y_test``.
    """
    return mae(clf.predict(x_test), y_test)

def modified_pettifor(in_atom):
    """Return the normalised modified-Pettifor rank of an element symbol.

    Each symbol is assigned its 1-based position in the ordering below and
    the result is that position divided by the largest position (103), so
    values lie in ``(0, 1]``.

    Raises:
        KeyError: if ``in_atom`` is not one of the listed symbols.
    """
    # Symbols listed in rank order; the rank of a symbol is its 1-based index.
    ordering = (
        "He Ne Ar Kr Xe Rn Fr Cs Rb K "   # 1-10
        "Na Li Ra Ba Sr Ca Eu Yb Lu Tm "  # 11-20
        "Y Er Ho Dy Tb Gd Sm Pm Nd Pr "   # 21-30
        "Ce La Ac Th Pa U Np Pu Am Cm "   # 31-40
        "Bk Cf Es Fm Md No Lr Sc Zr Hf "  # 41-50
        "Ti Ta Nb V Cr Mo W Re Tc Os "    # 51-60
        "Ru Ir Rh Pt Pd Au Ag Cu Ni Co "  # 61-70
        "Fe Mn Mg Zn Cd Hg Be Al Ga In "  # 71-80
        "Tl Pb Sn Ge Si B C N P As "      # 81-90
        "Sb Bi Po Te Se S O At I Br "     # 91-100
        "Cl F H"                          # 101-103
    ).split()

    mod_scale = {symbol: rank for rank, symbol in enumerate(ordering, start=1)}

    # The largest rank equals the number of symbols in the ordering.
    return mod_scale[in_atom] / float(len(ordering))

def compound_pettifor(compound):
    """Return the composition-weighted normalised Pettifor value of a compound.

    Only the part of ``compound`` before the first ``'-'`` is parsed. It is
    split into symbols and counts with ``list_of_atoms_and_numbers``; each
    symbol's ``modified_pettifor`` value is weighted by its share of the
    total atom count. A single-symbol formula returns that symbol's value
    directly.
    """
    formula = compound.split('-')[0]
    symbols, counts = list_of_atoms_and_numbers(formula)

    # Single-symbol formula: no weighting needed.
    if len(symbols) == 1:
        return modified_pettifor(symbols[0])

    total_atoms = sum(counts)
    weighted = 0
    for symbol, count in zip(symbols, counts):
        weighted += (count / total_atoms) * modified_pettifor(symbol)
    return weighted
                

In [None]:
def best_fit(csv_path, n_repeats=25, rseed=None):
    """Select KRR hyper-parameters on one split, then re-test them on new splits.

    The CSV is read, a composition-weighted Pettifor feature (``'pet'``) is
    added from the ``'compound'`` column, and a grid search over ``alpha`` and
    ``gamma`` is run (via ``train_model``) on an initial 80/20 split. The
    selected hyper-parameters are then kept fixed: for each of ``n_repeats``
    fresh 80/20 splits a ``KernelRidge`` with those parameters is fitted on
    the new training part and its MAE on the new test part is recorded.

    Args:
        csv_path: path of the CSV file. It must provide the ``'compound'``
            column, the orbital columns ``'1s'`` ... ``'6p'``, ``'EF_PBE'``
            and the target column ``'eigshift(eV)'``.
        n_repeats: number of additional random splits to evaluate.
        rseed: seed for the splits; when ``None`` a seed in ``[1, 1000]`` is
            drawn. The seed actually used is printed so a run can be repeated.

    Returns:
        ``(test_err, clf)``: a list with ``1 + n_repeats`` test-set MAEs (the
        first one belongs to the grid-search split) and the fitted
        ``GridSearchCV`` object from that initial split.
    """
    df = pd.read_csv(csv_path)
    df['pet'] = [compound_pettifor(compound) for compound in df['compound']]

    feature_cols = ['pet', '1s', '2s', '2p', '3s', '3p', '3d', '4s', '4p',
                    '4d', '5s', '5p', '5d', '6s', '6p', 'EF_PBE']
    y_feat = 'eigshift(eV)'  # alternative target column: 'EF_HSE'
    x_data = df[feature_cols].to_numpy()
    y_data = df[y_feat].to_numpy()

    print(np.shape(x_data))
    print(np.shape(y_data))

    test_size = 0.20
    nfold = 5
    nthread = 2
    ker = 'laplacian'  # alternative kernel: 'rbf'
    alphas = [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]
    gammas = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

    # Keep the integer seed itself: `random.seed()` returns None, so storing
    # its return value (as before) meant nothing was ever seeded.
    if rseed is None:
        rseed = random.randint(1, 1000)
    print("random seed", rseed)

    # A dedicated generator drives every split. It is independent of the
    # global NumPy state, which train_model re-seeds, so successive splits
    # differ from each other yet are reproducible from the printed seed.
    rng = np.random.RandomState(rseed)

    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=test_size, shuffle=True, random_state=rng)
    clf, train_error, test_error = train_model(
        x_train, y_train, x_test, y_test, nfold, nthread,
        in_alpha=alphas, in_gamma=gammas, kernel=ker, rseed=rseed)
    test_err = [test_error]
    print(train_error)

    # Re-use the selected hyper-parameters instead of re-running the whole
    # grid search on every split; `clf` (and its best_params_) therefore stays
    # the result of the initial search.
    best_model = KernelRidge(kernel=ker, **clf.best_params_)
    for _ in range(n_repeats):
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=test_size, shuffle=True, random_state=rng)
        best_model.fit(x_train, y_train)
        test_err.append(predict_value(best_model, x_test, y_test))
    return test_err,clf

In [None]:
# Locate the data directories relative to the current working directory.
cwd = os.getcwd()
atom_path = os.path.join(cwd, 'atom_sum_compound_csv_files/')
concat_path = os.path.join(cwd, 'special_concatenations_compounds_atom_sum/')
listdir_cwd = os.listdir(cwd)

# Unpack each data archive only when its directory is not already present.
check = ['atom_sum_compound_csv_files', 'special_concatenations_compounds_atom_sum']
for entries in check:
    if entries not in listdir_cwd:
        # The context manager guarantees the archive is closed; the previous
        # `file.close` (no parentheses) never actually closed it.
        # NOTE(review): extractall() trusts the archive contents; these are
        # assumed to be the project's own data tarballs.
        with tarfile.open(str(entries) + '.tar.gz') as archive:
            archive.extractall()
    else:
        print('%s %s' % (entries, 'exists'))

# Run the repeated-split evaluation for the Ag2O system and summarise the
# test-set MAEs (min, max, mean, standard deviation).
test_err, clf = best_fit(os.path.join(atom_path, 'Ag2O1.csv'))
test_error = np.array(test_err)
print(clf.best_params_)
print(np.min(test_error), np.max(test_error), np.mean(test_error), np.std(test_error))