# Notebook to Process Data for Model Training

### Here are some parameters which will affect the way the notebook executes.


In [1]:
# Unused in the cells shown here; kept so the notebook namespace is unchanged.
pm_aug_minority_classes_only = False

# Normalization of the spectra that get written to the .npy files:
#   True  -> max-normalized spectra (largest value rescaled to 1)
#   False -> the FEFF-normalized spectra as stored
# The choice matters more for the polynomial vectors, which were processed
# elsewhere. The max-normalized mu is written DIRECTLY to disk when enabled;
# you can also normalize later per your use case (Train_Run_Models does that).
use_max_normalized = False
if use_max_normalized:
    norm_str = 'max'
else:
    norm_str = 'feff'

# Drop spectra whose polynomial fit has a high error (used in the publication).
drop_poly_problems = True


# Add augmented copies of processed spectra to the *training set*.
# NOT used in the publication because it interfered with interpretability,
# but feel free to experiment.
# Augmentations: stretch / squeeze (dilation of the spectrum by 5%) and
# pm 1 / 2 (shift of the domain by 1 or 2 eV respectively).
data_aug = False

# Randomly over-sample (with replacement) the minority classes.
# Done only for pointwise models.
oversampling = True

# Random seed shared by every train/test split.
rseed = 42

In [2]:
import sys
import os
import json
import numpy as np
from tabulate import tabulate
from collections import Counter
from tqdm.notebook import tqdm
from pprint import pprint
from pymatgen.core import Structure
from pymatgen.analysis.structure_matcher import StructureMatcher, ElementComparator
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

#sys.path.append(os.path.join(os.getcwd(), '..'))  # TRIXS path if not automatically detected
from trixs.spectra.core import XAS_Spectrum, XAS_Collation
from trixs.spectra.util import NumpyEncoder
from trixs.machine_learning.benchmarks import precision_recall_matrix, confusion_dict
from trixs.spectra.spectrum_featurize import polynomialize_by_idx, gauge_polynomial_error

# Output locations. The figure folder name follows the active normalization:
# max-normalized runs write to the "maxnorm" folder, FEFF-normalized runs to
# the "feffnorm" folder (the two branches were previously swapped).
figure_write_folder = './gen_figures_maxnorm' if use_max_normalized else './gen_figures_feffnorm'
# Directory holding the per-element JSON-lines input files read below.
storage_directory = './spectral_data'

## Define domains which will be used for x-axis labels later, as well as define the elements which will be imported for use.

In [3]:
# (absorber, ligand) pairs processed by this notebook, ordered Ti -> Cu.
target_elements_groups = [(metal, 'O') for metal in
                          ('Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu')]

# 100-point x-axis grid per pair, built from its (first, last) grid values.
# Presumably energies in eV -- confirm against the spectra files.
x_domains = {
    pair: np.linspace(grid_start, grid_stop, 100)
    for pair, (grid_start, grid_stop) in {
        ('Co', 'O'): (7713.5, 7765.83),
        ('Fe', 'O'): (7115.0, 7167.764),
        ('V', 'O'): (5468.0, 5520.631),
        ('Cu', 'O'): (8987.5, 9039.712),
        ('Ni', 'O'): (8336.5, 8388.723),
        ('Cr', 'O'): (5993.1, 6045.686),
        ('Mn', 'O'): (6541.7, 6594.417),
        ('Ti', 'O'): (4969.0, 5021.024),
    }.items()
}

# One named colour per pair, listed in the same order as target_elements_groups.
colors_by_pair = dict(zip(
    target_elements_groups,
    ['orangered', 'darkorange', 'gold', 'seagreen',
     'dodgerblue', 'navy', 'rebeccapurple', "mediumvioletred"],
))

# Full element name keyed by the absorber symbol (not by the pair tuple).
pair_to_name = dict(
    Ti="Titanium", V='Vanadium',
    Cr='Chromium', Mn="Manganese",
    Fe="Iron", Co="Cobalt",
    Ni='Nickel', Cu='Copper',
)

In [4]:
def prune_outliers(mu):
    """
    Decide whether an absorption spectrum should be kept.

    Despite the name, nothing is removed here: the function returns True when
    the spectrum looks acceptable and False when it matches any of three
    unphysical patterns seen in a small number of FEFF spectra:
    1. The last point is the global maximum (absorption rises, rather than
       decays, in the XAFS region).
    2. The maximum absorption value exceeds 3.
    3. The global maximum occurs within the first 10 points (a spurious
       pre-edge peak larger than the edge's peak).

    :param mu: sequence of absorption values (list or numpy array)
    :return: True to keep the spectrum, False to discard it
    """
    peak = np.max(mu)

    is_outlier = (mu[-1] == peak      # still climbing at the end of the window
                  or peak > 3         # unusually high absorption
                  or peak in mu[:10]) # tallest feature sits in the pre-edge
    return not is_outlier


def test_poly_error_fit_N(X_set, idxs, x_domain, error_bound=.1, use_norm = True):
    """
    Run the one-shot polynomial-fit check on a selection of spectra.

    Each entry of `idxs` is used to look up its spectrum as ``X_set[idx]``.
    (Previously the spectra were taken positionally via ``zip(X_set, idxs)``,
    so when `X_set` was the full data set and `idxs` a shuffled subset, the
    spectrum checked did not correspond to the index that was kept/dropped.)

    :param X_set: indexable collection of spectra (e.g. a 2-D array, one
        spectrum per row) covering every index in `idxs`
    :param idxs: indices into `X_set` to check
    :param x_domain: x-axis grid shared by all spectra
    :param error_bound: fit error above which a spectrum is rejected
    :param use_norm: forwarded to one_shot_poly_error_fit
    :return: (indices that passed, in their original order; number dropped)
    """
    idx_keep = [
        idx for idx in idxs
        if one_shot_poly_error_fit(x_domain, X_set[idx],
                                   use_norm=use_norm,
                                   error_bound=error_bound)
    ]

    return idx_keep, len(idxs)-len(idx_keep)

def one_shot_poly_error_fit(x_domain,mu,use_norm = True, error_bound = .1):
    """
    Check whether piecewise polynomial fits reproduce a single spectrum.

    The spectrum is passed to polynomialize_by_idx (N=20, deg=3,
    label_type='frac') and every returned polynomial is scored with
    gauge_polynomial_error(..., error='abs'). The score is also stored on the
    polynomial object as ``poly.error``.

    :param x_domain: x-axis grid of the spectrum
    :param mu: absorption values
    :param use_norm: if True, divide mu by its maximum before fitting
    :param error_bound: largest acceptable error for any one polynomial
        (this argument was previously ignored in favour of a hard-coded .1)
    :return: False as soon as one polynomial exceeds error_bound, else True
    """
    spectrum = mu/np.max(mu) if use_norm else mu
    poly_set = polynomialize_by_idx(x_domain,spectrum,N=20,deg=3,label_type='frac')

    for poly in poly_set:
        poly.error = gauge_polynomial_error(poly.x,poly.y,poly,error='abs')
        if poly.error > error_bound:
            return False

    return True

# Load in Pointwise Data; 
### Prune points based on undesirable X or Y features and record.

Criteria are:
1. If the coordination number is not 4, 5, or 6 AND no Bader charge information is available.
2. If mu contains abnormally high white-line absorption, if there are spuriously large pre-edge peaks, or if there are spuriously large absorption values for large energy values.
3. If the finest polynomials cannot fit to the spectrum above a cutoff fidelity.

"Ineligible" spectra are those with no Bader charge and a coordination number outside [4, 5, 6].
"Outliered" spectra are those which we gauged to be unphysical due to the criteria described above.
"Unfeaturized" spectra are those on which 20-fold cubic polynomial fits perform poorly.

In [5]:
# Load the pointwise spectra for every element pair, one JSON record per line,
# and keep only records that pass all three screening criteria. The number of
# records rejected at each stage is reported per pair.
data_by_pair = {pair:[] for pair in target_elements_groups}
for pair in tqdm(target_elements_groups,desc='Loading in data'):
    target_file = storage_directory + '/{}_XY.json'.format(pair[0])

    raw_count = 0      # records read
    ineligible = 0     # neither a 4/5/6 coordination nor a Bader charge
    outliered = 0      # rejected by prune_outliers
    unfeaturized = 0   # rejected by the polynomial-fit check

    with open(target_file, 'r') as f:
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            cur_data = json.loads(line)
            raw_count+=1

            # A record is usable if it has a 4/5/6 coordination OR a Bader charge.
            # NOTE(review): a Bader value of exactly 0 is treated as missing here.
            if not (cur_data.get('coordination') in [4,5,6] or cur_data.get('bader')):
                ineligible +=1
                continue

            if not prune_outliers(cur_data['mu']):
                outliered +=1
                continue

            if not one_shot_poly_error_fit(x_domains[pair],cur_data['mu']):
                unfeaturized +=1
                continue

            data_by_pair[pair].append(cur_data)

    print(f"{pair}: Raw total:{raw_count}, Kept: {len(data_by_pair[pair])}, Ineligible={ineligible},"
          f" Outliered={outliered}, Unfeaturized= {unfeaturized},")

HBox(children=(FloatProgress(value=0.0, description='Loading in data', max=8.0, style=ProgressStyle(descriptio…

('Ti', 'O'): Raw total:4930, Kept: 4793, Ineligible=57, Outliered=26, Unfeaturized= 54,
('V', 'O'): Raw total:7120, Kept: 6929, Ineligible=30, Outliered=12, Unfeaturized= 149,
('Cr', 'O'): Raw total:2542, Kept: 2395, Ineligible=8, Outliered=18, Unfeaturized= 121,
('Mn', 'O'): Raw total:8504, Kept: 7917, Ineligible=42, Outliered=504, Unfeaturized= 41,
('Fe', 'O'): Raw total:7362, Kept: 6744, Ineligible=25, Outliered=523, Unfeaturized= 70,
('Co', 'O'): Raw total:3533, Kept: 3453, Ineligible=7, Outliered=66, Unfeaturized= 7,
('Ni', 'O'): Raw total:3420, Kept: 3396, Ineligible=11, Outliered=12, Unfeaturized= 1,
('Cu', 'O'): Raw total:3496, Kept: 3444, Ineligible=21, Outliered=31, Unfeaturized= 0,



# Set up train/test sets 
##  (using indices to allow for easier data augmentation if desired)

In [6]:
# Per-pair containers for the pointwise train/valid/test splits:
#   ttc  -> coordination classification (x, y, plus validation metadata)
#   ttb  -> Bader charge (x, y)
#   ttnn -> spectra (x only) for the neighbor-distance task
#   ttmd -> mean neighbor distance targets (y only), paired with ttnn's x
ttc_by_pair = {pair:{} for pair in target_elements_groups}
ttb_by_pair = {pair:{} for pair in target_elements_groups}
ttnn_by_pair = {pair:{} for pair in target_elements_groups}
ttmd_by_pair = {pair:{} for pair in target_elements_groups}


for pair in target_elements_groups:
    
    # Re-seed the global NumPy RNG for every pair so the np.random.choice
    # draws below are reproducible per element.
    np.random.seed(rseed)

    # Coordination: spectra, augmented variants, metadata and labels.
    # NOTE(review): X_c_st / X_c_sq / X_c_p1 / X_c_m1 are never appended to in
    # this cell, so they stay empty.
    X_c = []; X_c_st =[] ; X_c_sq = [];
    X_c_p1 = []; X_c_m1 = []; M_c = []
    Y_c = []
    
    # Bader
    # NOTE(review): X_b_st / X_b_sq / X_b_p1 / X_b_m1 are never appended to in
    # this cell either; the data_aug branch of the Bader section indexes them
    # and would fail on the empty arrays if data_aug were True.
    X_b = [];
    X_b_st = []; X_b_sq = []; 
    X_b_p1 = []; X_b_m1 = []

    Y_b = [] # bader value
    
    # NN dist max - min
    X_nn = [];
    X_nn_st = []; X_nn_sq = []; 
    X_nn_p1 = []; X_nn_m1 = []
    Y_nn = []; Y_md = [] 
    
    # NOTE(review): coord_tally, bader_tally and pruned_coord_env are not read
    # anywhere else in this cell.
    coord_tally = [bool(point.get('coordination') in [4,5,6]) for point in data_by_pair[pair]]
    bader_tally = [bool(point.get('bader')) for point in data_by_pair[pair]]
    
    pruned_coord_env    = []
    
    ############################
    # EXTRACT DATA FROM POINTS #
    ############################
    # discounted_md counts points that have 'nn_min-max' but whose coordination
    # is not 4/5/6. The other three counters are not updated in this cell.
    discounted_md = 0
    ineleigible = 0
    outliered = 0
    unfeaturized = 0
    for point in data_by_pair[pair]:
            
        # Coordination task: only points labelled 4, 5 or 6.
        if point.get('coordination') in [4,5,6]:
            if use_max_normalized:
                
                X_c.append(point['mu_norm'])
                
            else:
                X_c.append(point['mu'])
            
            Y_c.append(point['coordination'])
            M_c.append(point['metadata'])
            
        # Bader task: any point with a truthy 'bader' value.
        if point.get('bader'):
            

            if use_max_normalized:
                X_b.append(point['mu_norm'])
            else:
                X_b.append(point['mu'])

            Y_b.append(point['bader'])       
                
        # Neighbor-distance task: needs 'nn_min-max' AND a 4/5/6 coordination.
        # This is the only task for which the augmented spectra are collected.
        if point.get('nn_min-max') is not None and point.get('coordination') in [4,5,6]:
                
            if use_max_normalized:
                X_nn.append( point['mu_norm'])
                
                # Augmented spectra are max-normalized on the fly.
                X_nn_st.append(point['mu_stretch']/np.max(point['mu_stretch']))
                X_nn_sq.append(point['mu_squeeze']/np.max(point['mu_squeeze']))
                
                X_nn_p1.append(point['mu_p1']/np.max(point['mu_p1']))
                X_nn_m1.append(point['mu_m1']/np.max(point['mu_m1']))
            else:
                X_nn.append(point['mu'])
                
                X_nn_st.append(point['mu_stretch'])
                X_nn_sq.append(point['mu_squeeze'])
                
                X_nn_p1.append(point['mu_p1'])
                X_nn_m1.append(point['mu_m1'])
            
            Y_nn.append(point['nn_min-max'])    
            Y_md.append(point['avg_nn_dists'])
            
        elif point.get('nn_min-max') is not None:
            discounted_md += 1
        
    
    # Summary: class counts for coordination, then totals for the other tasks.
    # The last line prints "<points used> / <points discounted>".
    print("{} Coordination Data Points:".format(pair),len(Y_c), [str(x)+":"+str(Counter(Y_c)[x]) for x in sorted(Counter(Y_c).keys())])
    print("{} Bader Data Points:".format(pair),len(Y_b))
    print("{} Nearest Neighbor Data Points Used:".format(pair),len(Y_md),'/',discounted_md)
    
    # The splits below are made on row indices rather than on the data itself.
    all_coord_indices = [n for n in range(len(Y_c))]
    all_bader_indices = [n for n in range(len(Y_b))]
    
    # Convert everything to arrays so index lists can be used for selection.
    X_c = np.array(X_c);  
    X_c_p1 = np.array(X_c_p1); X_c_m1 = np.array(X_c_m1)
    X_c_sq = np.array(X_c_sq); X_c_st = np.array(X_c_st)
    Y_c = np.array(Y_c)


    X_b = np.array(X_b); 
    X_b_p1 = np.array(X_b_p1) ;X_b_m1 = np.array(X_b_m1)
    X_b_sq = np.array(X_b_sq); X_b_st = np.array(X_b_st)
    Y_b = np.array(Y_b)
  
    X_nn = np.array(X_nn)    
    X_nn_p1 = np.array(X_nn_p1); X_nn_m1 = np.array(X_nn_m1)
    X_nn_st = np.array(X_nn_st); X_nn_sq = np.array(X_nn_sq)
    Y_nn = np.array(Y_nn); Y_md = np.array(Y_md)

    
    ##########################
    # COORDINATION SETUP
    ##########################
    # Hold out 10% of the indices for test, then 10% of the remainder for validation.
    c_train_idx , c_test_idx, _, _ = \
          train_test_split(all_coord_indices,all_coord_indices, test_size=0.1,
                   random_state=rseed)
    c_train_idx , c_valid_idx, _, _ = \
          train_test_split(c_train_idx,c_train_idx, test_size=0.1,
                   random_state=rseed)
    
    # Filter each split through test_poly_error_fit_N, which returns the kept
    # indices and the number dropped.
    if drop_poly_problems:
        c_train_idx, c_drop_train = test_poly_error_fit_N(X_c,c_train_idx,x_domains[pair])
        c_valid_idx, c_drop_valid = test_poly_error_fit_N(X_c,c_valid_idx,x_domains[pair])
        c_test_idx, c_drop_test  = test_poly_error_fit_N(X_c,c_test_idx, x_domains[pair])
    
    # NOTE(review): these two index sets are never used below, but the calls
    # still draw from the global RNG seeded at the top of the loop, so removing
    # them would change the later np.random.choice results.
    c_squeeze_idx = np.random.choice(c_train_idx,size=int(.3*len(c_train_idx)),replace=False)
    c_stretch_idx = np.random.choice(c_train_idx,size=int(.3*len(c_train_idx)),replace=False)
    
    xc_train = X_c[c_train_idx]; yc_train = Y_c[c_train_idx]
    xc_valid = X_c[c_valid_idx]; yc_valid = Y_c[c_valid_idx]
    xc_test = X_c[c_test_idx]  ; yc_test = Y_c[c_test_idx]
    # Training-set size before oversampling, reported in the print below.
    pre_aug = len(yc_train)
    

    
    # Oversampling is applied to the coordination training set only;
    # validation and test sets are left as drawn.
    ros = RandomOverSampler(random_state = rseed)
    if oversampling:
        xc_train, yc_train = ros.fit_resample(xc_train,yc_train)
    
    
    ttc_by_pair[pair]['train_x'] = xc_train 
    ttc_by_pair[pair]['train_y'] = yc_train
    
    ttc_by_pair[pair]['valid_x'] = xc_valid
    ttc_by_pair[pair]['valid_y'] = yc_valid
    
    ttc_by_pair[pair]['test_x'] = xc_test
    ttc_by_pair[pair]['test_y'] = yc_test
    # Metadata is kept for the validation split only.
    ttc_by_pair[pair]['valid_metadata'] = np.array(M_c)[c_valid_idx]
    
    print("{} Coordination Train/Train+Aug/Valid/Test Split: {}/{}/{}/{}".format(pair,pre_aug,len(yc_train),len(yc_valid),len(yc_test)))

    
    
    #############################################
    # BADER SETUP
    #############################################
    
    # Same 90/10 then 90/10 index split as for coordination.
    b_train_idx , b_test_idx, _, _ = \
          train_test_split(all_bader_indices,all_bader_indices, test_size=0.1,
                   random_state=rseed)
    b_train_idx , b_valid_idx, _, _ = \
          train_test_split(b_train_idx,b_train_idx, test_size=0.1,
                   random_state=rseed)
    
    if drop_poly_problems:
        
        b_train_idx, drop_train = test_poly_error_fit_N(X_b,b_train_idx,x_domains[pair])
        b_valid_idx, drop_valid = test_poly_error_fit_N(X_b,b_valid_idx,x_domains[pair])
        b_test_idx, drop_test = test_poly_error_fit_N(X_b,b_test_idx, x_domains[pair])
        #print(f"Dropped for {pair} bader: {drop_train+drop_valid+drop_test}")

    
    # 30% of the training indices (without replacement) for each of the
    # squeeze and stretch augmentations; only read when data_aug is True.
    b_squeeze_idx = np.random.choice(b_train_idx,size=int(.3*len(b_train_idx)),replace=False)
    b_stretch_idx = np.random.choice(b_train_idx,size=int(.3*len(b_train_idx)),replace=False)
    
    yb_train = Y_b[b_train_idx]
    yb_valid = Y_b[b_valid_idx]
    yb_test  = Y_b[b_test_idx]

    xb_train = X_b[b_train_idx]
    xb_valid = X_b[b_valid_idx]
    xb_test  = X_b[b_test_idx]  
    
    # Append the shifted (+1/-1) copies of every training spectrum and the
    # squeeze/stretch copies of the sampled subsets, repeating labels to match.
    # NOTE(review): the X_b_* arrays are empty in this cell (see above), so this
    # branch cannot run as written with data_aug = True.
    if data_aug:
        xb_train = np.vstack((xb_train, 
                              X_b_p1[b_train_idx],
                              X_b_m1[b_train_idx], 
                              X_b_sq[b_squeeze_idx],
                              X_b_st[b_stretch_idx]))
        yb_train = np.array(list(yb_train)+ \
                            list(Y_b[b_train_idx])+ \
                            list(Y_b[b_train_idx])+ \
                            list(Y_b[b_squeeze_idx])+ \
                            list(Y_b[b_stretch_idx]))

        
    assert xb_train.shape[0] == len(yb_train)
    
    ttb_by_pair[pair]['train_x'] = xb_train 
    ttb_by_pair[pair]['train_y'] = yb_train
    
    ttb_by_pair[pair]['valid_x'] = xb_valid
    ttb_by_pair[pair]['valid_y'] = yb_valid
    
    ttb_by_pair[pair]['test_x'] = xb_test
    ttb_by_pair[pair]['test_y'] = yb_test
    
    
    ######################################################
    # NEIGHBOR PART
    ######################################################
    # Y_nn and Y_md are appended together, so they must have equal length.
    assert (len(Y_md)==len(Y_nn))
    all_nbr_indices = [n for n in range(len(Y_md))]

    nbr_train_idx , nbr_test_idx, _, _ = \
      train_test_split(all_nbr_indices,all_nbr_indices, test_size=0.1,
               random_state=rseed)
    nbr_train_idx , nbr_valid_idx, _, _ = \
      train_test_split(nbr_train_idx,nbr_train_idx, test_size=0.1,
                   random_state=rseed)
    
    if drop_poly_problems:
        
        nbr_train_idx, drop_train = test_poly_error_fit_N(X_nn,nbr_train_idx,x_domains[pair])
        nbr_valid_idx, drop_valid = test_poly_error_fit_N(X_nn,nbr_valid_idx,x_domains[pair])
        nbr_test_idx, drop_test  = test_poly_error_fit_N(X_nn,nbr_test_idx, x_domains[pair])
        #print(f"Dropped for {pair} mean: {drop_train+drop_valid+drop_test}")

    nn_squeeze_idx = np.random.choice(nbr_train_idx,size=int(.3*len(nbr_train_idx)),replace=False)
    nn_stretch_idx = np.random.choice(nbr_train_idx,size=int(.3*len(nbr_train_idx)),replace=False)
    

    # NEAREST NEIGHBOR DISTANCE MAX - MIN , UN NORMALIZED
    # NOTE(review): despite the header above, the targets selected here come
    # from Y_md ('avg_nn_dists'); Y_nn ('nn_min-max') is only used in the
    # length assertion earlier in this section.
    
    xnn_train = X_nn[nbr_train_idx]
    xnn_valid = X_nn[nbr_valid_idx]
    xnn_test = X_nn[nbr_test_idx]

    ymd_train = Y_md[nbr_train_idx]
    ymd_valid = Y_md[nbr_valid_idx]
    ymd_test  = Y_md[nbr_test_idx]
    
    assert xnn_train.shape[0] == len(ymd_train)
    
    # Same augmentation layout as the Bader section, using the X_nn_* arrays
    # that were filled in the extraction loop.
    if data_aug:
        xnn_train = np.vstack((xnn_train, 
                              X_nn_p1[nbr_train_idx],
                              X_nn_m1[nbr_train_idx], 
                               X_nn_sq[nn_squeeze_idx],
                              X_nn_st[nn_stretch_idx]))
        ymd_train = np.array(list(ymd_train)
                             + list(Y_md[nbr_train_idx])
                             + list(Y_md[nbr_train_idx])
                             + list(Y_md[nn_squeeze_idx])
                             +  list(Y_md[nn_stretch_idx]))
    
    
    assert xnn_train.shape[0] == len(ymd_train), str(xnn_train.shape) + str(len(ymd_train))
    
    # Features go to ttnn_by_pair, targets to ttmd_by_pair.
    ttnn_by_pair[pair]['train_x'] = xnn_train 
    ttnn_by_pair[pair]['valid_x'] = xnn_valid
    ttnn_by_pair[pair]['test_x'] = xnn_test
    
    # MEAN NEIGHBOR DISTANCE 

    ttmd_by_pair[pair]['train_y'] = ymd_train
    ttmd_by_pair[pair]['valid_y'] = ymd_valid
    ttmd_by_pair[pair]['test_y'] = ymd_test
    print('-----------')



('Ti', 'O') Coordination Data Points: 4709 ['4:334', '5:2301', '6:2074']
('Ti', 'O') Bader Data Points: 3201
('Ti', 'O') Nearest Neighbor Data Points Used: 4709 / 84
('Ti', 'O') Coordination Train/Train+Aug/Valid/Test Split: 3814/5514/424/471
-----------
('V', 'O') Coordination Data Points: 6862 ['4:1954', '5:2404', '6:2504']
('V', 'O') Bader Data Points: 3863
('V', 'O') Nearest Neighbor Data Points Used: 6862 / 67
('V', 'O') Coordination Train/Train+Aug/Valid/Test Split: 5557/6063/618/687
-----------
('Cr', 'O') Coordination Data Points: 2342 ['4:436', '5:580', '6:1326']
('Cr', 'O') Bader Data Points: 1809
('Cr', 'O') Nearest Neighbor Data Points Used: 2342 / 53
('Cr', 'O') Coordination Train/Train+Aug/Valid/Test Split: 1896/3213/211/235
-----------
('Mn', 'O') Coordination Data Points: 7810 ['4:302', '5:3873', '6:3635']
('Mn', 'O') Bader Data Points: 4031
('Mn', 'O') Nearest Neighbor Data Points Used: 7810 / 107
('Mn', 'O') Coordination Train/Train+Aug/Valid/Test Split: 6326/9453/703

In [7]:
# Create the output directory for the .npy files. exist_ok=True makes the cell
# safely re-runnable and avoids the check-then-create race of testing
# os.path.exists first.
os.makedirs('./model_data', exist_ok=True)

In [8]:
# Write every pointwise split to ./model_data as <element>_<task>_<split>.npy.
# For the 'md' task the spectra live in ttnn_by_pair and the targets in
# ttmd_by_pair, so the source dictionary depends on the split key.
for pair in target_elements_groups:
    for key in ['train_x','train_y','valid_x','valid_y','test_x','test_y']:
        np.save(f'./model_data/{pair[0]}_coord_{key}.npy',ttc_by_pair[pair][key],allow_pickle=False)
        np.save(f'./model_data/{pair[0]}_bader_{key}.npy',ttb_by_pair[pair][key],allow_pickle=False)

        md_source = ttnn_by_pair if 'x' in key else ttmd_by_pair
        np.save(f'./model_data/{pair[0]}_md_{key}.npy',md_source[pair][key],allow_pickle=False)

## Load in the Polynomial Fit Data

Note: Run the following cells twice, once in feff-normalized mode and once in max-normalized mode (toggle `use_max_normalized` below), to write both data files.

In [5]:
# Normalization mode for the polynomial data below; this overrides the value
# chosen in the parameter cell at the top of the notebook.
use_max_normalized = False
if use_max_normalized:
    norm_str = 'max'
else:
    norm_str = 'feff'

In [6]:
# Load the polynomial-fit records (one JSON object per line) for each pair,
# from the max- or feff-normalized file depending on use_max_normalized.
poly_data_by_pair = {pair:[] for pair in target_elements_groups}
for pair in tqdm(target_elements_groups,desc='Loading in data'):
    if use_max_normalized: 
        target_file = storage_directory + '/{}_maxnorm_polynomial_XY.json'.format(pair[0])
    else: 
        target_file = storage_directory + '/{}_feffnorm_polynomial_XY.json'.format(pair[0])
    with open(target_file, 'r') as f:
        # Stream the file rather than reading every line into memory first.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            cur_data = json.loads(line)
            # Keep records carrying at least one (truthy) target label.
            if cur_data.get('one_hot_coord') or cur_data.get('bader') or cur_data.get('nn_min-max'):
                poly_data_by_pair[pair].append(cur_data)

# Feature order: the sorted coefficient labels of the first record of the last
# pair. This is the record the original code reached through the loop variable
# left over after the loop; it is now selected explicitly.
reference_pair = target_elements_groups[-1]
sorted_keys = sorted(poly_data_by_pair[reference_pair][0]['labeled_coefficients'])

# Drop the 'fraction_size:1,' and 'fraction_size:2,' labels and any 'random' ones.
excluded_fragments = ('fraction_size:1,', 'fraction_size:2,', 'random')
sorted_keys_filtered = [key for key in sorted_keys
                        if not any(fragment in key for fragment in excluded_fragments)]
sorted_keys = sorted_keys_filtered

HBox(children=(FloatProgress(value=0.0, description='Loading in data', max=8.0, style=ProgressStyle(descriptio…




In [7]:
# Per-pair train/valid/test splits of the polynomial-coefficient features:
#   pttc -> coordination, pttb -> Bader charge, pttmd -> mean neighbor distance
pttc_by_pair = {pair:{} for pair in target_elements_groups}
pttb_by_pair = {pair:{} for pair in target_elements_groups}
pttmd_by_pair = {pair:{} for pair in target_elements_groups}


def _split_train_valid_test(features, targets):
    """
    Hold out 10% of the rows for test, then 10% of the remainder for
    validation, both with random_state=rseed.

    Returns a dict keyed 'train_x', 'train_y', 'valid_x', 'valid_y',
    'test_x', 'test_y'.
    """
    x_rest, x_test, y_rest, y_test = train_test_split(
        features, targets, test_size=0.1, random_state=rseed)
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_rest, y_rest, test_size=0.1, random_state=rseed)
    return {'train_x': x_train, 'train_y': y_train,
            'valid_x': x_valid, 'valid_y': y_valid,
            'test_x': x_test, 'test_y': y_test}


for pair in target_elements_groups:

    # Feature rows / targets for coordination, mean distance and Bader charge.
    X_pc, Y_pc = [], []
    X_pmd, Y_pmd = [], []
    X_pb, Y_pb = [], []

    for point in poly_data_by_pair[pair]:

        def coefficient_row():
            # Coefficients in the fixed sorted_keys order; built only when needed.
            return [point['labeled_coefficients'][key] for key in sorted_keys]

        has_valid_coord = point.get('coordination') in [4,5,6]

        if has_valid_coord:
            X_pc.append(coefficient_row())
            Y_pc.append(point['coordination'])

        # Mean-distance samples additionally need a 4/5/6 coordination.
        if has_valid_coord and point.get('avg_nn_dists') is not None:
            X_pmd.append(coefficient_row())
            Y_pmd.append(point['avg_nn_dists'])

        if point.get('bader') is not None:
            X_pb.append(coefficient_row())
            Y_pb.append(point['bader'])

    assert len(Y_pmd)==len(X_pmd) and len(X_pc)==len(Y_pc) and len(X_pb)==len(Y_pb)
    print("{} Coordination Data Points:".format(pair),len(Y_pc))
    print("{} Bader Data Points:".format(pair),len(Y_pb))

    X_pc, Y_pc = np.array(X_pc), np.array(Y_pc)
    X_pb, Y_pb = np.array(X_pb), np.array(Y_pb)
    X_pmd, Y_pmd = np.array(X_pmd), np.array(Y_pmd)

    # COORDINATION ON POLYNOMIALS
    pttc_by_pair[pair].update(_split_train_valid_test(X_pc, Y_pc))

    # BADER CHARGES ON POLYNOMIALS
    pttb_by_pair[pair].update(_split_train_valid_test(X_pb, Y_pb))

    # MEAN NEIGHBOR DISTANCE ON POLYNOMIALS
    # Splitting the arrays directly selects the same rows as splitting a list
    # of row indices with the same seed and then indexing the arrays.
    pttmd_by_pair[pair].update(_split_train_valid_test(X_pmd, Y_pmd))
    

('Ti', 'O') Coordination Data Points: 4709
('Ti', 'O') Bader Data Points: 3201
('V', 'O') Coordination Data Points: 6862
('V', 'O') Bader Data Points: 3863
('Cr', 'O') Coordination Data Points: 2342
('Cr', 'O') Bader Data Points: 1809
('Mn', 'O') Coordination Data Points: 7810
('Mn', 'O') Bader Data Points: 4031
('Fe', 'O') Coordination Data Points: 6673
('Fe', 'O') Bader Data Points: 3908
('Co', 'O') Coordination Data Points: 3436
('Co', 'O') Bader Data Points: 2075
('Ni', 'O') Coordination Data Points: 3361
('Ni', 'O') Bader Data Points: 2224
('Cu', 'O') Coordination Data Points: 3376
('Cu', 'O') Bader Data Points: 2167


Write polynomial model data.

In [8]:
# Write every polynomial split to
# ./model_data/<element>_<norm_str>norm_polynomial_<task>_<split>.npy,
# saving coord, bader and md (in that order) for each split key.
polynomial_tasks = (('coord', pttc_by_pair),
                    ('bader', pttb_by_pair),
                    ('md', pttmd_by_pair))
for pair in target_elements_groups:
    for key in ['train_x','train_y','valid_x','valid_y','test_x','test_y']:
        for task, splits_by_pair in polynomial_tasks:
            np.save(f'./model_data/{pair[0]}_{norm_str}norm_polynomial_{task}_{key}.npy',
                    splits_by_pair[pair][key],allow_pickle=False)