# Predicting Radical Migration in Collagen
This notebook contains the code that prepares the data for, and trains, the final PaiNN models, which predict the barrier heights of hydrogen atom transfer (HAT) reactions in collagen.

## Assign BDEs and other descriptors to data points
In a first step, we have to assign the correct BDEs and other descriptors to the data points.

In [None]:
import pandas as pd
from pathlib import Path
from utils import add_ref_idx
import numpy as np

In [None]:
### import BDEs
# BDEs.txt has no header row; its columns are named here:
#   names         chemical name of the amino acid radical
#   BDE_H, BDE_G  BDE and BDFE
#   charge        NOTE(review): not described in the original notes -- presumably the net charge; confirm
#   pdb, pdb_H    paths to the PDB files of the radical and of the reactant
#   ref_comp      reference compound used in the isodesmic reaction method
# The two-character separator is interpreted by pandas as a regular expression,
# which is only supported by the python parsing engine.
data = pd.read_csv(
    '/hits/fast/mbm/treydewk/optimized_test/BDEs.txt',
    sep = '\t\t',
    names = ['names', 'BDE_H', 'BDE_G', 'charge', 'pdb', 'pdb_H', 'ref_comp'],
    engine='python'
)

### read in manually assigned alternative indices
# This needs to be done because the matching function returns unique
# indices, but several H atoms can be chemically equivalent.
alt_ind = pd.read_csv(
    '/hits/fast/mbm/treydewk/optimized_test/alt_ind.txt',
    sep = '\t\t',
    names = ['names', 'alt_idx'], engine='python'
)


def _to_zero_based(raw_indices):
    """Turn a comma-separated string of one-based PDB indices into a list of zero-based ints.

    The indices returned by the matching function are zero-indexed, whereas the
    PDB indices are one-indexed, hence the shift by one.

    Parameters
    ----------
    raw_indices : str or missing value
        One cell of the 'alt_idx' column as read from alt_ind.txt.

    Returns
    -------
    list of int
        Zero-based indices; an empty list for a missing cell.

    Raises
    ------
    ValueError
        If an entry cannot be parsed as an integer.
    """
    # read_csv represents a missing cell as NaN (never as None), so test with pd.isna
    if pd.isna(raw_indices):
        return []
    return [int(token) - 1 for token in str(raw_indices).split(',')]


# map builds the whole column at once instead of writing lists into single cells
alt_ind['alt_idx'] = alt_ind['alt_idx'].map(_to_zero_based)

### Manually matched structure: Gly_x-1-9.pdb
# Rows 180 and 182 of the BDE table are assigned to the reference atoms 8 and 10
# of this structure by hand. The rows are taken before any rows are removed from `data`.
termini_df = data.iloc[[180, 182]].reset_index(drop=True)

gly_reference = Path('/hits/basement/mbm/riedmiki/structures/KR0008/reference_structures/Gly_x-1-9.pdb')
idx_termini = [8, 10]
# both rows point to the same reference structure
ref_termini = [gly_reference for _ in idx_termini]
# each atom is its own (only) alternative index
idx_termini_list = [[atom_idx] for atom_idx in idx_termini]

matched = pd.DataFrame({
    'ref': ref_termini,
    'ref_idx': idx_termini,
    'alt_idx': idx_termini_list,
})
termini_df = termini_df.join(matched)

### Remove the radicals for which there are no data points and no reference PDBs.
# Each entry is an inclusive (first, last) range of row labels of the BDE table.
excluded_row_ranges = [
    (8, 13),     # neutral arginine
    (19, 21),    # aspartic acid
    (22, 22),    # backbone
    (23, 24),    # C termini
    (54, 57),    # glutamic acid
    (59, 64),    # cationic histidine
    (65, 68),    # pi tautomer of histidine
    (74, 85),    # hlknl crosslinks
    (92, 92),    # charged hydroxylysine amine radical (computation led to fragmentation)
    (93, 94),    # uncharged hydroxylysine alpha and beta
    (125, 130),  # uncharged lysine
    (135, 137),  # N termini
    (151, 156),  # pyd crosslinks
    (180, 183),  # acetylated and N-amino formylated termini
]
excluded_rows = [
    row
    for first, last in excluded_row_ranges
    for row in range(first, last + 1)
]

# The same rows are removed from both tables so that they stay aligned row by row.
data = data.drop(index=excluded_rows).reset_index(drop=True)
alt_ind = alt_ind.drop(index=excluded_rows).reset_index(drop=True)

### average BDEs for cis and trans conformers of proline and hydroxyproline
def merge_cis_trans_conformers(bde_table, alt_table, residue, sites):
    """Collapse every cis/trans conformer pair of ``residue`` into one averaged row.

    For each site, the rows named ``<residue>_cis_<site>`` and
    ``<residue>_trans_<site>`` are looked up in ``bde_table``. The cis row is
    renamed to ``<residue>_<site>`` and its BDE_H and BDE_G are replaced by the
    arithmetic mean of both conformers; all of its other columns keep the cis
    values. The trans rows are then removed from both tables.

    Parameters
    ----------
    bde_table : pandas.DataFrame
        Table with the columns 'names', 'BDE_H' and 'BDE_G' and unique index labels.
    alt_table : pandas.DataFrame
        Table whose rows carry the same index labels as ``bde_table``.
    residue : str
        Residue prefix of the row names, e.g. 'Hyp' or 'Pro'.
    sites : iterable of str
        Radical sites, e.g. 'alpha' or 'pyrr3'.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(bde_table, alt_table)`` with the trans rows removed and the index
        renumbered from zero. The inputs are not modified.

    Raises
    ------
    KeyError
        If a cis or trans row is missing from ``bde_table``.
    """
    bde_table = bde_table.copy()
    trans_labels = []
    for site in sites:
        labels = {}
        for conformer in ('cis', 'trans'):
            row_name = '{}_{}_{}'.format(residue, conformer, site)
            hits = bde_table.index[bde_table['names'] == row_name]
            if len(hits) == 0:
                raise KeyError('no BDE entry named {!r}'.format(row_name))
            labels[conformer] = hits[0]
        for column in ('BDE_H', 'BDE_G'):
            bde_table.loc[labels['cis'], column] = (
                bde_table.at[labels['cis'], column] + bde_table.at[labels['trans'], column]
            ) / 2
        # the averaged row no longer belongs to a single conformer
        bde_table.loc[labels['cis'], 'names'] = '{}_{}'.format(residue, site)
        trans_labels.append(labels['trans'])
    # drop the same labels from both tables so that they stay aligned row by row
    bde_table = bde_table.drop(index=trans_labels).reset_index(drop=True)
    alt_table = alt_table.drop(index=trans_labels).reset_index(drop=True)
    return bde_table, alt_table


# Hyp
data, alt_ind = merge_cis_trans_conformers(
    data, alt_ind, 'Hyp', ['alpha', 'pyrr3', 'pyrr4_onC', 'pyrr4_onO', 'pyrr5']
)

# Pro
data, alt_ind = merge_cis_trans_conformers(
    data, alt_ind, 'Pro', ['alpha', 'pyrr3', 'pyrr4', 'pyrr5']
)

In [None]:
# add_ref_idx (from utils) is called once per row of the BDE table; with
# result_type="expand" its return values become columns, which are attached to the table.
# NOTE(review): the added columns presumably include 'ref' and 'ref_idx' -- confirm in utils.
ref_columns = data.apply(add_ref_idx, axis=1, result_type="expand")
results = data.join(ref_columns)

In [None]:
### append manually assigned alternative indices for chemically equivalent H atoms
# Only 'alt_idx' is joined: `results` already has a 'names' column, and join refuses
# overlapping column names. errors='ignore' keeps this statement valid on a re-run,
# when 'names' has already been removed.
alt_ind = alt_ind.drop(columns=['names'], errors='ignore')
results = results.join(alt_ind)
# DataFrame.append was removed in pandas 2.0. pd.concat with ignore_index=True adds the
# manually matched rows of termini_df and renumbers the index from zero in one step.
results = pd.concat([results, termini_df], ignore_index=True)

In [None]:
### let's save our progress
# Writes the BDE table to the pickle file 'BDE_df' in the current working directory.
# The same file name is loaded again further below when the descriptors are attached.
results.to_pickle('BDE_df')
# To resume from this checkpoint, load the table instead of re-running the cells above:
# results = pd.read_pickle('BDE_df')

In [None]:
### Dataframe containing HAT reaction barriers
# The columns rad_ref, rad_ref_idx, h_ref and h_ref_idx of this frame are read below
# to look up the BDEs of the radical and of the H donor.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df_tidy_idx = pd.read_pickle('/hits/basement/mbm/riedmiki/structures/KR0008/df_tidy_pckl_220118_idx')

In [None]:
### The BDEs are attached to the HAT barrier data by looking, for each row,
# for the same reference PDB and then for the same index of the reacting
# H atom. A specific amino acid has several PDBs with differently sized
# capping groups, which differ in their suffixes, so only the stem of the
# file name is compared. The stem is the resolved path without its last
# six characters.
ref_stem = pd.DataFrame({
    'ref_stem': results['ref'].map(lambda ref: str(ref.resolve())[:-6])
})
results = results.join(ref_stem)

In [None]:
### Creating the dataframe with the BDEs for each data point
def lookup_bde(bde_table, ref, ref_idx):
    """Find the BDE entry that belongs to one atom of one reference structure.

    A row of ``bde_table`` matches when its 'ref_stem' equals the resolved path
    of ``ref`` without its last six characters and ``int(ref_idx)`` is contained
    in its 'alt_idx' list. The first matching row wins.

    Parameters
    ----------
    bde_table : pandas.DataFrame
        Table with unique index labels and the columns 'ref_stem', 'alt_idx',
        'names', 'BDE_H', 'BDE_G' and 'ref_comp'.
    ref : pathlib.Path
        Reference PDB file of the data point.
    ref_idx : number
        Index of the atom in the reference structure.

    Returns
    -------
    tuple
        ``(names, BDE_H, BDE_G, ref_comp)`` of the matching row, or four NaNs
        when no row matches.
    """
    stem = str(ref.resolve())[:-6]
    # The filtered frame is only read, never modified, so no copy warning can arise.
    candidates = bde_table[bde_table['ref_stem'] == stem]
    for label, equivalent_atoms in zip(candidates.index, candidates['alt_idx']):
        # ref_idx is only converted when there is a candidate to compare with
        if int(ref_idx) in equivalent_atoms:
            return tuple(
                bde_table.at[label, column]
                for column in ('names', 'BDE_H', 'BDE_G', 'ref_comp')
            )
    return (np.nan,) * 4


# one lookup per data point for the radical ...
rad_hits = [
    lookup_bde(results, ref, ref_idx)
    for ref, ref_idx in zip(df_tidy_idx['rad_ref'], df_tidy_idx['rad_ref_idx'])
]
# ... and one for the H donor
h_hits = [
    lookup_bde(results, ref, ref_idx)
    for ref, ref_idx in zip(df_tidy_idx['h_ref'], df_tidy_idx['h_ref_idx'])
]

# the column order follows the tuple returned by lookup_bde
rad_df = pd.DataFrame(rad_hits, columns=['rad_chem_name', 'rad_BDE', 'rad_BDE_G', 'rad_ref_comp'])
h_df = pd.DataFrame(h_hits, columns=['H_chem_name', 'H_BDE', 'H_BDE_G', 'H_ref_comp'])

BDEs_df = pd.concat([rad_df, h_df], axis=1)[
    ['rad_chem_name', 'H_chem_name', 'rad_ref_comp', 'H_ref_comp', 'rad_BDE', 'H_BDE', 'rad_BDE_G', 'H_BDE_G']
]

In [None]:
# attach the BDE columns to the barrier data; rows are paired via their index
complete = df_tidy_idx.join(BDEs_df)
# keep only the data points that have a BDE for both the radical and the H donor;
# no BDE is found, e.g., for HAT after a backbone break
complete_dropped = complete.dropna(subset=['rad_BDE', 'H_BDE'])
# checkpoint: let's save our progress again
complete_dropped.to_pickle('data_complete')
# complete_dropped = pd.read_pickle('data_complete')

In [None]:
### Now we still have to append the other descriptors
BDE_data = pd.read_pickle('BDE_df')


def pdb_for_bde(bde_value):
    """Return the 'pdb' entry of the first row of BDE_data whose BDE_H equals ``bde_value``.

    Raises IndexError when no row has that BDE_H.
    """
    # NOTE(review): rows are identified by exact equality of the BDE value. If two
    # radicals share the same BDE, the first one in the table is used for both --
    # matching on the chemical name would be unambiguous; confirm before changing.
    return BDE_data.loc[BDE_data['BDE_H'] == bde_value, 'pdb'].iloc[0]


rad_BDE = complete_dropped['rad_BDE'].to_list()
H_BDE = complete_dropped['H_BDE'].to_list()

# PDB files of radical in HAT reactions
rad_PDB = [pdb_for_bde(bde_value) for bde_value in rad_BDE]

# PDB files of H donors in HAT reactions
H_PDB = [pdb_for_bde(bde_value) for bde_value in H_BDE]

In [None]:
descriptors_df = pd.read_csv('/hits/fast/mbm/treydewk/optimized_test/descriptors.csv')
# Mordred errors: a column counts as failed when its entry in the second row is a string.
# NOTE(review): only that single row is inspected.
# The text columns 'names', 'pdb' and 'SMILES' are strings by design and are kept.
text_columns = ('names', 'pdb', 'SMILES')
to_drop = [
    column
    for column in descriptors_df.columns
    if column not in text_columns and isinstance(descriptors_df[column].iloc[1], str)
]
descriptors_df = descriptors_df.drop(columns=to_drop)

### average descriptors for cis and trans conformers of proline and hydroxyproline
def average_conformer_descriptors(frame, conformer_pairs, first_descriptor_col=4):
    """Replace the descriptors of each cis row by the cis/trans mean and drop the trans row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Descriptor table with a 'names' column.
    conformer_pairs : iterable of (str, str)
        ``(cis_name, trans_name)`` pairs as they appear in the 'names' column.
        For duplicated names the first occurrence is used.
    first_descriptor_col : int, default 4
        Position of the first column that is averaged; all columns from there
        to the end are treated as numeric descriptors.
        NOTE(review): assumes the columns before it hold identifiers only -- confirm.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` without the trans rows and with the index renumbered
        from zero. The names of the cis rows are left unchanged.

    Raises
    ------
    KeyError
        If one of the names is missing from ``frame``.
    """
    # Values are always read from the unmodified input so that assignments,
    # which may change a column's dtype, cannot influence later averages.
    averaged = frame.copy()
    trans_labels = []
    for cis_name, trans_name in conformer_pairs:
        positions = []
        for row_name in (cis_name, trans_name):
            hits = np.flatnonzero((frame['names'] == row_name).to_numpy())
            if hits.size == 0:
                raise KeyError('no descriptor row named {!r}'.format(row_name))
            positions.append(hits[0])
        cis_pos, trans_pos = positions
        for col in range(first_descriptor_col, frame.shape[1]):
            averaged.iloc[cis_pos, col] = (frame.iloc[cis_pos, col] + frame.iloc[trans_pos, col]) / 2
        trans_labels.append(frame.index[trans_pos])
    return averaged.drop(index=trans_labels).reset_index(drop=True)


# Hyp (the cis entries are spelled in lower case in the descriptor file)
hyp_sites = ['pyrr2', 'pyrr3', 'pyrr4_onC', 'pyrr4_onO', 'pyrr5']
descriptors_df = average_conformer_descriptors(
    descriptors_df,
    [('hyp_cis_{}_1'.format(site), 'Hyp_trans_{}_1'.format(site)) for site in hyp_sites],
)

# Pro
pro_sites = ['pyrr2', 'pyrr3', 'pyrr4', 'pyrr5']
descriptors_df = average_conformer_descriptors(
    descriptors_df,
    [('pro_cis_{}_1'.format(site), 'Pro_trans_{}_1'.format(site)) for site in pro_sites],
)

In [None]:
def descriptor_rows(pdb_files):
    """Collect one row of descriptors_df per entry of ``pdb_files``, in the given order.

    Each entry is matched against the 'pdb' column; for duplicated values the
    first row is used. All rows are selected in a single step, which keeps the
    column dtypes of descriptors_df and avoids growing a DataFrame row by row
    (DataFrame.append was removed in pandas 2.0).

    Parameters
    ----------
    pdb_files : iterable
        Values of the 'pdb' column to look up; repeats are allowed.

    Returns
    -------
    pandas.DataFrame
        The selected rows with all columns of descriptors_df; the index holds
        the labels of the selected rows and may therefore contain repeats.

    Raises
    ------
    KeyError
        If an entry does not occur in the 'pdb' column.
    """
    first_position = {}
    for position, pdb in enumerate(descriptors_df['pdb']):
        first_position.setdefault(pdb, position)
    positions = []
    for pdb in pdb_files:
        if pdb not in first_position:
            raise KeyError('no descriptors found for PDB file {!r}'.format(pdb))
        positions.append(first_position[pdb])
    return descriptors_df.iloc[positions]


# descriptors for radicals in HAT reactions
descriptors_sorted_rad = descriptor_rows(rad_PDB)

# descriptors for H donors in HAT reactions
descriptors_sorted_H = descriptor_rows(H_PDB)

In [None]:
# Remove the first column and the 'names' column, then mark every remaining
# column with the role of the species it describes: '_rad' for the radical,
# '_H' for the H donor.
descriptors_sorted_rad = descriptors_sorted_rad.drop(
    columns=[descriptors_sorted_rad.columns[0], 'names']
).add_suffix('_rad')

descriptors_sorted_H = descriptors_sorted_H.drop(
    columns=[descriptors_sorted_H.columns[0], 'names']
).add_suffix('_H')

In [None]:
# The descriptor frames were built row by row from complete_dropped, so they take
# over its index; the joins below can then pair the rows by label.
indices = complete_dropped.index
descriptors_sorted_rad = descriptors_sorted_rad.set_index(indices)
descriptors_sorted_H = descriptors_sorted_H.set_index(indices)

final_results = (
    complete_dropped
    .join(descriptors_sorted_rad)
    .join(descriptors_sorted_H)
)

### Save final dataframe
final_results.to_pickle('data_complete_w_descriptors')

## Prepare data for GNNs
We need the coordinates and nuclear charges of all atoms as input for the GNNs. Execution of this part might take a while.

In [None]:
from kgcnn.utils.adj import coordinates_to_distancematrix, define_adjacency_from_distance, distance_to_gauss_basis, get_angle_indices, sort_edge_indices
from ase.io import read
import pandas as pd

In [None]:
# Only the paths of the start and end structures are needed from the data set.
# The file name was previously misspelled as 'data_ceomplete_final'; the training
# cell loads 'data_complete_final' and combines it with the graphs built here.
data = pd.read_pickle('data_complete_final')
pdb_start = data['pdb_file_start'].tolist()
pdb_end = data['pdb_file_end'].tolist()
del data

all_nodes_start = []; all_nodes_end = []
all_pos_start = []; all_pos_end = []
all_n_start = []; all_n_end = []

for file_start, file_end in zip(pdb_start, pdb_end):

    mol_start = read(str(file_start.resolve()))
    mol_end = read(str(file_end.resolve()))

    an_start = mol_start.get_atomic_numbers()
    an_end = mol_end.get_atomic_numbers()
    n_start = mol_start.get_global_number_of_atoms()
    n_end = mol_end.get_global_number_of_atoms()
    pos_start = mol_start.positions
    pos_end = mol_end.positions

    # Each structure gets one extra node with atomic number 0, placed where the
    # first atom of the *other* structure sits.
    # NOTE(review): assumes the first atom of every PDB file is the reacting H atom -- confirm.

    # start structure + reacting H atom in its final position
    nodes_start = np.concatenate((np.array([0]), an_start), axis=0)
    pos_start_compl = np.concatenate((np.array([pos_end[0]]), pos_start), axis=0)

    # end structure + reacting H atom in its initial position
    nodes_end = np.concatenate((np.array([0]), an_end), axis=0)
    pos_end_compl = np.concatenate((np.array([pos_start[0]]), pos_end), axis=0)

    all_nodes_start.append(nodes_start); all_nodes_end.append(nodes_end)
    all_pos_start.append(pos_start_compl); all_pos_end.append(pos_end_compl)
    all_n_start.append(n_start); all_n_end.append(n_end)

# create distance matrices, adjacency matrices and edge indices
dist_mat_start = [coordinates_to_distancematrix(x) for x in all_pos_start]
dist_mat_end = [coordinates_to_distancematrix(x) for x in all_pos_end]

# define_adjacency_from_distance provides the adjacency matrix at position 0 and the
# edge indices at position 1; it is evaluated once per structure and both parts are kept
adjacency_start = [define_adjacency_from_distance(x) for x in dist_mat_start]
adjacency_end = [define_adjacency_from_distance(x) for x in dist_mat_end]

adj_mat_start = [result[0] for result in adjacency_start]
adj_mat_end = [result[0] for result in adjacency_end]

edge_idx_start = [result[1] for result in adjacency_start]
edge_idx_end = [result[1] for result in adjacency_end]

# The key 'egde_idx_start' is misspelled, but the training cell reads the column
# under exactly this name, so it is kept as is.
graph_input = pd.DataFrame(
    zip(all_nodes_start, all_nodes_end, all_pos_start, all_pos_end, all_n_start, all_n_end,
    edge_idx_start, edge_idx_end),
    columns = [
        'nodes_start', 'nodes_end', 'pos_start', 'pos_end', 'total_atoms_start', 'total_atoms_end',
        'egde_idx_start', 'edge_idx_end'
    ]
)

graph_input.to_pickle('graph_input_pickled')

## Train and evaluate final model
Training and evaluating the final model with 10-fold cross-validation takes around 8-10 hours.

In [None]:
import numpy as np
import pandas as pd
from utils import K_fold_cross_validation, painn

In [None]:
# Build the model under its own name: assigning it to `painn` would overwrite the
# imported factory and make this cell fail when it is run a second time.
painn_model = painn()

# target values and tabular descriptors
data = pd.read_pickle('data_complete_final')
target = data['Ea'].to_numpy()
descriptors = data[[
    'translation', 'rad_BDE', 'H_BDE', 'max_spin_rad',
    'mull_charge_rad', 'bur_vol_iso_rad', 'nBase_rad', 'SpMax_A_rad',
    'ATSC2s_rad', 'ATSC1Z_rad', 'ATSC2i_rad', 'NdNH_rad', 'SMR_VSA4_rad',
    'max_spin_H', 'mull_charge_H', 'bur_vol_iso_H', 'nBase_H', 'SpMax_A_H',
    'ATSC2s_H', 'ATSC1Z_H', 'ATSC2i_H', 'GATS2dv_H', 
    'BCUTdv-1h_H', 'SMR_VSA4_H', 'VSA_EState7_H'
]]
del data

descriptors = np.array(descriptors)

# graph representation of the start structures
graph_input = pd.read_pickle('graph_input_pickled')
nodes = graph_input['nodes_start'].to_numpy()
pos = graph_input['pos_start'].to_numpy()
# the column key is misspelled in the pickled frame and has to be read under that name
edge_idx = graph_input['egde_idx_start'].to_numpy()

# Every graph needs the edge between node 0 and node 1 in both directions. Graphs
# whose first edge does not point to node 1 get the pair added and are re-sorted.
# sort_edge_indices is imported in the import cell of the graph preparation section.
# NOTE(review): x[0, 1] == 1 is taken as the sign that the edge exists, which
# presumably relies on sorted edge lists -- confirm.
edge_idx = [x if x[0,1]==1 else sort_edge_indices(np.concatenate([np.array([[0, 1], [1,0]]), x], axis=0)) for x in edge_idx]
# NOTE(review): if the edge lists differ in length, recent NumPy versions refuse to
# build a ragged array without dtype=object -- confirm against the NumPy version in use.
edge_idx = np.array(edge_idx)
# the same node pair (0, 1) and edge 0 are passed for every graph
node_radical_index = np.array([np.array([[0, 1]]) for _ in edge_idx])
edge_radical_index = np.array([np.array([[0]]) for _ in node_radical_index])
del graph_input

# named model_inputs so that the builtin `input` stays available
model_inputs = [nodes, pos, edge_idx, node_radical_index, edge_radical_index, descriptors]

train_mae_per_fold, val_mae_per_fold, \
    test_mae_per_fold = K_fold_cross_validation(model_inputs, target, 10, painn_model, batch_size=256, no_epochs=1000)

# one row per fold
results = pd.DataFrame(
    zip(train_mae_per_fold, val_mae_per_fold, test_mae_per_fold),
    columns=['train_maes', 'val_maes', 'test_maes']
)

results.to_csv('maes_per_fold_painn.csv')