### Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import scipy
from scipy.integrate import simps
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from xgboost import XGBRFClassifier
from sklearn import metrics
from tqdm import tqdm, trange
import datetime
import xgboost as xgb
import multiprocessing
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
import pickle
from joblib import dump, load

# Notebook-wide matplotlib styling: bold text, larger tick/label fonts,
# thicker axes lines and 150 dpi figures.
_plot_style = [
    ("font.weight", "bold"),
    ("xtick.labelsize", 14),
    ("ytick.labelsize", 14),
    ("font.size", 18),
    ("axes.labelweight", "bold"),
    ("figure.dpi", 150.0),
    ("axes.linewidth", 2.0),
]
plt.rcParams.update(dict(_plot_style))

# Important lists
# Column-name templates; "{s}" is replaced by a suffix for the merged frames.
_feature_templates = (
    'A-A{s}',
    'B-B{s}',
    'A-B{s}',
    'A-A{s}/B-B{s}',
    'A-B{s}/(A-A{s} + B-B{s})',
)
# Feature columns of a single (unsuffixed) dataframe.
features = [template.format(s='') for template in _feature_templates]
# Feature columns of the merged dataframe: all "_n" columns first, then all "_l" columns.
features_merged = [
    template.format(s=suffix)
    for suffix in ('_n', '_l')
    for template in _feature_templates
]

### DES and non-DES hbond lifetimes
Creation and cleaning up of dataframes. These cells take in a path to GROMACS outputs from hydrogen bond calculations and store the 
A-A, A-B, and B-B lifetimes in a pandas dataframe. The dataframe can be stored as a csv for later use.


#### DES

In [None]:
# DES
# This cell walks the GROMACS hydrogen-bond output folders of every DES
# system, integrates each lifetime curve, and stores the A-A, A-B, and B-B
# lifetimes in a pandas dataframe. The dataframe can be stored as a csv for
# later use.

pathway = Path()

hlife_dict = {}

# NOTE(review): the order of the system folders (and so the row order of the
# dataframe) still follows the filesystem order returned by glob.
for folderz in pathway.glob('./h*-avg-files/des/*'):
    # The first six characters of the folder name are dropped from the label.
    dict_key = folderz.stem[6:]
    hlife_list = []
    # Path.glob yields entries in an arbitrary order, but the values are read
    # back by position below ([0]=A-A, [1]=A-B, [2]=B-B), so sort by name.
    # Presumably the folder names end in A-A / A-B / B-B -- TODO confirm.
    for file in sorted(folderz.glob('hlife-*')):
        if len(os.listdir(file)) == 0:
            # Empty folder: store 0 as a placeholder so positions stay aligned.
            hlife_list.append(0)
            continue

        for txt in sorted(file.glob('hlife*.txt')):
            # Whitespace-separated table; the first two lines are skipped.
            data = pd.read_csv(txt, sep=r'\s+', header=None, skiprows=[0, 1])
            y = data[2]
            # Change dx to 1 for files prior to 7 Sep 2020
            area = simps(y, dx=0.01)
            hlife_list.append(area)

    hlife_dict[dict_key] = hlife_list

DES = []
AA = []
AB = []
BB = []

# Split the per-system lists into one list per interaction type.
for system_name, lifetimes in hlife_dict.items():
    DES.append(system_name)
    AA.append(lifetimes[0])
    AB.append(lifetimes[1])
    BB.append(lifetimes[2])

des_dict = {
    "DES": DES,
    "A-A": AA,
    "A-B": AB,
    "B-B": BB
}
des_hlife_frame = pd.DataFrame(des_dict, columns=["DES", "A-A", "B-B", "A-B"])
aa_dlife = des_hlife_frame['A-A']
ab_dlife = des_hlife_frame['A-B']
bb_dlife = des_hlife_frame['B-B']
# Derived ratio features (a 0 placeholder in B-B gives inf/NaN here; such
# rows are expected to be filtered out afterwards).
des_hlife_frame['A-A/B-B'] = aa_dlife/bb_dlife
# des_hlife_frame['BB/AA'] = bb_dlife/aa_dlife
des_hlife_frame['A-B/(A-A + B-B)'] = ab_dlife/(aa_dlife + bb_dlife)
des_hlife_frame.describe()


In [None]:
# Keep only the rows whose B-B lifetime is greater than 0 and renumber the
# remaining rows from 0. The dataframe can be stored as a csv for later use.
# NOTE(review): only the B-B column is checked; rows with a 0 in A-A or A-B
# are kept.
des_hlife = des_hlife_frame.loc[des_hlife_frame['B-B'] > 0.0].reset_index(drop=True)
# des_hlife

In [None]:
# des_hlife.describe().to_csv('./des_hlife_summary_01-18-2023.csv')
# des_hlife.to_excel('./des_hlife_01-23-2023.xlsx')

In [None]:
# DES non-overlapping histogram
# Plots the A-A, B-B and A-B columns of des_hlife side by side in a single
# 10-bin histogram and saves the figure as a date-stamped TIFF.
# Date stamp (MM-DD-YYYY) used in the output file name.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
des_fig = plt.figure()
des_fig.set_size_inches(12, 10, forward=True)
des_ax = des_fig.add_subplot(1,1,1)
des_ax.set_xlabel("Hydrogen bond lifetime (ns)", fontsize=28, weight='bold')
des_ax.set_ylabel("Number of systems", fontsize=28, weight='bold')
# Fixed tick positions; the pyplot calls below act on the current axes.
ytick = np.arange(0,24, 2)
xtick = np.arange(0,5.5, 0.5)
plt.yticks(ytick,fontsize=26, weight='bold')
plt.xticks(xtick,fontsize=26, weight='bold')
# plt.title('DES', fontsize=26, weight='bold')
plt.ylim([0,24])
# des_hist = des_slice[['A-A', 'B-B', 'A-B']]
# des_hist.plot.hist(bins=20, alpha=0.5, ylim=[0,16], ax=des_ax)
# binss = np.linspace(0.09871825, 4.54204605, 10)
# binss = np.linspace(0.0, 4.0, 10)
# Passing a list of series draws the three data sets as grouped bars.
plt.hist([des_hlife['A-A'], des_hlife['B-B'], des_hlife['A-B']], bins=10, label=['A-A', 'B-B', 'A-B'])
# plt.hist([des_slice['A-A'], des_slice['B-B'], des_slice['A-B']], bins=binss, label=['A-A', 'B-B', 'A-B'])
plt.legend(loc='upper left')
# NOTE(review): presumably the plots/distributions folder must already exist -- confirm.
des_fig.savefig(f'plots/distributions/des_hlife_nonoverlap_{xdate}.tiff', dpi=350,facecolor='white', bbox_inches='tight')
plt.show()


#### Non-DES

In [None]:
# NON-DES
# This cell walks the GROMACS hydrogen-bond output folders of every non-DES
# system, integrates each lifetime curve, and stores the A-A, A-B, and B-B
# lifetimes in a pandas dataframe. The dataframe can be stored as a csv for
# later use.

pathway = Path()

hlife_dict = {}

# NOTE(review): the order of the system folders (and so the row order of the
# dataframe) still follows the filesystem order returned by glob.
for folderz in pathway.glob('./h*-avg-files/nondes/*'):
    # The first six characters of the folder name are dropped from the label.
    dict_key = folderz.stem[6:]
    hlife_list = []
    # Path.glob yields entries in an arbitrary order, but the values are read
    # back by position below ([0]=A-A, [1]=A-B, [2]=B-B), so sort by name.
    # Presumably the folder names end in A-A / A-B / B-B -- TODO confirm.
    for file in sorted(folderz.glob('hlife-*-?-?')):
        # Fewer than three entries (including an empty folder): store 0 as a
        # placeholder so positions stay aligned.
        if len(os.listdir(file)) < 3:
            hlife_list.append(0)
            continue

        for txt in sorted(file.glob('hlife*.txt')):
            # Whitespace-separated table; the first two lines are skipped.
            data = pd.read_csv(txt, sep=r'\s+', header=None, skiprows=[0, 1])
            y = data[2]
            # Change dx to 1 for files prior to 7 Sep 2020
            area = simps(y, dx=0.01)
            hlife_list.append(area)

    hlife_dict[dict_key] = hlife_list

# print(hlife_dict)
NONDES = []
AA_ = []
AB_ = []
BB_ = []

# Split the per-system lists into one list per interaction type.
for system_name, lifetimes in hlife_dict.items():
    NONDES.append(system_name)
    AA_.append(lifetimes[0])
    AB_.append(lifetimes[1])
    BB_.append(lifetimes[2])

nondes_dict = {
    "Non-DES": NONDES,
    "A-A": AA_,
    "A-B": AB_,
    "B-B": BB_
}
nondes_hlife_frame = pd.DataFrame(
    nondes_dict, columns=["Non-DES", "A-A", "B-B", "A-B"])
aa_nlife = nondes_hlife_frame['A-A']
ab_nlife = nondes_hlife_frame['A-B']
bb_nlife = nondes_hlife_frame['B-B']
# Derived ratio features (a 0 placeholder in B-B gives inf/NaN here; such
# rows are expected to be filtered out afterwards).
nondes_hlife_frame['A-A/B-B'] = aa_nlife/bb_nlife
# nondes_hlife_frame['BB/AA'] = bb_nlife/aa_nlife
nondes_hlife_frame['A-B/(A-A + B-B)'] = ab_nlife/(aa_nlife + bb_nlife)
# nondes_hlife_frame


In [None]:
# Keep only the rows whose B-B lifetime is greater than 0 and renumber the
# remaining rows from 0. The dataframe can be stored as a csv for later use.
# NOTE(review): only the B-B column is checked; rows with a 0 in A-A or A-B
# are kept.
nondes_hlife = nondes_hlife_frame.loc[nondes_hlife_frame['B-B'] > 0.0].reset_index(drop=True)
# nondes_hlife

In [None]:
# Display summary statistics of the filtered non-DES lifetime dataframe.
nondes_hlife.describe()

In [None]:
# nondes_hlife.to_excel('./nondes_hlife_01-23-2023.xlsx')

In [None]:
# NON-DES non-overlapping histo
# Plots the A-A, B-B and A-B columns of nondes_hlife side by side in a single
# 10-bin histogram and saves the figure as a date-stamped TIFF.
# Date stamp (MM-DD-YYYY) used in the output file name.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
non_des_fig = plt.figure()
non_des_fig.set_size_inches(12, 10, forward=True)
non_des_ax = non_des_fig.add_subplot(1,1,1)
non_des_ax.set_xlabel("Hydrogen bond lifetime (ns)", fontsize=28, weight='bold')
non_des_ax.set_ylabel("Number of systems", fontsize=28, weight='bold')
# Fixed tick positions; the pyplot calls below act on the current axes.
ytick = np.arange(0,24, 2)
xtick = np.arange(0,10, 0.5)
plt.yticks(ytick,fontsize=26, weight='bold')
plt.xticks(xtick,fontsize=26, weight='bold')
# plt.title('Non-DES', fontsize=26, weight='bold')
# non_des_hist = nondes_slice[['A-A', 'B-B', 'A-B']]
# non_des_hist.plot.hist(bins=20, alpha=0.5, ylim=[0,16], ax =non_des_ax)
# binss = np.linspace(0.09871825, 4.54204605, 10)
# plt.hist([nondes_slice['A-A'], nondes_slice['B-B'], nondes_slice['A-B']], bins=binss, label=['A-A', 'B-B', 'A-B'])
# Passing a list of series draws the three data sets as grouped bars.
plt.hist([nondes_hlife['A-A'], nondes_hlife['B-B'], nondes_hlife['A-B']], bins=10, label=['A-A', 'B-B', 'A-B'])
plt.legend(loc='upper left')
# NOTE(review): presumably the plots/distributions folder must already exist -- confirm.
non_des_fig.savefig(f'plots/distributions/nondes-hlife_nonoverlap_{xdate}.tiff', dpi=350,facecolor='white', bbox_inches='tight')
plt.show()


### Lignin DES

In [None]:
# DES
# This cell walks the GROMACS hydrogen-bond output folders under
# ./desfiles-irc/des, integrates each lifetime curve, and stores the A-A,
# A-B, and B-B lifetimes in a pandas dataframe. The dataframe can be stored
# as a csv for later use.

pathway = Path()

hlife_dict_lignin = {}

# NOTE(review): the order of the system folders (and so the row order of the
# dataframe) still follows the filesystem order returned by glob.
for folderz in pathway.glob('./desfiles-irc/des/*'):
    dict_key = folderz.stem  #[6:]
    hlife_list = []
    # Path.glob yields entries in an arbitrary order, but the values are read
    # back by position below ([0]=A-A, [1]=A-B, [2]=B-B), so sort by name.
    # Presumably the folder names end in A-A / A-B / B-B -- TODO confirm.
    for file in sorted(folderz.glob('hbond-*')):
        # Fewer than five entries: store 0 as a placeholder so positions
        # stay aligned.
        if len(os.listdir(file)) < 5:
            hlife_list.append(0)
            continue

        for txt in sorted(file.glob('hlife*.txt')):
            # Whitespace-separated table; the first two lines are skipped.
            data = pd.read_csv(txt, sep=r'\s+', header=None, skiprows=[0, 1])
            y = data[2]
            # Change dx to 1 for files prior to 7 Sep 2020
            area = simps(y, dx=0.01)
            hlife_list.append(area)

    hlife_dict_lignin[dict_key] = hlife_list

DES = []
AA = []
AB = []
BB = []

# Split the per-system lists into one list per interaction type.
for system_name, lifetimes in hlife_dict_lignin.items():
    DES.append(system_name)
    AA.append(lifetimes[0])
    AB.append(lifetimes[1])
    BB.append(lifetimes[2])

des_dict = {
    "DES": DES,
    "A-A": AA,
    "A-B": AB,
    "B-B": BB
}
des_hlife_frame_lignin = pd.DataFrame(des_dict, columns=["DES", "A-A", "B-B", "A-B"])
aa_dlife_lignin = des_hlife_frame_lignin['A-A']
ab_dlife_lignin = des_hlife_frame_lignin['A-B']
bb_dlife_lignin = des_hlife_frame_lignin['B-B']
# Derived ratio features (a 0 placeholder in B-B gives inf/NaN here; such
# rows are expected to be filtered out afterwards).
des_hlife_frame_lignin['A-A/B-B'] = aa_dlife_lignin/bb_dlife_lignin
# des_hlife_frame_lignin['BB/AA'] = bb_dlife_lignin/aa_dlife_lignin
des_hlife_frame_lignin['A-B/(A-A + B-B)'] = ab_dlife_lignin/(aa_dlife_lignin + bb_dlife_lignin)
des_hlife_frame_lignin.describe()


In [None]:
# Keep only the rows whose B-B lifetime is greater than 0 and renumber the
# remaining rows from 0. The dataframe can be stored as a csv for later use.
# NOTE(review): only the B-B column is checked; rows with a 0 in A-A or A-B
# are kept.
des_hlife_lignin = des_hlife_frame_lignin.loc[des_hlife_frame_lignin['B-B'] > 0.0].reset_index(drop=True)

In [None]:
# DES non-overlapping histogram
# Plots the A-A, B-B and A-B lifetimes of the lignin DES systems side by side
# in a single 10-bin histogram.
# Date stamp (MM-DD-YYYY) for the (currently disabled) output file name.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
des_fig_lignin = plt.figure()
des_fig_lignin.set_size_inches(12, 10, forward=True)
des_ax_lignin = des_fig_lignin.add_subplot(1,1,1)
des_ax_lignin.set_xlabel("Hydrogen bond lifetime (ns)", fontsize=24, weight='bold')
des_ax_lignin.set_ylabel("Number of systems", fontsize=24, weight='bold')
# Fixed tick positions; the pyplot calls below act on the current axes.
ytick = np.arange(0,24, 2)
xtick = np.arange(0,5.5, 0.5)
plt.yticks(ytick,fontsize=22, weight='bold')
plt.xticks(xtick,fontsize=22, weight='bold')
plt.title('DES', fontsize=22, weight='bold')
plt.ylim([0,12])
# Plot the filtered frame (des_hlife_lignin), as every other histogram in
# this notebook does: the unfiltered frame still holds the 0 placeholders
# written for incomplete folders, which would be counted as real systems.
plt.hist([des_hlife_lignin['A-A'], des_hlife_lignin['B-B'], des_hlife_lignin['A-B']], bins=10, label=['A-A', 'B-B', 'A-B'])
plt.legend(loc='upper left')
# des_fig_lignin.savefig(f'plots/des_hlife_frame_lignin_nonoverlap_{xdate}.png', dpi=350,facecolor='white', bbox_inches='tight')
plt.show()


### Lignin non-DES

In [None]:
# NON-DES
# This cell walks the GROMACS hydrogen-bond output folders under
# ./desfiles-irc/nondes, integrates each lifetime curve, and stores the A-A,
# A-B, and B-B lifetimes in a pandas dataframe. The dataframe can be stored
# as a csv for later use.

pathway = Path()

hlife_dict_nondes = {}

# NOTE(review): the order of the system folders (and so the row order of the
# dataframe) still follows the filesystem order returned by glob.
for folderz in pathway.glob('./desfiles-irc/nondes/*'):
    dict_key = folderz.stem
    hlife_list = []
    # Path.glob yields entries in an arbitrary order, but the values are read
    # back by position below ([0]=A-A, [1]=A-B, [2]=B-B), so sort by name.
    # Presumably the folder names end in A-A / A-B / B-B -- TODO confirm.
    for file in sorted(folderz.glob('hbond-*-?-?')):
        # Fewer than five entries: store 0 as a placeholder so positions
        # stay aligned.
        if len(os.listdir(file)) < 5:
            hlife_list.append(0)
            continue

        for txt in sorted(file.glob('hlife*.txt')):
            # Whitespace-separated table; the first two lines are skipped.
            data = pd.read_csv(txt, sep=r'\s+', header=None, skiprows=[0, 1])
            y = data[2]
            # Change dx to 1 for files prior to 7 Sep 2020
            area = simps(y, dx=0.01)
            hlife_list.append(area)

    hlife_dict_nondes[dict_key] = hlife_list

# print(hlife_dict_nondes)
NONDES = []
AA_ = []
AB_ = []
BB_ = []

# Split the per-system lists into one list per interaction type.
for system_name, lifetimes in hlife_dict_nondes.items():
    NONDES.append(system_name)
    AA_.append(lifetimes[0])
    AB_.append(lifetimes[1])
    BB_.append(lifetimes[2])

nondes_dict_lignin = {
    "Non-DES": NONDES,
    "A-A": AA_,
    "A-B": AB_,
    "B-B": BB_
}
nondes_hlife_frame_lignin = pd.DataFrame(
    nondes_dict_lignin, columns=["Non-DES", "A-A", "B-B", "A-B"])
aa_nlife_lignin = nondes_hlife_frame_lignin['A-A']
ab_nlife_lignin = nondes_hlife_frame_lignin['A-B']
bb_nlife_lignin = nondes_hlife_frame_lignin['B-B']
# Derived ratio features (a 0 placeholder in B-B gives inf/NaN here; such
# rows are expected to be filtered out afterwards).
nondes_hlife_frame_lignin['A-A/B-B'] = aa_nlife_lignin/bb_nlife_lignin
# nondes_hlife_frame_lignin['BB/AA'] = bb_nlife_lignin/aa_nlife_lignin
nondes_hlife_frame_lignin['A-B/(A-A + B-B)'] = ab_nlife_lignin/(aa_nlife_lignin + bb_nlife_lignin)
# nondes_hlife_frame_lignin


In [None]:
# Keep only the rows whose B-B lifetime is greater than 0 and renumber the
# remaining rows from 0. The dataframe can be stored as a csv for later use.
# NOTE(review): only the B-B column is checked; rows with a 0 in A-A or A-B
# are kept.
nondes_hlife_lignin = nondes_hlife_frame_lignin.loc[nondes_hlife_frame_lignin['B-B'] > 0.0].reset_index(drop=True)
# nondes_hlife_lignin

In [None]:
# nondes_hlife_frame.to_excel('./nondes_hlife_01-23-2023.xlsx')

In [None]:
# NON-DES non-overlapping histo
# Plots the A-A, B-B and A-B columns of nondes_hlife_lignin side by side in a
# single 10-bin histogram; saving the figure is currently commented out.
# Date stamp (MM-DD-YYYY) for the (currently disabled) output file name.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
non_des_fig_lignin = plt.figure()
non_des_fig_lignin.set_size_inches(12, 10, forward=True)
non_des_ax_lignin = non_des_fig_lignin.add_subplot(1,1,1)
non_des_ax_lignin.set_xlabel("Hydrogen bond lifetime (ns)", fontsize=24, weight='bold')
non_des_ax_lignin.set_ylabel("Number of systems", fontsize=24, weight='bold')
# Fixed tick positions; the pyplot calls below act on the current axes.
ytick = np.arange(0,24, 2)
xtick = np.arange(0,10, 0.5)
plt.yticks(ytick,fontsize=22, weight='bold')
plt.xticks(xtick,fontsize=22, weight='bold')
plt.title('Non-DES', fontsize=22, weight='bold')
plt.ylim([0,12])
# non_des_hist = nondes_slice[['A-A', 'B-B', 'A-B']]
# non_des_hist.plot.hist(bins=20, alpha=0.5, ylim=[0,16], ax =non_des_ax_lignin)
# binss = np.linspace(0.09871825, 4.54204605, 10)
# plt.hist([nondes_slice['A-A'], nondes_slice['B-B'], nondes_slice['A-B']], bins=binss, label=['A-A', 'B-B', 'A-B'])
# Passing a list of series draws the three data sets as grouped bars.
plt.hist([nondes_hlife_lignin['A-A'], nondes_hlife_lignin['B-B'], nondes_hlife_lignin['A-B']], bins=10, label=['A-A', 'B-B', 'A-B'])
plt.legend(loc='upper left')
# non_des_fig_lignin.savefig(f'plots/distributions/nondes-ligninhlife_nonoverlap_{xdate}.png', dpi=350,facecolor='white', bbox_inches='tight')
plt.show()


### DES and non-DES hbond numbers
Creation and cleaning up of dataframes.

#### DES

In [None]:
# This cell walks the GROMACS hydrogen-bond output folders of every DES
# system, averages each hydrogen-bond-number series, and stores the A-A,
# A-B, and B-B averages in a pandas dataframe. The dataframe can be stored as
# a csv for later use.

pathway = Path()

hnum_dict = {}

# NOTE(review): the order of the system folders (and so the row order of the
# dataframe) still follows the filesystem order returned by glob.
for folderz in pathway.glob('./h*-avg-files/des/*'):
    # The first six characters of the folder name are dropped from the label.
    dict_key = folderz.stem[6:]
    hnum_list = []
    # Path.glob yields entries in an arbitrary order, but the values are read
    # back by position below ([0]=A-A, [1]=A-B, [2]=B-B), so sort by name.
    # Presumably the folder names end in A-A / A-B / B-B -- TODO confirm.
    for file in sorted(folderz.glob('hlife-*')):
        if len(os.listdir(file)) == 0:
            # Empty folder: store 0 as a placeholder so positions stay aligned.
            hnum_list.append(0)
            continue

        for txt in sorted(file.glob('hnum*.txt')):
            # Whitespace-separated table; the first two lines are skipped.
            data = pd.read_csv(txt, sep=r'\s+', header=None, skiprows=[0, 1])
            y = data[1]
            # Average of the second column over all rows of the file.
            avg = np.average(y)
            hnum_list.append(avg)

    hnum_dict[dict_key] = hnum_list

DES = []
AA = []
AB = []
BB = []

# Split the per-system lists into one list per interaction type.
for system_name, counts in hnum_dict.items():
    DES.append(system_name)
    AA.append(counts[0])
    AB.append(counts[1])
    BB.append(counts[2])

des_dict = {
    "DES": DES,
    "A-A": AA,
    "A-B": AB,
    "B-B": BB
}
des_hnum_frame = pd.DataFrame(des_dict, columns=["DES", "A-A", "B-B", "A-B"])
# des_hnum_frame
aa_dnum = des_hnum_frame['A-A']
ab_dnum = des_hnum_frame['A-B']
bb_dnum = des_hnum_frame['B-B']
# Derived ratio features (a 0 in B-B gives inf/NaN here; such rows are
# expected to be filtered out afterwards).
des_hnum_frame['A-A/B-B'] = aa_dnum/bb_dnum
# des_hnum_frame['BB/AA'] = bb_dnum/aa_dnum
des_hnum_frame['A-B/(A-A + B-B)'] = ab_dnum/(aa_dnum + bb_dnum)
des_hnum_frame.describe()

In [None]:
# Display the unfiltered DES hydrogen-bond-number dataframe.
des_hnum_frame

In [None]:
# Keep only the rows whose B-B value is greater than 0 and renumber the
# remaining rows from 0. The dataframe can be stored as a csv for later use.
# NOTE(review): only the B-B column is checked; rows with a 0 in A-A or A-B
# are kept.
des_hnum = des_hnum_frame.loc[des_hnum_frame['B-B'] > 0.0].reset_index(drop=True)
# des_hnum

In [None]:
# Display summary statistics of the filtered DES hydrogen-bond-number dataframe.
des_hnum.describe()

In [None]:
# des_hnum.describe().to_excel('./des_hnum_new_summary.xlsx')
# des_hnum.to_excel('./des_hnum_01-23-2023.xlsx')

In [None]:
# non-overlapping histo DES
# Plots the A-A, B-B and A-B columns of des_hnum side by side in a single
# 10-bin histogram and saves the figure as a date-stamped TIFF.
# Date stamp (MM-DD-YYYY) used in the output file name.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
des_fig = plt.figure()
des_fig.set_size_inches(12, 10, forward=True)
des_ax = des_fig.add_subplot(1,1,1)
des_ax.set_xlabel("Hydrogen bond number", fontsize=28, weight='bold')
des_ax.set_ylabel("Number of systems", fontsize=28, weight='bold')
# Fixed tick positions; the pyplot calls below act on the current axes.
ytick = np.arange(0,40, 4)
xtick = np.arange(0,140, 10)
plt.yticks(ytick,fontsize=26, weight='bold')
plt.xticks(xtick,fontsize=26, weight='bold')
# plt.title('DES', fontsize=22, weight='bold')
plt.ylim([0,40])
# des = des[['AA', 'BB', 'AB']]
# des.plot.hist(bins=20, alpha=0.5, ylim=[0,22], ax=des_ax)
# Passing a list of series draws the three data sets as grouped bars.
plt.hist([des_hnum['A-A'], des_hnum['B-B'], des_hnum['A-B']], bins=10, label=['A-A', 'B-B', 'A-B'])
# plt.hist([des['A-A'], des['B-B'], des['A-B']], bins=binss, label=['A-A', 'B-B', 'A-B'])
plt.legend(loc='upper right', prop={'weight':'bold'})
# NOTE(review): presumably the plots/distributions folder must already exist -- confirm.
des_fig.savefig(f'./plots/distributions/des_hnum_nonoverlap_{xdate}.tiff', dpi=350,facecolor='white', bbox_inches='tight')
plt.show()

#### Lignin DES

In [None]:
# This cell walks the GROMACS hydrogen-bond output folders under
# ./desfiles-irc/des, averages each hydrogen-bond-number series, and stores
# the A-A, A-B, and B-B averages in a pandas dataframe. The dataframe can be
# stored as a csv for later use.

pathway = Path()

hnum_dict_lignin = {}

# NOTE(review): the order of the system folders (and so the row order of the
# dataframe) still follows the filesystem order returned by glob.
for folderz in pathway.glob('./desfiles-irc/des/*'):
    dict_key = folderz.stem
    hnum_list = []
    # Path.glob yields entries in an arbitrary order, but the values are read
    # back by position below ([0]=A-A, [1]=A-B, [2]=B-B), so sort by name.
    # Presumably the folder names end in A-A / A-B / B-B -- TODO confirm.
    for file in sorted(folderz.glob('hbond-*')):
        # Fewer than five entries: store 0 as a placeholder so positions
        # stay aligned.
        if len(os.listdir(file)) < 5:
            hnum_list.append(0)
            continue

        for txt in sorted(file.glob('hnum*.txt')):
            # Whitespace-separated table; the first two lines are skipped.
            data = pd.read_csv(txt, sep=r'\s+', header=None, skiprows=[0, 1])
            y = data[1]
            # Average of the second column over all rows of the file.
            avg = np.average(y)
            hnum_list.append(avg)

    hnum_dict_lignin[dict_key] = hnum_list

DES = []
AA = []
AB = []
BB = []

# Split the per-system lists into one list per interaction type.
for system_name, counts in hnum_dict_lignin.items():
    DES.append(system_name)
    AA.append(counts[0])
    AB.append(counts[1])
    BB.append(counts[2])

des_dict = {
    "DES": DES,
    "A-A": AA,
    "A-B": AB,
    "B-B": BB
}
des_hnum_frame_lignin = pd.DataFrame(des_dict, columns=["DES", "A-A", "B-B", "A-B"])
# des_hnum_frame_lignin
aa_dnum_lignin = des_hnum_frame_lignin['A-A']
ab_dnum_lignin = des_hnum_frame_lignin['A-B']
bb_dnum_lignin = des_hnum_frame_lignin['B-B']
# Derived ratio features (a 0 in B-B gives inf/NaN here; such rows are
# expected to be filtered out afterwards).
des_hnum_frame_lignin['A-A/B-B'] = aa_dnum_lignin/bb_dnum_lignin
# des_hnum_frame_lignin['BB/AA'] = bb_dnum_lignin/aa_dnum_lignin
des_hnum_frame_lignin['A-B/(A-A + B-B)'] = ab_dnum_lignin/(aa_dnum_lignin + bb_dnum_lignin)
des_hnum_frame_lignin.describe()

In [None]:
# Display the unfiltered lignin DES hydrogen-bond-number dataframe.
des_hnum_frame_lignin

In [None]:
# Keep only the rows whose B-B value is greater than 0 and renumber the
# remaining rows from 0. The dataframe can be stored as a csv for later use.
# NOTE(review): only the B-B column is checked; rows with a 0 in A-A or A-B
# are kept.
des_hnum_lignin = des_hnum_frame_lignin.loc[des_hnum_frame_lignin['B-B'] > 0.0].reset_index(drop=True)
# des_hnum_lignin

In [None]:
# Display summary statistics of the filtered lignin DES hydrogen-bond-number dataframe.
des_hnum_lignin.describe()

In [None]:
# des_hnum_lignin.describe().to_excel('./des_lignin_hnum_summary.xlsx')
# des_hnum_lignin.to_excel('./des_lignin_hnum.xlsx')

In [None]:
# non-overlapping histo DES
# Plots the A-A, B-B and A-B columns of des_hnum_lignin side by side in a
# single 10-bin histogram and saves the figure as a date-stamped PNG in the
# current working directory.
# Date stamp (MM-DD-YYYY) used in the output file name.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
des_fig_lignin = plt.figure()
des_fig_lignin.set_size_inches(12, 8, forward=True)
des_ax_lignin = des_fig_lignin.add_subplot(1,1,1)
des_ax_lignin.set_xlabel("Hydrogen bond number", fontsize=24, weight='bold')
des_ax_lignin.set_ylabel("Number of systems", fontsize=24, weight='bold')
# Fixed tick positions; the pyplot calls below act on the current axes.
ytick = np.arange(0,40, 2)
xtick = np.arange(0,140, 10)
plt.yticks(ytick,fontsize=22, weight='bold')
plt.xticks(xtick,fontsize=22, weight='bold')
plt.title('DES', fontsize=22, weight='bold')
plt.ylim([0,12])
# des = des[['AA', 'BB', 'AB']]
# des.plot.hist(bins=20, alpha=0.5, ylim=[0,22], ax=des_ax_lignin)
# Passing a list of series draws the three data sets as grouped bars.
plt.hist([des_hnum_lignin['A-A'], des_hnum_lignin['B-B'], des_hnum_lignin['A-B']], bins=10, label=['A-A', 'B-B', 'A-B'])
# plt.hist([des['A-A'], des['B-B'], des['A-B']], bins=binss, label=['A-A', 'B-B', 'A-B'])
plt.legend(loc='upper right', prop={'weight':'bold'})
des_fig_lignin.savefig(f'des_hnum_lignin_nonoverlap_{xdate}.png', dpi=350,facecolor='white', bbox_inches='tight')
plt.show()

#### non-DES

In [None]:
# This cell walks the GROMACS hydrogen-bond output folders of every non-DES
# system, averages each hydrogen-bond-number series, and stores the A-A,
# A-B, and B-B averages in a pandas dataframe. The dataframe can be stored as
# a csv for later use.

pathway = Path()

hnum_dict_nondes = {}

# NOTE(review): the order of the system folders (and so the row order of the
# dataframe) still follows the filesystem order returned by glob.
for folderz in pathway.glob('./h*-avg-files/nondes/*'):
    # The first six characters of the folder name are dropped from the label.
    dict_key = folderz.stem[6:]
    hnum_list_nondes = []
    # Path.glob yields entries in an arbitrary order, but the values are read
    # back by position below ([0]=A-A, [1]=A-B, [2]=B-B), so sort by name.
    # Presumably the folder names end in A-A / A-B / B-B -- TODO confirm.
    for file in sorted(folderz.glob('hlife-???-*')):
        # Fewer than five entries: store 0 as a placeholder so positions
        # stay aligned.
        if len(os.listdir(file)) < 5:
            hnum_list_nondes.append(0)
            continue

        for txt in sorted(file.glob('hnum*.txt')):
            # Whitespace-separated table; the first two lines are skipped.
            data = pd.read_csv(txt, sep=r'\s+', header=None, skiprows=[0, 1])
            y = data[1]
            # Average of the second column over all rows of the file.
            avg = np.average(y)
            hnum_list_nondes.append(avg)

    hnum_dict_nondes[dict_key] = hnum_list_nondes

NONDES = []
AA = []
AB = []
BB = []

# Split the per-system lists into one list per interaction type.
for system_name, counts in hnum_dict_nondes.items():
    NONDES.append(system_name)
    AA.append(counts[0])
    AB.append(counts[1])
    BB.append(counts[2])
    # try:
    #     BB.append(i[1][2])
    # except:
    #     BB.append(0)

# NOTE(review): the name column is labelled "DES" here although it holds the
# non-DES system names (the lifetime frames use "Non-DES"); left unchanged
# because other cells may rely on this column name.
nondes_dict = {
    "DES": NONDES,
    "A-A": AA,
    "A-B": AB,
    "B-B": BB
}
nondes_hnum_frame = pd.DataFrame(nondes_dict, columns=["DES", "A-A", "B-B", "A-B"])
# nondes_hnum_frame
aa_nnum = nondes_hnum_frame['A-A']
ab_nnum = nondes_hnum_frame['A-B']
bb_nnum = nondes_hnum_frame['B-B']
# Derived ratio features (a 0 in B-B gives inf/NaN here; such rows are
# expected to be filtered out afterwards).
nondes_hnum_frame['A-A/B-B'] = aa_nnum/bb_nnum
# nondes_hnum_frame['BB/AA'] = bb_nnum/aa_nnum
nondes_hnum_frame['A-B/(A-A + B-B)'] = ab_nnum/(aa_nnum + bb_nnum)
nondes_hnum_frame.describe()

In [None]:
# nondes_hnum_frame.to_csv('./nondes_investigation.csv')

In [None]:
# Keep only the rows whose B-B value is greater than 0 and renumber the
# remaining rows from 0. The dataframe can be stored as a csv for later use.
# NOTE(review): only the B-B column is checked; rows with a 0 in A-A or A-B
# are kept.
nondes_hnum = nondes_hnum_frame.loc[nondes_hnum_frame['B-B'] > 0.0].reset_index(drop=True)
# nondes_hnum

In [None]:
# nondes_hnum.describe().to_excel('./nondes_hnum_new_summary.xlsx')
# nondes_hnum.to_excel('./nondes_hnum_01-23-2023.xlsx')

In [None]:
# non-overlapping histo NON-DES
# Plots the A-A, B-B and A-B columns of nondes_hnum side by side in a single
# 10-bin histogram and saves the figure as a date-stamped TIFF.
# Date stamp (MM-DD-YYYY) used in the output file name.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
non_des_fig = plt.figure()
non_des_fig.set_size_inches(12, 10, forward=True)
non_des_ax = non_des_fig.add_subplot(1,1,1)
non_des_ax.set_xlabel("Hydrogen bond number", fontsize=28, weight='bold')
non_des_ax.set_ylabel("Number of systems", fontsize=28, weight='bold')
# Fixed tick positions; the pyplot calls below act on the current axes.
ytick = np.arange(0,40, 4)
xtick = np.arange(0,90, 10)
plt.yticks(ytick,fontsize=26, weight='bold')
plt.xticks(xtick,fontsize=26, weight='bold')
# plt.title('Non-DES', fontsize=22, weight='bold')
plt.ylim([0,40])
# non_des_hist = non_des[['AA', 'BB', 'AB']]
# non_des_hist.plot.hist(bins=20, alpha=0.5, ylim=[0,22], ax =non_des_ax) # ylim=[0,22],
# Passing a list of series draws the three data sets as grouped bars.
plt.hist([nondes_hnum['A-A'], nondes_hnum['B-B'], nondes_hnum['A-B']], bins=10, label=['A-A', 'B-B', 'A-B'])
plt.legend(loc='upper right', prop={'weight':'bold'})
# NOTE(review): presumably the plots/distributions folder must already exist -- confirm.
non_des_fig.savefig(f'./plots/distributions/nondes_hnum_nonoverlap_{xdate}.tiff', dpi=350,facecolor='white', bbox_inches='tight')
plt.show()


#### Lignin non-DES

In [None]:
# This cell walks the GROMACS hydrogen-bond output folders under
# ./desfiles-irc/nondes, averages each hydrogen-bond-number series, and
# stores the A-A, A-B, and B-B averages in a pandas dataframe. The dataframe
# can be stored as a csv for later use.

pathway = Path()

hnum_dict_nondes_lignin = {}

# NOTE(review): the order of the system folders (and so the row order of the
# dataframe) still follows the filesystem order returned by glob.
for folderz in pathway.glob('./desfiles-irc/nondes/*'):
    dict_key = folderz.stem
    hnum_list_nondes = []
    # Path.glob yields entries in an arbitrary order, but the values are read
    # back by position below ([0]=A-A, [1]=A-B, [2]=B-B), so sort by name.
    # Presumably the folder names end in A-A / A-B / B-B -- TODO confirm.
    for file in sorted(folderz.glob('hbond-???-*')):
        # Fewer than three entries: store 0 as a placeholder so positions
        # stay aligned.
        if len(os.listdir(file)) < 3:
            hnum_list_nondes.append(0)
            continue

        for txt in sorted(file.glob('hnum*.txt')):
            # Whitespace-separated table; the first two lines are skipped.
            data = pd.read_csv(txt, sep=r'\s+', header=None, skiprows=[0, 1])
            y = data[1]
            # Average of the second column over all rows of the file.
            avg = np.average(y)
            hnum_list_nondes.append(avg)

    hnum_dict_nondes_lignin[dict_key] = hnum_list_nondes

NONDES = []
AA = []
AB = []
BB = []

# Split the per-system lists into one list per interaction type.
for system_name, counts in hnum_dict_nondes_lignin.items():
    NONDES.append(system_name)
    AA.append(counts[0])
    AB.append(counts[1])
    BB.append(counts[2])
    # try:
    #     BB.append(i[1][2])
    # except:
    #     BB.append(0)

# NOTE(review): the name column is labelled "DES" here although it holds the
# non-DES system names (the lifetime frames use "Non-DES"); left unchanged
# because other cells may rely on this column name.
nondes_dict_lignin = {
    "DES": NONDES,
    "A-A": AA,
    "A-B": AB,
    "B-B": BB
}
nondes_hnum_frame_lignin = pd.DataFrame(nondes_dict_lignin, columns=["DES", "A-A", "B-B", "A-B"])
# nondes_hnum_frame_lignin
aa_nnum_lignin = nondes_hnum_frame_lignin['A-A']
ab_nnum_lignin = nondes_hnum_frame_lignin['A-B']
bb_nnum_lignin = nondes_hnum_frame_lignin['B-B']
# Derived ratio features (a 0 in B-B gives inf/NaN here; such rows are
# expected to be filtered out afterwards).
nondes_hnum_frame_lignin['A-A/B-B'] = aa_nnum_lignin/bb_nnum_lignin
# nondes_hnum_frame_lignin['BB/AA'] = bb_nnum_lignin/aa_nnum_lignin
nondes_hnum_frame_lignin['A-B/(A-A + B-B)'] = ab_nnum_lignin/(aa_nnum_lignin + bb_nnum_lignin)
nondes_hnum_frame_lignin.describe()

In [None]:
# nondes_hnum_frame_lignin.to_excel('./nondes_lignin_hnum.xlsx')
# nondes_hnum_frame_lignin.describe().to_excel('./nondes_lignin_hnum_summary.xlsx')

In [None]:
# Keep only the rows whose B-B value is greater than 0 and renumber the
# remaining rows from 0. The dataframe can be stored as a csv for later use.
# NOTE(review): only the B-B column is checked; rows with a 0 in A-A or A-B
# are kept.
nondes_hnum_lignin = nondes_hnum_frame_lignin.loc[nondes_hnum_frame_lignin['B-B'] > 0.0].reset_index(drop=True)
# nondes_hnum_lignin

In [None]:
# non-overlapping histo NON-DES
# Plots the A-A, B-B and A-B columns of nondes_hnum_lignin side by side in a
# single 10-bin histogram and saves the figure as a date-stamped PNG in the
# current working directory.
# Date stamp (MM-DD-YYYY) used in the output file name.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
non_des_fig_lignin = plt.figure()
non_des_fig_lignin.set_size_inches(12, 8, forward=True)
non_des_ax_lignin = non_des_fig_lignin.add_subplot(1,1,1)
non_des_ax_lignin.set_xlabel("Hydrogen bond number", fontsize=24, weight='bold')
non_des_ax_lignin.set_ylabel("Number of systems", fontsize=24, weight='bold')
# Fixed tick positions; the pyplot calls below act on the current axes.
ytick = np.arange(0,40, 2)
xtick = np.arange(0,90, 10)
plt.yticks(ytick,fontsize=22, weight='bold')
plt.xticks(xtick,fontsize=22, weight='bold')
plt.title('Non-DES', fontsize=22, weight='bold')
plt.ylim([0,12])
# non_des_hist = non_des[['AA', 'BB', 'AB']]
# non_des_hist.plot.hist(bins=20, alpha=0.5, ylim=[0,22], ax =non_des_ax_lignin) # ylim=[0,22],
# Passing a list of series draws the three data sets as grouped bars.
plt.hist([nondes_hnum_lignin['A-A'], nondes_hnum_lignin['B-B'], nondes_hnum_lignin['A-B']], bins=10, label=['A-A', 'B-B', 'A-B'])
plt.legend(loc='upper right', prop={'weight':'bold'})
non_des_fig_lignin.savefig(f'nondes_hnum_lignin_nonoverlap_{xdate}.png', dpi=350,facecolor='white', bbox_inches='tight')
plt.show()


### Utility functions

In [None]:
def dirmaker(path):
    '''
    Create the folder `path` if it does not already exist.

    Missing parent folders are created as well, and an already existing
    folder is left untouched.

    path: folder path to create.
    Raises FileExistsError if `path` exists but is not a folder.
    '''
    # makedirs with exist_ok avoids the check-then-create race of
    # os.path.isdir + os.mkdir and also handles nested paths.
    os.makedirs(path, exist_ok=True)


#### data generation for hnum or hlife
This generates random training and testing samples for the hnum or hlife scenarios.

In [None]:
def data_generator(des, nondes, test_sample_size=8, nondes_batch_size=38, random_state=1):
    '''
    Build training and testing arrays from a DES and a non-DES dataframe.

    A random batch of `nondes_batch_size` rows is drawn from `nondes`; then
    `test_sample_size` rows are held out from `des` and from that batch as
    the test set. Every remaining row of `des` and of the batch forms the
    training set. Both dataframes must contain an 'output' column, which is
    used as the label; all other columns are used as features.

    des, nondes: pandas dataframes with an 'output' column.
    test_sample_size: number of rows held out from each class for testing.
    nondes_batch_size: number of non-DES rows drawn before the split.
    random_state: seed passed to every sampling call for replicability.

    Returns X_train, y_train, X_test, y_test as numpy arrays.
    '''
    # Renumber the rows so each one has a unique label; the held-out rows can
    # then be removed by label. (Removing them with drop_duplicates(keep=False)
    # would also silently discard rows that merely share identical values.)
    des_pool = des.reset_index(drop=True)
    des_test_df = des_pool.sample(
        n=test_sample_size, replace=False, random_state=random_state)

    nondes_batch_df = nondes.sample(
        n=nondes_batch_size, replace=False, random_state=random_state
    ).reset_index(drop=True)
    nondes_test_df = nondes_batch_df.sample(
        n=test_sample_size, replace=False, random_state=random_state)

    # Training rows: DES rows first, then non-DES rows, minus the held-out ones.
    df_train = pd.concat([
        des_pool.drop(index=des_test_df.index),
        nondes_batch_df.drop(index=nondes_test_df.index),
    ])
    # Test rows: DES rows first, then non-DES rows, in sampled order.
    df_test = pd.concat([des_test_df, nondes_test_df])

    X_train = np.array(df_train.drop(columns=['output']))
    y_train = np.array(df_train['output'])

    X_test = np.array(df_test.drop(columns=['output']))
    y_test = np.array(df_test['output'])

    return X_train, y_train, X_test, y_test


#### data generation for merged
This generates only training and testing data for hnum + hlife scenarios.

In [None]:
def df_generator(des, nondes, test_sample_size=8, nondes_batch_size=38, random_state=1):
    '''
    Draw one train/test split from the merged DES and non-DES dataframes.

    Same sampling scheme as the hnum/hlife generator, but for frames that
    carry two label columns, 'output_l' and 'output_n'. Both are removed
    from the feature matrix and 'output_n' is used as the target.
    random_state seeds every sampling step so the split is reproducible.

    Returns X_train, y_train, X_test, y_test as numpy arrays.
    '''
    sampling = dict(replace=False, random_state=random_state)

    held_out_des = des.sample(n=test_sample_size, **sampling)
    nondes_subset = nondes.sample(n=nondes_batch_size, **sampling)
    held_out_nondes = nondes_subset.sample(n=test_sample_size, **sampling)

    # Held-out rows occur twice in the stack, so keep=False drops both
    # copies and only training rows remain.
    stacked = pd.concat([des, nondes_subset, held_out_des, held_out_nondes])
    train_frame = stacked.drop_duplicates(keep=False).reset_index()

    test_frame = pd.concat([held_out_des, held_out_nondes])
    test_frame = test_frame.drop_duplicates(keep=False).reset_index()

    # Label columns plus the 'index' column added by reset_index().
    non_features = ['output_l', 'output_n', 'index']

    X_train = np.array(train_frame.drop(columns=non_features))
    y_train = np.array(train_frame['output_n'])

    X_test = np.array(test_frame.drop(columns=non_features))
    y_test = np.array(test_frame['output_n'])

    return X_train, y_train, X_test, y_test


#### data generation for only CV
Generates X_train and y_train that can then be split into train/test by the model.
Useful for CV.

In [None]:
def data_crossval(des, nondes, batch_size=38, random_state=1):
    '''
    Build a single training set for cross-validation.

    All rows of `des` are combined with `batch_size` rows sampled (without
    replacement, seeded by random_state) from `nondes`. Rows that occur more
    than once in the combined frame are dropped entirely. Labels are read
    from the 'output' column.

    Returns X_train, y_train as numpy arrays.
    '''
    nondes_subset = nondes.sample(
        n=batch_size, replace=False, random_state=random_state)

    pooled = (
        pd.concat([des, nondes_subset])
        .drop_duplicates(keep=False)
        .reset_index()
    )

    # 'index' is the column added by reset_index(); it is not a feature.
    feature_frame = pooled.drop(columns=['output', 'index'])

    return np.array(feature_frame), np.array(pooled['output'])


#### Training loop functions
Functions for training different model types.

In [None]:
def train_ab_dt_ef_gb_rf(model, des_df, nondes_df, file_name, folder_type='unclassified', num=100, rand_seed=1000, features=features):
    '''
    This function trains a model on hnum/hlife/merged data.
    Works with models exposing feature_importances_ and predict_proba
    (sklearn GradBoost, AdaBoost, RandomForest, ExtraTreesForest, and
    DecisionTrees). It trains for num loops; each loop draws a new
    train/test split seeded with the loop number, refits the model and logs
    predictions, the classification report, the ROC-AUC score and the
    feature importances. Afterwards the model (as fitted in the last loop)
    is saved with joblib and the ROC-AUC scores are plotted and saved.

    model is the model instance,
    des_df and nondes_df are the DES and non-DES data,
    file_name is the open, writable text file for logs (closed by this
        function once the summary has been written),
    folder_type must be hlife, hnum, or merged; it selects the data
        generator and the output sub-folders,
    num is the number of training loops,
    rand_seed is currently unused and kept for interface compatibility,
    features lists the feature names in training-column order.

    Raises ValueError if folder_type is not hlife, hnum, or merged
    (this includes the default 'unclassified').
    '''
    # Fail fast: an unknown folder_type used to print a hint and then crash
    # at model.fit because no training data had been generated.
    if folder_type in ('hlife', 'hnum'):
        make_split = data_generator
    elif folder_type == 'merged':
        make_split = df_generator
    else:
        raise ValueError(
            f"Check folder_type variable: expected 'hlife', 'hnum' or 'merged', got {folder_type!r}")

    xdate = datetime.datetime.now().strftime("%m-%d-%Y")

    roc_auc_list = []
    folder_name = folder_type
    model_name = str(model).split('(')[0]
    if "XGBClassifier" in model_name:
        model_name = "XGBClassifier"

    file = file_name
    # One counter per feature: number of loops in which it ranked highest.
    coeff = [0] * len(features)
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1

    for x in trange(num):
        print(f"Starting round {x+1}", file=file)
        X_train, y_train, X_test, y_test = make_split(
            des_df, nondes_df, test_sample_size=8, nondes_batch_size=38, random_state=x)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)  # roc_auc_score needs probabilities
        print(f'Prediction probabilities: \n{y_pred_proba}', file=file)
        print(f'Predictions: {y_pred}', file=file)
        print(metrics.classification_report(y_test, y_pred,
            target_names=target_names), file=file)
        roc_auc = metrics.roc_auc_score(y_test, y_pred_proba[:,1])
        print(f"roc_auc_score: {roc_auc}", file=file)
        roc_auc_list.append(roc_auc)

        importances = model.feature_importances_
        print(importances, file=file)

        model_feature_df = pd.DataFrame(
            {'Importance': importances, 'Features': features})
        # argmax returns the first position of the maximum, like list.index did.
        max_coeff_index = int(np.argmax(importances))
        print(
            f'max feature: {importances.max()} at index {max_coeff_index} [{features[max_coeff_index]}]', file=file)
        coeff[max_coeff_index] += 1
        print('\n', file=file)
        print(model_feature_df, file=file)
        print('\n'*2, file=file)

    # save model
    dirmaker(f'./saved-models/{folder_name}/{xdate}')
    dump(model, f'./saved-models/{folder_name}/{xdate}/{model_name}_{folder_name}.joblib')

    # plotting roc_auc score
    fig = plt.figure()
    fig.set_size_inches(12, 10, forward=True)
    fig_ax = fig.add_subplot(1, 1, 1)
    fig_ax.set_xlabel("Number of runs", fontsize=24, weight='bold')
    fig_ax.set_ylabel("ROC-AUC score", fontsize=24, weight='bold')

    ytick = np.arange(0, 1.2, 0.2)
    xtick = np.arange(0, num+2, 10)
    plt.yticks(ytick, fontsize=22, weight='bold')
    plt.xticks(xtick, fontsize=22, weight='bold')
    fig_ax.set_ylim(0, 1.0)
    fig_ax.set_xlim(0, num+2)

    roc = list(range(1, num+1))
    avg_auc = np.average(roc_auc_list)
    std_auc = np.std(roc_auc_list)
    best_auc = max(roc_auc_list)
    print(f"roc_auc scores: {roc_auc_list}", file=file)
    print(
        f"Average roc_auc scores: {avg_auc}", file=file)
    print(
        f"std dev of roc_auc scores: {std_auc}", file=file)
    print(
        f"Best roc_auc score: {best_auc} at index {roc_auc_list.index(best_auc) + 1}", file=file)
    print('\n', file=file)
    coeff_df = pd.DataFrame(
        {'Top Coefficients': coeff, 'Features': features})
    print(f"{coeff_df}", file=file)
    plt.plot(roc, roc_auc_list, '-o', linewidth=2, markersize=8.0, label=f"avg roc_auc: {round(avg_auc,2)}\nstd roc_auc : {round(std_auc,2)}")
    plt.legend(loc='lower left', fontsize=16)
    plt.show()
    file.close()
    dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
    fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/{model_name}_{folder_name}_{num}-{xdate}.tiff',
                dpi=400, facecolor='white', bbox_inches='tight')


In [None]:
def train_lr(model, des_df, nondes_df, file_name, folder_type='unclassified', num=100, rand_seed=1000, features=features, model_name='model'):
    '''
    This function trains a model on hnum/hlife/merged data.
    Works with sklearn Logistic Reg. It trains for num loops; each loop
    draws a new train/test split seeded with the loop number, refits the
    model and logs predictions, the classification report, the ROC-AUC
    score, the coefficients and the intercept. Afterwards the model (as
    fitted in the last loop) is saved with joblib and the ROC-AUC scores
    are plotted and saved.

    model is the model instance,
    des_df and nondes_df are the DES and non-DES data,
    file_name is the open, writable text file for logs (closed by this
        function once the summary has been written),
    folder_type must be hlife, hnum, or merged; it selects the data
        generator and the output sub-folders,
    num is the number of training loops,
    rand_seed is currently unused and kept for interface compatibility,
    features lists the feature names in training-column order,
    model_name is accepted but ignored: the name used in output file names
        is always derived from the model's string representation.

    Raises ValueError if folder_type is not hlife, hnum, or merged
    (this includes the default 'unclassified').
    '''
    # Fail fast: an unknown folder_type used to print a hint and then crash
    # at model.fit because no training data had been generated.
    if folder_type in ('hlife', 'hnum'):
        make_split = data_generator
    elif folder_type == 'merged':
        make_split = df_generator
    else:
        raise ValueError(
            f"Check folder_type variable: expected 'hlife', 'hnum' or 'merged', got {folder_type!r}")

    xdate = datetime.datetime.now().strftime("%m-%d-%Y")

    roc_auc_list = []
    folder_name = folder_type
    # NOTE(review): this overrides the model_name argument; kept as-is so
    # existing output file names do not change.
    model_name = str(model).split('(')[0]
    file = file_name

    # One counter per feature: number of loops in which it had the largest coefficient.
    coeff = [0] * len(features)
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1

    for x in trange(num):
        print(f"Starting round {x+1}", file=file)
        X_train, y_train, X_test, y_test = make_split(
            des_df, nondes_df, test_sample_size=8, nondes_batch_size=38, random_state=x)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)  # roc_auc_score needs probabilities
        print(f'Prediction probabilities: {y_pred_proba}', file=file)
        print(f'Predictions: {y_pred}', file=file)
        print(metrics.classification_report(y_test, y_pred,
            target_names=target_names), file=file)
        roc_auc = metrics.roc_auc_score(y_test, y_pred_proba[:,1])
        print(f"roc_auc_score: {roc_auc}", file=file)
        roc_auc_list.append(roc_auc)
        print(model.coef_, file=file)
        print(f"intercept: {model.intercept_}", file=file)

        coefs = model.coef_.squeeze()
        model_feature_df = pd.DataFrame(
            {'Coefficients': coefs, 'Features': features})
        # First position of the largest (signed) coefficient.
        # NOTE(review): a large negative coefficient is never counted as
        # the top one - confirm that the signed maximum is intended.
        max_coeff_index = int(np.argmax(coefs))
        print(
            f'max coeff: {model.coef_.max()} at index {max_coeff_index} [{features[max_coeff_index]}]', file=file)
        coeff[max_coeff_index] += 1
        print('\n', file=file)
        print(model_feature_df, file=file)
        print('\n'*2, file=file)

    # save model
    dirmaker(f'./saved-models/{folder_name}/{xdate}')
    dump(model, f'./saved-models/{folder_name}/{xdate}/{model_name}_{folder_name}.joblib')

    # plotting roc_auc score
    fig = plt.figure()
    fig.set_size_inches(12, 10, forward=True)
    fig_ax = fig.add_subplot(1, 1, 1)
    fig_ax.set_xlabel("Number of runs", fontsize=24, weight='bold')
    fig_ax.set_ylabel("ROC-AUC score", fontsize=24, weight='bold')

    ytick = np.arange(0, 1.2, 0.2)
    xtick = np.arange(0, num+2, 10)
    plt.yticks(ytick, fontsize=22, weight='bold')
    plt.xticks(xtick, fontsize=22, weight='bold')
    fig_ax.set_ylim(0, 1.0)
    fig_ax.set_xlim(0, num+2)

    roc = list(range(1, num+1))
    avg_auc = np.average(roc_auc_list)
    std_auc = np.std(roc_auc_list)
    best_auc = max(roc_auc_list)
    print(f"roc_auc scores: {roc_auc_list}", file=file)
    print(
        f"Average roc_auc scores: {avg_auc}", file=file)
    print(
        f"std dev of roc_auc scores: {std_auc}", file=file)
    print(
        f"Best roc_auc score: {best_auc} at index {roc_auc_list.index(best_auc) + 1}", file=file)
    print('\n', file=file)
    coeff_df = pd.DataFrame(
        {'Top Coefficients': coeff, 'Features': features})
    print(f"{coeff_df}", file=file)
    plt.plot(roc, roc_auc_list, '-o', linewidth=2, markersize=8.0, label=f"avg roc_auc: {round(avg_auc,2)}\nstd roc_auc : {round(std_auc,2)}")
    plt.legend(loc='lower left', fontsize=16)
    plt.show()
    file.close()
    dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
    fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/{model_name}_{folder_name}_{num}-{xdate}.tiff',
                dpi=400, facecolor='white', bbox_inches='tight')


In [None]:
def train_svc(model, des_df, nondes_df, file_name, folder_type='unclassified', num=100, rand_seed=1000, features=features, model_name='model'):
    '''
    This function trains a model on hnum/hlife/merged data.
    Works with sklearn support vector machine. It trains for num loops;
    each loop draws a new train/test split seeded with the loop number,
    refits the model and logs predictions, the classification report, the
    ROC-AUC score, the coefficients and the intercept. Afterwards the model
    (as fitted in the last loop) is saved with joblib and the ROC-AUC
    scores are plotted and saved.

    model is the model instance; predict_proba is called on it, so
        presumably it must be created with probability=True - confirm,
    des_df and nondes_df are the DES and non-DES data,
    file_name is the open, writable text file for logs (closed by this
        function once the summary has been written),
    folder_type must be hlife, hnum, or merged; it selects the data
        generator and the output sub-folders,
    num is the number of training loops,
    rand_seed is currently unused and kept for interface compatibility,
    features lists the feature names in training-column order,
    model_name is the label used in output file names (also printed).

    Raises ValueError if folder_type is not hlife, hnum, or merged
    (this includes the default 'unclassified').
    '''
    # Fail fast: an unknown folder_type used to print a hint and then crash
    # at model.fit because no training data had been generated.
    if folder_type in ('hlife', 'hnum'):
        make_split = data_generator
    elif folder_type == 'merged':
        make_split = df_generator
    else:
        raise ValueError(
            f"Check folder_type variable: expected 'hlife', 'hnum' or 'merged', got {folder_type!r}")

    xdate = datetime.datetime.now().strftime("%m-%d-%Y")

    roc_auc_list = []
    folder_name = folder_type
    print(model_name)
    file = file_name

    # One counter per feature: number of loops in which it had the largest coefficient.
    coeff = [0] * len(features)
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1

    for x in trange(num):
        print(f"Starting round {x+1}", file=file)
        X_train, y_train, X_test, y_test = make_split(
            des_df, nondes_df, test_sample_size=8, nondes_batch_size=38, random_state=x)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)  # roc_auc_score needs probabilities
        print(f'Prediction probabilities: {y_pred_proba}', file=file)
        print(f'Predictions: {y_pred}', file=file)
        print(metrics.classification_report(y_test, y_pred,
            target_names=target_names), file=file)
        roc_auc = metrics.roc_auc_score(y_test, y_pred_proba[:,1])
        print(f"roc_auc_score: {roc_auc}", file=file)
        roc_auc_list.append(roc_auc)

        # _get_coef() is a private sklearn method; call it once per loop
        # instead of four times.
        # NOTE(review): presumably only meaningful for a linear kernel - confirm.
        raw_coef = model._get_coef()
        print(raw_coef, file=file)
        print(f"intercept: {model.intercept_}", file=file)

        coefs = raw_coef.squeeze()
        model_feature_df = pd.DataFrame(
            {'Coefficients': coefs, 'Features': features})
        # First position of the largest (signed) coefficient.
        max_coeff_index = int(np.argmax(coefs))
        print(
            f'max coeff: {raw_coef.max()} at index {max_coeff_index} [{features[max_coeff_index]}]', file=file)
        coeff[max_coeff_index] += 1
        print('\n', file=file)
        print(model_feature_df, file=file)
        print('\n'*2, file=file)

    # save model
    dirmaker(f'./saved-models/{folder_name}/{xdate}')
    dump(model, f'./saved-models/{folder_name}/{xdate}/{model_name}_{folder_name}.joblib')

    # plotting roc_auc score
    fig = plt.figure()
    fig.set_size_inches(12, 10, forward=True)
    fig_ax = fig.add_subplot(1, 1, 1)
    fig_ax.set_xlabel("Number of runs", fontsize=24, weight='bold')
    fig_ax.set_ylabel("ROC-AUC score", fontsize=24, weight='bold')

    ytick = np.arange(0, 1.2, 0.2)
    xtick = np.arange(0, num+2, 10)
    plt.yticks(ytick, fontsize=22, weight='bold')
    plt.xticks(xtick, fontsize=22, weight='bold')
    fig_ax.set_ylim(0, 1.0)
    fig_ax.set_xlim(0, num+2)

    roc = list(range(1, num+1))
    avg_auc = np.average(roc_auc_list)
    std_auc = np.std(roc_auc_list)
    best_auc = max(roc_auc_list)
    print(f"roc_auc scores: {roc_auc_list}", file=file)
    print(
        f"Average roc_auc scores: {avg_auc}", file=file)
    print(
        f"std dev of roc_auc scores: {std_auc}", file=file)
    print(
        f"Best roc_auc score: {best_auc} at index {roc_auc_list.index(best_auc) + 1}", file=file)
    print('\n', file=file)
    coeff_df = pd.DataFrame(
        {'Top Coefficients': coeff, 'Features': features})
    print(f"{coeff_df}", file=file)
    plt.plot(roc, roc_auc_list, '-o', linewidth=2, markersize=8.0, label=f"avg roc_auc: {round(avg_auc,2)}\nstd roc_auc : {round(std_auc,2)}")
    plt.legend(loc='lower left', fontsize=16)
    plt.show()
    file.close()
    dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
    fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/{model_name}_{folder_name}_{num}-{xdate}.tiff',
                dpi=400, facecolor='white', bbox_inches='tight')


In [None]:
def train_knn(model, des_df, nondes_df, file_name, folder_type='unclassified', num=100, rand_seed=1000, features=features, model_name='model'):
    '''
    This function trains a model on hnum/hlife/merged data.
    Works with sklearn KNN. It trains for num loops; each loop draws a new
    train/test split seeded with the loop number, refits the model and logs
    predictions, the classification report and the ROC-AUC score.
    Afterwards the model (as fitted in the last loop) is saved with joblib
    and the ROC-AUC scores are plotted and saved.

    model is the model instance,
    des_df and nondes_df are the DES and non-DES data,
    file_name is the open, writable text file for logs (closed by this
        function once the summary has been written),
    folder_type must be hlife, hnum, or merged; it selects the data
        generator and the output sub-folders,
    num is the number of training loops,
    rand_seed and features are currently unused and kept for interface
        compatibility,
    model_name is accepted but ignored: the name used in output file names
        is always derived from the model's string representation.

    Raises ValueError if folder_type is not hlife, hnum, or merged
    (this includes the default 'unclassified').
    '''
    # Fail fast: an unknown folder_type used to print a hint and then crash
    # at model.fit because no training data had been generated.
    if folder_type in ('hlife', 'hnum'):
        make_split = data_generator
    elif folder_type == 'merged':
        make_split = df_generator
    else:
        raise ValueError(
            f"Check folder_type variable: expected 'hlife', 'hnum' or 'merged', got {folder_type!r}")

    xdate = datetime.datetime.now().strftime("%m-%d-%Y")

    roc_auc_list = []
    folder_name = folder_type
    # NOTE(review): this overrides the model_name argument; kept as-is so
    # existing output file names do not change.
    model_name = str(model).split('(')[0]
    file = file_name
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1

    for x in trange(num):
        print(f"Starting round {x+1}", file=file)
        X_train, y_train, X_test, y_test = make_split(
            des_df, nondes_df, test_sample_size=8, nondes_batch_size=38, random_state=x)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)  # roc_auc_score needs probabilities
        print(f'Prediction probabilities: {y_pred_proba}', file=file)
        print(f'Predictions: {y_pred}', file=file)
        print(metrics.classification_report(y_test, y_pred,
            target_names=target_names), file=file)
        roc_auc = metrics.roc_auc_score(y_test, y_pred_proba[:,1])
        print(f"roc_auc_score: {roc_auc}", file=file)
        roc_auc_list.append(roc_auc)
        print('\n'*2, file=file)

    # save model
    dirmaker(f'./saved-models/{folder_name}/{xdate}')
    dump(model, f'./saved-models/{folder_name}/{xdate}/{model_name}_{folder_name}.joblib')

    # plotting roc_auc score
    fig = plt.figure()
    fig.set_size_inches(12, 10, forward=True)
    fig_ax = fig.add_subplot(1, 1, 1)
    fig_ax.set_xlabel("Number of runs", fontsize=24, weight='bold')
    fig_ax.set_ylabel("ROC-AUC score", fontsize=24, weight='bold')

    ytick = np.arange(0, 1.2, 0.2)
    xtick = np.arange(0, num+2, 10)
    plt.yticks(ytick, fontsize=22, weight='bold')
    plt.xticks(xtick, fontsize=22, weight='bold')
    fig_ax.set_ylim(0, 1.0)
    fig_ax.set_xlim(0, num+2)

    roc = list(range(1, num+1))
    avg_auc = np.average(roc_auc_list)
    std_auc = np.std(roc_auc_list)
    best_auc = max(roc_auc_list)
    print(f"roc_auc scores: {roc_auc_list}", file=file)
    print(
        f"Average roc_auc scores: {avg_auc}", file=file)
    print(
        f"std dev of roc_auc scores: {std_auc}", file=file)
    print(
        f"Best roc_auc score: {best_auc} at index {roc_auc_list.index(best_auc) + 1}", file=file)
    print('\n', file=file)

    plt.plot(roc, roc_auc_list, '-o', linewidth=2, markersize=8.0, label=f"avg roc_auc: {round(avg_auc,2)}\nstd roc_auc : {round(std_auc,2)}")
    plt.legend(loc='lower left', fontsize=16)
    plt.show()
    file.close()
    dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
    fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/{model_name}_{folder_name}_{num}-{xdate}.tiff',
                dpi=400, facecolor='white', bbox_inches='tight')


In [None]:
def train_xgb(model, des_df, nondes_df, file_name, folder_type='unclassified', num=100, rand_seed=1000, features=features):
    '''
    This function trains a model on hnum/hlife/merged data.
    Works with sklearn XGBClassifier or XGBRFClassifier. It trains for num
    loops; each loop draws a new train/test split seeded with the loop
    number, refits the model and logs predictions, the classification
    report, the ROC-AUC score and the feature importances. Afterwards the
    model (as fitted in the last loop) is saved with joblib and the ROC-AUC
    scores are plotted and saved.

    model is the model instance,
    des_df and nondes_df are the DES and non-DES data,
    file_name is the open, writable text file for logs (closed by this
        function once the summary has been written),
    folder_type must be hlife, hnum, or merged; it selects the data
        generator and the output sub-folders,
    num is the number of training loops,
    rand_seed is currently unused and kept for interface compatibility,
    features lists the feature names in training-column order.

    Raises ValueError if folder_type is not hlife, hnum, or merged
    (this includes the default 'unclassified').
    '''
    # Fail fast: an unknown folder_type used to print a hint and then crash
    # at model.fit because no training data had been generated.
    if folder_type in ('hlife', 'hnum'):
        make_split = data_generator
    elif folder_type == 'merged':
        make_split = df_generator
    else:
        raise ValueError(
            f"Check folder_type variable: expected 'hlife', 'hnum' or 'merged', got {folder_type!r}")

    xdate = datetime.datetime.now().strftime("%m-%d-%Y")
    xgb_roc_auc = []
    folder_name = folder_type
    model_name = str(model).split('(')[0]
    if "XGBClassifier" in model_name:
        model_name = "XGBClassifier"

    file = file_name
    # One counter per feature: number of loops in which it ranked highest.
    coeff = [0] * len(features)
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1

    for x in trange(num):
        print(f"Starting round {x+1}", file=file)
        X_train, y_train, X_test, y_test = make_split(
            des_df, nondes_df, test_sample_size=8, nondes_batch_size=38, random_state=x)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)  # roc_auc_score needs probabilities
        print(f'Prediction probabilities: \n{y_pred_proba}', file=file)
        print(f'Predictions: {y_pred}', file=file)
        print(metrics.classification_report(y_test, y_pred,
            target_names=target_names), file=file)
        roc_auc = metrics.roc_auc_score(y_test, y_pred_proba[:,1])
        print(f"roc_auc_score: {roc_auc}", file=file)
        xgb_roc_auc.append(roc_auc)

        importances = model.feature_importances_
        print(importances, file=file)

        model_feature_df = pd.DataFrame(
            {'Importance': importances, 'Features': features})
        # argmax returns the first position of the maximum, like list.index did.
        max_coeff_index = int(np.argmax(importances))
        print(
            f'max feature: {importances.max()} at index {max_coeff_index} [{features[max_coeff_index]}]', file=file)
        coeff[max_coeff_index] += 1
        print('\n', file=file)
        print(model_feature_df, file=file)
        print('\n'*2, file=file)

    # save model
    dirmaker(f'./saved-models/{folder_name}/{xdate}')
    dump(model, f'./saved-models/{folder_name}/{xdate}/{model_name}_{folder_name}.joblib')

    # plotting roc_auc score
    fig = plt.figure()
    fig.set_size_inches(12, 10, forward=True)
    fig_ax = fig.add_subplot(1, 1, 1)
    fig_ax.set_xlabel("Number of runs", fontsize=24, weight='bold')
    fig_ax.set_ylabel("ROC-AUC score", fontsize=24, weight='bold')

    ytick = np.arange(0, 1.2, 0.2)
    xtick = np.arange(0, num+2, 10)
    plt.yticks(ytick, fontsize=22, weight='bold')
    plt.xticks(xtick, fontsize=22, weight='bold')
    fig_ax.set_ylim(0, 1.0)
    fig_ax.set_xlim(0, num+2)

    roc = list(range(1, num+1))
    avg_auc = np.average(xgb_roc_auc)
    std_auc = np.std(xgb_roc_auc)
    best_auc = max(xgb_roc_auc)
    print(f"roc_auc scores: {xgb_roc_auc}", file=file)
    print(
        f"Average roc_auc scores: {avg_auc}", file=file)
    print(
        f"std dev of roc_auc scores: {std_auc}", file=file)
    print(
        f"Best roc_auc score: {best_auc} at index {xgb_roc_auc.index(best_auc) + 1}", file=file)
    print('\n', file=file)
    coeff_df_xgb = pd.DataFrame({'Top Features': coeff, 'Features': features})
    print(f"{coeff_df_xgb}", file=file)
    file.close()

    plt.plot(roc, xgb_roc_auc, '-o', linewidth=2, markersize=8.0, label=f"avg roc_auc: {round(avg_auc,2)}\nstd roc_auc: {round(std_auc,2)}")

    savedir = f'./plots/roc-auc/{folder_name}/{xdate}'
    plt.legend(loc='lower left', fontsize=16)

    dirmaker(savedir)
    fig.savefig(f'{savedir}/{model_name}_{num}-{xdate}.tiff',
                dpi=400, facecolor='white', bbox_inches='tight')
    plt.show()


In [None]:
def train_xgb_eval(model, des_df, nondes_df, file_name, folder_type='unclassified', num=100, rand_seed=1000, features=features):
    '''
    This function trains a model on hnum/hlife/merged data.
    Works with sklearn XGBClassifier and XGBRFClassifier. It trains
    for num loops; each loop draws a new train/test split seeded with the
    loop number and refits the model with both the training and the test
    set as eval_set. It logs predictions, the classification report, the
    ROC-AUC score, the feature importances and the evaluation scores
    recorded by the model for both sets. Afterwards the model (as fitted in
    the last loop) is saved with joblib and the per-loop training and
    testing evaluation scores are plotted and saved.

    model is the model instance,
    des_df and nondes_df are the DES and non-DES data,
    file_name is the open, writable text file for logs (closed by this
        function once the summary has been written),
    folder_type must be hlife, hnum, or merged; it selects the data
        generator and the output sub-folders,
    num is the number of training loops,
    rand_seed is currently unused and kept for interface compatibility,
    features lists the feature names in training-column order.

    Raises ValueError if folder_type is not hlife, hnum, or merged
    (this includes the default 'unclassified').
    '''
    # Fail fast: an unknown folder_type used to print a hint and then crash
    # at model.fit because no training data had been generated.
    if folder_type in ('hlife', 'hnum'):
        make_split = data_generator
    elif folder_type == 'merged':
        make_split = df_generator
    else:
        raise ValueError(
            f"Check folder_type variable: expected 'hlife', 'hnum' or 'merged', got {folder_type!r}")

    xdate = datetime.datetime.now().strftime("%m-%d-%Y")
    xgb_roc_auc = []
    folder_name = folder_type
    model_name = str(model).split('(')[0]
    train_eval_scores = []
    test_eval_scores = []

    if "XGBClassifier" in model_name:
        model_name = "XGBClassifier"

    file = file_name
    # One counter per feature: number of loops in which it ranked highest.
    coeff = [0] * len(features)
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1

    for x in trange(num):
        print(f"Starting round {x+1}", file=file)
        X_train, y_train, X_test, y_test = make_split(
            des_df, nondes_df, test_sample_size=8, nondes_batch_size=38, random_state=x)

        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (
        X_test, y_test)], verbose=False)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)  # roc_auc_score needs probabilities
        print(f'Prediction probabilities: \n{y_pred_proba}', file=file)
        print(f'Predictions: {y_pred}', file=file)
        print(metrics.classification_report(y_test, y_pred,
            target_names=target_names), file=file)
        roc_auc = metrics.roc_auc_score(y_test, y_pred_proba[:,1])
        print(f"roc_auc_score: {roc_auc}", file=file)
        xgb_roc_auc.append(roc_auc)

        importances = model.feature_importances_
        print(importances, file=file)

        model_feature_df = pd.DataFrame(
            {'Importance': importances, 'Features': features})
        # argmax returns the first position of the maximum, like list.index did.
        max_coeff_index = int(np.argmax(importances))
        print(f'max feature: {importances.max()} at index {max_coeff_index} [{features[max_coeff_index]}]', file=file)
        coeff[max_coeff_index] += 1
        print('\n', file=file)
        print(model_feature_df, file=file)
        print('\n'*2, file=file)

        # evals_result() is keyed by eval_set position: 'validation_0' is the
        # training set, 'validation_1' the test set. Only the first recorded
        # metric of each is used.
        # NOTE(review): the log text calls these "auc" scores; that holds
        # only if the model's eval_metric is auc - confirm.
        eval_history = model.evals_result()
        train_scores = list(eval_history['validation_0'].values())[0]
        print(f'Number of training auc scores: {len(train_scores)}', file=file)
        print(f'auc scores of training set: {train_scores}', file=file)
        print(f'Average and std-dev of auc scores of training set: {round(np.average(train_scores),2)}, \
        {round(np.std(train_scores),2)} \n', file=file)

        # XGBClassifier: keep the mean score; XGBRFClassifier: keep the raw list.
        if "XGBClassifier" in model_name:
            train_eval_scores.append(np.average(train_scores))
        elif "XGBRFClassifier" in model_name:
            train_eval_scores.append(train_scores)

        test_scores = list(eval_history['validation_1'].values())[0]
        print(f'Number of testing auc scores: {len(test_scores)}', file=file)
        print(f'auc scores of testing set: {test_scores}', file=file)
        print(f'Average and std-dev of auc scores of testing set: {round(np.average(test_scores),2)}, \
        {round(np.std(test_scores),2)} \n', file=file)

        if "XGBClassifier" in model_name:
            test_eval_scores.append(np.average(test_scores))
        elif "XGBRFClassifier" in model_name:
            test_eval_scores.append(test_scores)

        print('\n'*2, file=file)

    print('test scores', len(test_eval_scores))
    print('train scores',len(train_eval_scores))

    # save model
    dirmaker(f'./saved-models/{folder_name}/{xdate}')
    dump(model, f'./saved-models/{folder_name}/{xdate}/{model_name}_{folder_name}.joblib')

    # plotting roc_auc score
    fig = plt.figure()
    fig.set_size_inches(12, 10, forward=True)
    fig_ax = fig.add_subplot(1, 1, 1)
    fig_ax.set_xlabel("Number of runs", fontsize=24, weight='bold')
    fig_ax.set_ylabel("ROC-AUC score", fontsize=24, weight='bold')

    ytick = np.arange(0, 1.2, 0.2)
    xtick = np.arange(0, num+2, 10)
    plt.yticks(ytick, fontsize=22, weight='bold')
    plt.xticks(xtick, fontsize=22, weight='bold')
    fig_ax.set_ylim(0, 1.0)
    fig_ax.set_xlim(0, num+2)

    best_auc = max(xgb_roc_auc)
    print(f"roc_auc scores: {xgb_roc_auc}", file=file)
    print(
        f"Average roc_auc scores: {np.average(xgb_roc_auc)}", file=file)
    print(
        f"std dev of roc_auc scores: {np.std(xgb_roc_auc)}", file=file)
    print(
        f"Best roc_auc score: {best_auc} at index {xgb_roc_auc.index(best_auc) + 1}", file=file)
    print('\n', file=file)
    coeff_df_xgb = pd.DataFrame({'Top Features': coeff, 'Features': features})
    print(f"{coeff_df_xgb}", file=file)
    file.close()

    fig_ax.plot(range(1, len(train_eval_scores)+1), train_eval_scores, '-o', linewidth=2, markersize=8.0, label=f"avg training roc_auc: {round(np.average(train_eval_scores),2)}\nstd training  roc_auc: {round(np.std(train_eval_scores),2)}")

    fig_ax.plot(range(1, len(test_eval_scores)+1), test_eval_scores, '-o', linewidth=2, markersize=8.0, label=f"avg testing roc_auc:  {round(np.average(test_eval_scores),2)}\nstd testing  roc_auc:  {round(np.std(test_eval_scores),2)}")

    savedir = f'./plots/roc-auc/{folder_name}/{xdate}'
    plt.legend(loc='best', fontsize=16)

    dirmaker(savedir)
    fig.savefig(f'{savedir}/{model_name}_eval_{num}-{xdate}.tiff',
                dpi=400, facecolor='white', bbox_inches='tight')
    plt.show()

### Grid search functions

#### kingmaker_xgb function

In [None]:

def kingmaker_xgb(xgb_model, p_grid, des_df, nondes_df, cv, file_name, folder_type='unclassified', n_repeat=10, rand_seed=1000, features=features):
    '''
    Run a grid-search CV on an XGB classifier, log the outcome to file_name
    and plot the evaluation scores recorded by the refitted best estimator.

    Args:
        xgb_model: classifier instance handed to GridSearchCV; its fit() must
            accept the xgboost-style eval_set / verbose keyword arguments.
        p_grid: dictionary of values for the grid search.
        des_df, nondes_df: DES / non-DES data, passed to data_generator
            (hlife, hnum) or df_generator (merged) to build the train/test
            split.
        cv: cross-validation instance passed to GridSearchCV.
        file_name: open, writable text file that receives the logs. It is
            closed by this function just before the figure is saved.
        folder_type: 'hlife', 'hnum' or 'merged'; selects the data splitter,
            the plot title and the output sub-folder.
        n_repeat, rand_seed: only used to tag the saved figure's file name;
            neither is forwarded to the grid search or to the data split.
        features: feature names, one per column of the training data (pass
            the 10-name list, e.g. features_merged, for 'merged').

    Returns:
        None. Results are written to file_name, stdout and a PNG under
        plots/roc-auc/<folder_type>/<date>/.

    Raises:
        ValueError: if folder_type is not 'hlife', 'hnum' or 'merged'.
    '''
    title_tags = {'hlife': 'lifetimes', 'hnum': 'numbers',
                  'merged': 'numbers + lifetimes'}
    folder_name = folder_type
    if folder_name not in title_tags:
        # Without a supported folder_type no train/test split can be built,
        # so fail here instead of later on an undefined X_train.
        raise ValueError(
            f"Check folder_type variable: got {folder_type!r}, "
            "expected 'hlife', 'hnum' or 'merged'")
    title_tag = title_tags[folder_name]
    xdate = datetime.datetime.now().strftime("%m-%d-%Y")

    def log(*args):
        # All log lines go to the caller-supplied log file.
        print(*args, file=file_name)

    # 'merged' data is split by df_generator, single-descriptor data by data_generator.
    make_split = df_generator if folder_name == 'merged' else data_generator
    X_train, y_train, X_test, y_test = make_split(
        des_df, nondes_df, test_sample_size=8, nondes_batch_size=38)

    xgb_clf = GridSearchCV(xgb_model, param_grid=p_grid,
                           verbose=1, scoring='roc_auc', cv=cv, refit=True)

    # eval_set is forwarded to the estimator's fit(); the scores recorded by
    # the refitted best estimator are read back through evals_result() below.
    # NOTE(review): early_stopping_rounds is not passed, so whether
    # best_score / best_iteration are populated depends on the installed
    # xgboost version - confirm.
    search = xgb_clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (
        X_test, y_test)], verbose=False)
    best = search.best_estimator_
    xgb_roc_auc_grid = []

    # One "was the top feature" counter per feature name.
    coeff_xgb_grid = [0] * len(features)

    log('Best score', search.best_score_)
    log('Best params', search.best_params_)
    y_pred_xgb_proba = best.predict_proba(X_test)  # roc_auc_score needs probabilities
    print(y_pred_xgb_proba)
    y_pred_xgb = best.predict(X_test)  # hard class labels for the classification report
    print(y_pred_xgb)
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
    log(metrics.classification_report(y_test, y_pred_xgb,
                                      target_names=target_names))
    roc_auc_xgb = metrics.roc_auc_score(y_test, y_pred_xgb_proba[:, 1])
    log(f"roc_auc_score: {roc_auc_xgb}")
    xgb_roc_auc_grid.append(roc_auc_xgb)

    importances = best.feature_importances_
    log(importances)

    xgb_model_feature_df = pd.DataFrame(
        {'Importance': importances, 'features': features})
    max_coeff_index = int(np.argmax(importances))  # first occurrence of the maximum
    log(f'max feature: {importances.max()} at index {max_coeff_index} [{features[max_coeff_index]}]')
    coeff_xgb_grid[max_coeff_index] += 1
    log('\n')
    log(xgb_model_feature_df)
    log('\n'*2)

    log(f"roc_auc scores on test set: {xgb_roc_auc_grid}")
    log(f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_grid)}")
    log(f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_grid)}")
    log(f"Best roc_auc score on test set: {max(xgb_roc_auc_grid)} at index {xgb_roc_auc_grid.index(max(xgb_roc_auc_grid)) + 1}")
    log(f"Best model's roc_auc score from early stopping: {best.best_score}")
    log(f"Best model's iteration from early stopping: {best.best_iteration}")

    # evals_result() maps 'validation_<i>' -> {metric: [score per iteration]};
    # only the first metric of each eval set is used. The logs call it "auc",
    # which presumes the model's eval_metric is AUC - verify against caller.
    train_scores = list(best.evals_result()['validation_0'].items())[0][1]
    train_avg = round(np.average(train_scores), 2)
    train_std = round(np.std(train_scores), 2)
    log(f'Number of training auc scores: {len(train_scores)}')
    log(f'auc scores of training set: {train_scores}')
    # Spacing inside the message is kept exactly as the log format has it.
    log(f'Average and std-dev of auc scores of training set: {train_avg}, '
        f'    {train_std} \n')

    val_scores = list(best.evals_result()['validation_1'].items())[0][1]
    val_avg = round(np.average(val_scores), 2)
    val_std = round(np.std(val_scores), 2)
    log(f'Number of testing auc scores: {len(val_scores)}')
    log(f'auc scores of testing set: {val_scores}')
    log(f'Average and std-dev of auc scores of testing set: {val_avg}, '
        f'    {val_std} \n')

    log('\n')
    coeff_df_xgb = pd.DataFrame(
        {'Top features': coeff_xgb_grid, 'features': features})
    log(f"Coefficients: {coeff_df_xgb} \n")

    log(f'Best estimator: {best} \n')
    log(f'Best params: {search.best_params_} \n')
    log(f"Best estimator's score from early stopping: {best.best_score} \n")

    # plotting roc_auc score
    fig = plt.figure()
    fig_ax = fig.add_subplot(1, 1, 1)
    fig.set_size_inches(12, 8, forward=True)
    fig_ax.set_xlabel("Number of runs")
    fig_ax.set_ylabel("Cross-validation ROC-AUC score")
    fig_ax.set_ylim(0, 1.1)

    plt.title(
        f'XGBoost (Repeated KFold) hbond {title_tag}', fontsize=12, weight='bold')

    fig_ax.plot(range(1, len(train_scores)+1), train_scores, '-o', linewidth=2, markersize=8.0,
                label=f"avg training roc_auc: {train_avg}\nstd training roc_auc : {train_std}")

    fig_ax.plot(range(1, len(val_scores)+1), val_scores, '-o', linewidth=2, markersize=8.0,
                label=f"avg testing roc_auc: {val_avg}\nstd testing roc_auc : {val_std}")

    plt.legend(loc='best')
    file_name.close()
    dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
    fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/XGB_{folder_name}_{n_repeat}grid-{xdate}_{rand_seed}.png',
                dpi=500, facecolor='white', bbox_inches='tight')

    # plot_importance orders the bars by score and leaves out features that
    # were never used, so the names have to be resolved per tick rather than
    # assigned in input order.
    importance_ax = xgb.plot_importance(best)
    tick_names = []
    for tick in importance_ax.get_yticklabels():
        text = tick.get_text()
        # xgboost names columns 'f<index>' when the training data carried no
        # column names; translate those back through `features`.
        if text[:1] == 'f' and text[1:].isdigit() and int(text[1:]) < len(features):
            text = features[int(text[1:])]
        tick_names.append(text)
    importance_ax.set_yticklabels(tick_names)
    plt.show()


In [None]:

def kingmaker_xgbrf(xgb_model, p_grid, des_df, nondes_df, cv, file_name, folder_type='unclassified', n_repeat=10,
                    rand_seed=1000, features=features, model_name="XGBRFClassifier"):
    '''
    Run a grid-search CV on an XGBoost random-forest style classifier, log
    the outcome to file_name and plot the evaluation scores recorded by the
    refitted best estimator.

    Args:
        xgb_model: classifier instance handed to GridSearchCV; its fit() must
            accept the xgboost-style eval_set / verbose keyword arguments.
        p_grid: dictionary of values for the grid search.
        des_df, nondes_df: DES / non-DES data, passed to data_generator
            (hlife, hnum) or df_generator (merged) to build the train/test
            split.
        cv: cross-validation instance passed to GridSearchCV.
        file_name: open, writable text file that receives the logs. It is
            closed by this function just before the figure is saved.
        folder_type: 'hlife', 'hnum' or 'merged'; selects the data splitter,
            the plot title and the output sub-folder.
        n_repeat, rand_seed: only used to tag the saved figure's file name;
            neither is forwarded to the grid search or to the data split.
        features: feature names, one per column of the training data (pass
            the 10-name list, e.g. features_merged, for 'merged').
        model_name: label used in the plot title and the figure file name.

    Returns:
        None. Results are written to file_name, stdout and a PNG under
        plots/roc-auc/<folder_type>/<date>/.

    Raises:
        ValueError: if folder_type is not 'hlife', 'hnum' or 'merged'.
    '''
    title_tags = {'hlife': 'lifetimes', 'hnum': 'numbers',
                  'merged': 'numbers + lifetimes'}
    folder_name = folder_type
    if folder_name not in title_tags:
        # Without a supported folder_type no train/test split can be built,
        # so fail here instead of later on an undefined X_train.
        raise ValueError(
            f"Check folder_type variable: got {folder_type!r}, "
            "expected 'hlife', 'hnum' or 'merged'")
    title_tag = title_tags[folder_name]
    xdate = datetime.datetime.now().strftime("%m-%d-%Y")

    def log(*args):
        # All log lines go to the caller-supplied log file.
        print(*args, file=file_name)

    # 'merged' data is split by df_generator, single-descriptor data by data_generator.
    make_split = df_generator if folder_name == 'merged' else data_generator
    X_train, y_train, X_test, y_test = make_split(
        des_df, nondes_df, test_sample_size=8, nondes_batch_size=38)

    xgb_clf = GridSearchCV(xgb_model, param_grid=p_grid,
                           verbose=1, scoring='roc_auc', cv=cv, refit=True)

    # eval_set is forwarded to the estimator's fit(); the scores recorded by
    # the refitted best estimator are read back through evals_result() below.
    search = xgb_clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (
        X_test, y_test)], verbose=False)
    best = search.best_estimator_
    xgb_roc_auc_grid = []

    # One "was the top feature" counter per feature name.
    coeff_xgb_grid = [0] * len(features)

    log('Best score', search.best_score_)
    log('Best params', search.best_params_)
    y_pred_xgb_proba = best.predict_proba(X_test)  # roc_auc_score needs probabilities
    print(y_pred_xgb_proba)
    y_pred_xgb = best.predict(X_test)  # hard class labels for the classification report
    print(y_pred_xgb)
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
    log(metrics.classification_report(y_test, y_pred_xgb,
                                      target_names=target_names))
    roc_auc_xgb = metrics.roc_auc_score(y_test, y_pred_xgb_proba[:, 1])
    log(f"roc_auc_score: {roc_auc_xgb}")
    xgb_roc_auc_grid.append(roc_auc_xgb)

    importances = best.feature_importances_
    log(importances)

    xgb_model_feature_df = pd.DataFrame(
        {'Importance': importances, 'features': features})
    max_coeff_index = int(np.argmax(importances))  # first occurrence of the maximum
    log(f'max feature: {importances.max()} at index {max_coeff_index} [{features[max_coeff_index]}]')
    coeff_xgb_grid[max_coeff_index] += 1
    log('\n')
    log(xgb_model_feature_df)
    log('\n'*2)

    log(f"roc_auc scores on test set: {xgb_roc_auc_grid}")
    log(f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_grid)}")
    log(f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_grid)}")
    log(f"Best roc_auc score on test set: {max(xgb_roc_auc_grid)} at index {xgb_roc_auc_grid.index(max(xgb_roc_auc_grid)) + 1}")

    # evals_result() maps 'validation_<i>' -> {metric: [score per iteration]};
    # only the first metric of each eval set is used. The logs call it "auc",
    # which presumes the model's eval_metric is AUC - verify against caller.
    train_scores = list(best.evals_result()['validation_0'].items())[0][1]
    train_avg = round(np.average(train_scores), 2)
    train_std = round(np.std(train_scores), 2)
    log(f'Number of training auc scores: {len(train_scores)}')
    log(f'auc scores of training set: {train_scores}')
    # Spacing inside the message is kept exactly as the log format has it.
    log(f'Average and std-dev of auc scores of training set: {train_avg}, '
        f'    {train_std} \n')

    val_scores = list(best.evals_result()['validation_1'].items())[0][1]
    val_avg = round(np.average(val_scores), 2)
    val_std = round(np.std(val_scores), 2)
    log(f'Number of testing auc scores: {len(val_scores)}')
    log(f'auc scores of testing set: {val_scores}')
    log(f'Average and std-dev of auc scores of testing set: {val_avg}, '
        f'    {val_std} \n')

    log('\n')
    coeff_df_xgb = pd.DataFrame(
        {'Top features': coeff_xgb_grid, 'features': features})
    log(f"Coefficients: {coeff_df_xgb} \n")

    log(f'Best estimator: {best} \n')
    log(f'Best params: {search.best_params_} \n')

    # plotting roc_auc score
    fig = plt.figure()
    fig_ax = fig.add_subplot(1, 1, 1)
    fig.set_size_inches(12, 8, forward=True)
    fig_ax.set_xlabel("Number of runs")
    fig_ax.set_ylabel("Cross-validation ROC-AUC score")
    fig_ax.set_ylim(0, 1.1)

    plt.title(
        f'{model_name} (Repeated KFold) hbond {title_tag}', fontsize=12, weight='bold')

    # Legend text (including its inner spacing) is kept as-is so rendered
    # figures stay comparable with earlier runs.
    fig_ax.plot(range(1, len(train_scores)+1), train_scores, '-o', linewidth=2, markersize=8.0,
                label=f"avg training roc_auc: {train_avg}\n "
                      f"    std training roc_auc : {train_std}")

    fig_ax.plot(range(1, len(val_scores)+1), val_scores, '-o', linewidth=2, markersize=8.0,
                label=f"avg testing roc_auc: {val_avg}\n "
                      f"    std testing roc_auc : {val_std}")

    plt.legend(loc='best')
    file_name.close()
    dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
    fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/{model_name}_{folder_name}_{n_repeat}grid-{xdate}_{rand_seed}.png',
                dpi=500, facecolor='white', bbox_inches='tight')

    # plot_importance orders the bars by score and leaves out features that
    # were never used, so the names have to be resolved per tick rather than
    # assigned in input order.
    importance_ax = xgb.plot_importance(best)
    tick_names = []
    for tick in importance_ax.get_yticklabels():
        text = tick.get_text()
        # xgboost names columns 'f<index>' when the training data carried no
        # column names; translate those back through `features`.
        if text[:1] == 'f' and text[1:].isdigit() and int(text[1:]) < len(features):
            text = features[int(text[1:])]
        tick_names.append(text)
    importance_ax.set_yticklabels(tick_names)
    plt.show()


#### kingmaker_generic function

In [None]:

def kingmaker_generic(model, p_grid, des_df, nondes_df, cv, file_name, folder_type='unclassified', n_repeat=10, rand_seed=1000, features=features):
    '''
    Run a grid-search CV on an sklearn tree-based classifier (GradBoost,
    AdaBoost, RandomForest, ExtraTreesForest, DecisionTrees), log the outcome
    to file_name and plot the ROC-AUC of the best estimator on the test set.

    The best estimator must expose predict_proba and feature_importances_.

    Args:
        model: classifier instance handed to GridSearchCV.
        p_grid: dictionary of values for the grid search.
        des_df, nondes_df: DES / non-DES data, passed to data_generator
            (hlife, hnum) or df_generator (merged) to build the train/test
            split.
        cv: cross-validation instance passed to GridSearchCV.
        file_name: open, writable text file that receives the logs. It is
            closed by this function just before the figure is saved.
        folder_type: 'hlife', 'hnum' or 'merged'; selects the data splitter,
            the plot title and the output sub-folder.
        n_repeat, rand_seed: only used to tag the saved figure's file name;
            neither is forwarded to the grid search or to the data split.
        features: feature names, one per column of the training data (pass
            the 10-name list, e.g. features_merged, for 'merged').

    Returns:
        None. Results are written to file_name, stdout and a PNG under
        plots/roc-auc/<folder_type>/<date>/.

    Raises:
        ValueError: if folder_type is not 'hlife', 'hnum' or 'merged'.
    '''
    title_tags = {'hlife': 'lifetimes', 'hnum': 'numbers',
                  'merged': 'numbers + lifetimes'}
    folder_name = folder_type
    if folder_name not in title_tags:
        # Without a supported folder_type no train/test split can be built,
        # so fail here instead of later on an undefined X_train.
        raise ValueError(
            f"Check folder_type variable: got {folder_type!r}, "
            "expected 'hlife', 'hnum' or 'merged'")
    title_tag = title_tags[folder_name]
    xdate = datetime.datetime.now().strftime("%m-%d-%Y")

    # Keep only the class name from the estimator repr; stripping just the
    # outer parentheses would leave the constructor arguments in the plot
    # title and in the output file name.
    model_name = str(model).split('(')[0]
    if "XGBClassifier" in model_name:
        model_name = "XGBClassifier"

    def log(*args):
        # All log lines go to the caller-supplied log file.
        print(*args, file=file_name)

    # 'merged' data is split by df_generator, single-descriptor data by data_generator.
    make_split = df_generator if folder_name == 'merged' else data_generator
    X_train, y_train, X_test, y_test = make_split(
        des_df, nondes_df, test_sample_size=8, nondes_batch_size=38)

    clf = GridSearchCV(model, param_grid=p_grid,
                       verbose=1, scoring='roc_auc', cv=cv, refit=True, return_train_score=True)
    search = clf.fit(X_train, y_train)
    best = search.best_estimator_
    roc_auc_grid = []

    # take the most relevant columns and sort (for readability)
    results = pd.DataFrame(search.cv_results_)
    results = results.loc[:, ['rank_test_score', 'mean_test_score', 'params']]
    results = results.sort_values(by='rank_test_score', ascending=True)
    print(results.head(1))
    log(results)
    log('\n'*2)

    log('Best score', search.best_score_)
    log('Best params', search.best_params_)

    y_test_pred_proba = best.predict_proba(X_test)  # roc_auc_score needs probabilities
    log(f'Prediction probabilities: {y_test_pred_proba}')

    y_test_pred = best.predict(X_test)  # hard class labels for the classification report
    log(f'Predictions: {y_test_pred}')
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
    log(metrics.classification_report(y_test, y_test_pred,
                                      target_names=target_names))
    roc_auc = metrics.roc_auc_score(y_test, y_test_pred_proba[:, 1])
    log(f"roc_auc_score: {roc_auc}")
    roc_auc_grid.append(roc_auc)

    importances = best.feature_importances_
    log(importances)

    model_feature_df = pd.DataFrame(
        {'Importance': importances, 'features': features})
    max_coeff_index = int(np.argmax(importances))  # first occurrence of the maximum
    log(f'max feature: {importances.max()} at index {max_coeff_index} [{features[max_coeff_index]}]')
    log('\n')
    log(model_feature_df)
    log('\n'*2)

    log(f"roc_auc scores on test set: {roc_auc_grid}")
    log(f"Average roc_auc scores on test set: {np.average(roc_auc_grid)}")
    log(f"std dev of roc_auc scores on test set: {np.std(roc_auc_grid)}")
    log(f"Best roc_auc score on test set: {max(roc_auc_grid)} at index {roc_auc_grid.index(max(roc_auc_grid)) + 1}")

    log('\n')

    log(f'Best estimator: {best} \n')
    log(f'Best params: {search.best_params_} \n')
    print(f'Best estimator: {best} \n')
    print(f'Best params: {search.best_params_} \n')
    log('\n')
    log(search.cv_results_)

    # plotting roc_auc score
    fig = plt.figure()
    fig_ax = fig.add_subplot(1, 1, 1)
    fig.set_size_inches(12, 8, forward=True)
    fig_ax.set_xlabel("Number of runs")
    fig_ax.set_ylabel("Cross-validation ROC-AUC score")
    fig_ax.set_ylim(0, 1.1)

    plt.title(
        f'{model_name} (Repeated KFold) hbond {title_tag}', fontsize=12, weight='bold')

    # roc_auc_grid holds the score on the held-out test set, so the legend
    # is labelled as testing.
    fig_ax.plot(range(1, len(roc_auc_grid)+1), roc_auc_grid, '-o', linewidth=2, markersize=8.0,
                label=f"avg testing roc_auc: {round(np.average(roc_auc_grid), 2)}\n "
                      f"    std testing roc_auc : {round(np.std(roc_auc_grid), 2)}")

    plt.legend(loc='best')
    file_name.close()
    dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
    fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/{model_name}_{folder_name}_{n_repeat}grid-{xdate}_{rand_seed}.png',
                dpi=500, facecolor='white', bbox_inches='tight')

    plt.show()


In [None]:

def kingmaker_svc(model, p_grid, des_df, nondes_df, cv, file_name, folder_type='unclassified', n_repeat=10, rand_seed=1000, features=features):
    '''
    Run a grid-search CV on an sklearn support-vector classifier, log the
    outcome to file_name and plot the ROC-AUC of the best estimator on the
    test set.

    The best estimator must expose predict_proba, _get_coef() and intercept_.

    Args:
        model: classifier instance handed to GridSearchCV.
        p_grid: dictionary of values for the grid search.
        des_df, nondes_df: DES / non-DES data, passed to data_generator
            (hlife, hnum) or df_generator (merged) to build the train/test
            split.
        cv: cross-validation instance passed to GridSearchCV.
        file_name: open, writable text file that receives the logs. It is
            closed by this function just before the figure is saved.
        folder_type: 'hlife', 'hnum' or 'merged'; selects the data splitter,
            the plot title and the output sub-folder.
        n_repeat, rand_seed: only used to tag the saved figure's file name;
            neither is forwarded to the grid search or to the data split.
        features: feature names, one per column of the training data (pass
            the 10-name list, e.g. features_merged, for 'merged').

    Returns:
        None. Results are written to file_name, stdout and a PNG under
        plots/roc-auc/<folder_type>/<date>/.

    Raises:
        ValueError: if folder_type is not 'hlife', 'hnum' or 'merged'.
    '''
    title_tags = {'hlife': 'lifetimes', 'hnum': 'numbers',
                  'merged': 'numbers + lifetimes'}
    folder_name = folder_type
    if folder_name not in title_tags:
        # Without a supported folder_type no train/test split can be built,
        # so fail here instead of later on an undefined X_train.
        raise ValueError(
            f"Check folder_type variable: got {folder_type!r}, "
            "expected 'hlife', 'hnum' or 'merged'")
    title_tag = title_tags[folder_name]
    xdate = datetime.datetime.now().strftime("%m-%d-%Y")

    # Class name only: everything before the first '(' of the estimator repr.
    model_name = str(model).split('(')[0]
    if "XGBClassifier" in model_name:
        model_name = "XGBClassifier"

    def log(*args):
        # All log lines go to the caller-supplied log file.
        print(*args, file=file_name)

    # 'merged' data is split by df_generator, single-descriptor data by data_generator.
    make_split = df_generator if folder_name == 'merged' else data_generator
    X_train, y_train, X_test, y_test = make_split(
        des_df, nondes_df, test_sample_size=8, nondes_batch_size=38)

    clf = GridSearchCV(model, param_grid=p_grid,
                       verbose=1, scoring='roc_auc', cv=cv, refit=True, return_train_score=True)
    search = clf.fit(X_train, y_train)
    best = search.best_estimator_
    roc_auc_grid = []

    # One "was the top feature" counter per feature name.
    coeff_grid = [0] * len(features)

    print(search.cv_results_)
    # take the most relevant columns and sort (for readability)
    results = pd.DataFrame(search.cv_results_)
    results = results.loc[:, ['rank_test_score', 'mean_test_score', 'params']]
    results = results.sort_values(by='rank_test_score', ascending=True)
    print(results.head(1))
    log(results)
    log('\n'*2)

    log('Best score', search.best_score_)
    log('Best params', search.best_params_)

    y_test_pred_proba = best.predict_proba(X_test)  # roc_auc_score needs probabilities
    log(f'Prediction probabilities: {y_test_pred_proba}')

    y_test_pred = best.predict(X_test)  # hard class labels for the classification report
    log(f'Predictions: {y_test_pred}')
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
    log(metrics.classification_report(y_test, y_test_pred,
                                      target_names=target_names))
    roc_auc = metrics.roc_auc_score(y_test, y_test_pred_proba[:, 1])
    log(f"roc_auc_score: {roc_auc}")
    roc_auc_grid.append(roc_auc)

    # Fetch the coefficient matrix once and reuse it for every report below.
    coef = best._get_coef()
    log(coef)
    log(f"intercept: {best.intercept_}")

    flat_coef = coef.squeeze()
    model_feature_df = pd.DataFrame(
        {'Importance': flat_coef, 'features': features})
    # NOTE(review): this picks the largest signed coefficient, not the
    # largest magnitude - confirm that is the intended notion of "max feature".
    max_coeff_index = int(np.argmax(flat_coef))
    log(f'max feature: {coef.max()} at index {max_coeff_index} [{features[max_coeff_index]}]')
    coeff_grid[max_coeff_index] += 1
    log('\n')
    log(model_feature_df)
    log('\n'*2)

    log(f"roc_auc scores on test set: {roc_auc_grid}")
    log(f"Average roc_auc scores on test set: {np.average(roc_auc_grid)}")
    log(f"std dev of roc_auc scores on test set: {np.std(roc_auc_grid)}")
    log(f"Best roc_auc score on test set: {max(roc_auc_grid)} at index {roc_auc_grid.index(max(roc_auc_grid)) + 1}")

    log('\n')
    coeff_df = pd.DataFrame(
        {'Top features': coeff_grid, 'features': features})
    log(f"Coefficients: {coeff_df} \n")

    log(f'Best estimator: {best} \n')
    log(f'Best params: {search.best_params_} \n')
    print(f'Best estimator: {best} \n')
    print(f'Best params: {search.best_params_} \n')

    fig = plt.figure()
    fig_ax = fig.add_subplot(1, 1, 1)
    fig.set_size_inches(12, 8, forward=True)
    fig_ax.set_xlabel("Number of runs")
    fig_ax.set_ylabel("Cross-validation ROC-AUC score")
    fig_ax.set_ylim(0, 1.1)

    plt.title(
        f'{model_name} (Repeated KFold) hbond {title_tag}', fontsize=12, weight='bold')

    # roc_auc_grid holds the score on the held-out test set, so the legend
    # is labelled as testing.
    fig_ax.plot(range(1, len(roc_auc_grid)+1), roc_auc_grid, '-o', linewidth=2, markersize=8.0,
                label=f"avg testing roc_auc: {round(np.average(roc_auc_grid), 2)}\n "
                      f"    std testing roc_auc : {round(np.std(roc_auc_grid), 2)}")

    plt.legend(loc='best')
    file_name.close()
    dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
    fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/{model_name}_{folder_name}_{n_repeat}grid-{xdate}_{rand_seed}.png',
                dpi=500, facecolor='white', bbox_inches='tight')

    plt.show()


In [None]:

def kingmaker_knn(model, p_grid, des_df, nondes_df, cv, file_name, folder_type='unclassified', n_repeat=10, rand_seed=1000, features=features):
    '''
    Run a grid-search CV on an sklearn KNN classifier, log the outcome to
    file_name and plot the ROC-AUC of the best estimator on the test set.

    No per-feature importances are reported for this model.

    Args:
        model: classifier instance handed to GridSearchCV; the best estimator
            must expose predict_proba.
        p_grid: dictionary of values for the grid search.
        des_df, nondes_df: DES / non-DES data, passed to data_generator
            (hlife, hnum) or df_generator (merged) to build the train/test
            split.
        cv: cross-validation instance passed to GridSearchCV.
        file_name: open, writable text file that receives the logs. It is
            closed by this function just before the figure is saved.
        folder_type: 'hlife', 'hnum' or 'merged'; selects the data splitter,
            the plot title and the output sub-folder.
        n_repeat, rand_seed: only used to tag the saved figure's file name;
            neither is forwarded to the grid search or to the data split.
        features: accepted for signature parity with the other kingmaker_*
            functions; not used here.

    Returns:
        None. Results are written to file_name, stdout and a PNG under
        plots/roc-auc/<folder_type>/<date>/.

    Raises:
        ValueError: if folder_type is not 'hlife', 'hnum' or 'merged'.
    '''
    title_tags = {'hlife': 'lifetimes', 'hnum': 'numbers',
                  'merged': 'numbers + lifetimes'}
    folder_name = folder_type
    if folder_name not in title_tags:
        # Without a supported folder_type no train/test split can be built,
        # so fail here instead of later on an undefined X_train.
        raise ValueError(
            f"Check folder_type variable: got {folder_type!r}, "
            "expected 'hlife', 'hnum' or 'merged'")
    title_tag = title_tags[folder_name]
    xdate = datetime.datetime.now().strftime("%m-%d-%Y")

    # Class name only: everything before the first '(' of the estimator repr.
    model_name = str(model).split('(')[0]
    if "XGBClassifier" in model_name:
        model_name = "XGBClassifier"

    def log(*args):
        # All log lines go to the caller-supplied log file.
        print(*args, file=file_name)

    # 'merged' data is split by df_generator, single-descriptor data by data_generator.
    make_split = df_generator if folder_name == 'merged' else data_generator
    X_train, y_train, X_test, y_test = make_split(
        des_df, nondes_df, test_sample_size=8, nondes_batch_size=38)

    clf = GridSearchCV(model, param_grid=p_grid,
                       verbose=1, scoring='roc_auc', cv=cv, refit=True, return_train_score=True)
    search = clf.fit(X_train, y_train)
    best = search.best_estimator_
    roc_auc_grid = []

    print(search.cv_results_)
    # take the most relevant columns and sort (for readability)
    results = pd.DataFrame(search.cv_results_)
    results = results.loc[:, ['rank_test_score', 'mean_test_score', 'params']]
    results = results.sort_values(by='rank_test_score', ascending=True)
    print(results.head(1))
    log(results)
    log('\n'*2)

    log('Best score', search.best_score_)
    log('Best params', search.best_params_)

    y_test_pred_proba = best.predict_proba(X_test)  # roc_auc_score needs probabilities
    log(f'Prediction probabilities: {y_test_pred_proba}')

    y_test_pred = best.predict(X_test)  # hard class labels for the classification report
    log(f'Predictions: {y_test_pred}')
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
    log(metrics.classification_report(y_test, y_test_pred,
                                      target_names=target_names))
    roc_auc = metrics.roc_auc_score(y_test, y_test_pred_proba[:, 1])
    log(f"roc_auc_score: {roc_auc}")
    roc_auc_grid.append(roc_auc)

    log(f"roc_auc scores on test set: {roc_auc_grid}")
    log(f"Average roc_auc scores on test set: {np.average(roc_auc_grid)}")
    log(f"std dev of roc_auc scores on test set: {np.std(roc_auc_grid)}")
    log(f"Best roc_auc score on test set: {max(roc_auc_grid)} at index {roc_auc_grid.index(max(roc_auc_grid)) + 1}")

    log('\n')

    log(f'Best estimator: {best} \n')
    log(f'Best params: {search.best_params_} \n')
    print(f'Best estimator: {best} \n')
    print(f'Best params: {search.best_params_} \n')

    fig = plt.figure()
    fig_ax = fig.add_subplot(1, 1, 1)
    fig.set_size_inches(12, 8, forward=True)
    fig_ax.set_xlabel("Number of runs")
    fig_ax.set_ylabel("Cross-validation ROC-AUC score")
    fig_ax.set_ylim(0, 1.1)

    plt.title(
        f'{model_name} (Repeated KFold) hbond {title_tag}', fontsize=12, weight='bold')

    # roc_auc_grid holds the score on the held-out test set, so the legend
    # is labelled as testing.
    fig_ax.plot(range(1, len(roc_auc_grid)+1), roc_auc_grid, '-o', linewidth=2, markersize=8.0,
                label=f"avg testing roc_auc: {round(np.average(roc_auc_grid), 2)}\n "
                      f"    std testing roc_auc : {round(np.std(roc_auc_grid), 2)}")

    plt.legend(loc='best')
    file_name.close()
    dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
    fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/{model_name}_{folder_name}_{n_repeat}grid-{xdate}_{rand_seed}.png',
                dpi=500, facecolor='white', bbox_inches='tight')

    plt.show()


In [None]:

def kingmaker_lr(model, p_grid, des_df, nondes_df, cv, file_name, folder_type='unclassified', n_repeat=10, rand_seed=1000, features=features):
    '''
    Grid-search an sklearn Logistic Regression, log the outcome and plot the
    ROC-AUC of the refitted best estimator on the held-out test split.

    model is the classifier instance,
    p_grid is the dictionary of values for the grid search,
    des_df and nondes_df are the DES / non-DES dataframes handed to
    data_generator (hlife, hnum) or df_generator (merged),
    cv is the cross-validation instance,
    file_name is the open, writable txt file for logs; it is always closed
    before this function returns or raises,
    folder_type must be hlife, hnum or merged; any other value (including the
    'unclassified' default) raises ValueError because no data set can be built,
    n_repeat and rand_seed only label the saved plot file (the repeats and the
    seed that drive the CV live in the cv instance),
    features is the list of feature names, one per fitted coefficient.

    Raises ValueError for an unsupported folder_type or when features does not
    have exactly one entry per fitted coefficient.
    '''
    folder_name = folder_type
    xdate = datetime.datetime.now().strftime("%m-%d-%Y")
    # Keep only the class name (text before the first "(") so the plot file
    # name stays valid when the estimator carries non-default arguments.
    model_name = str(model).split('(')[0]

    # Plot-title suffix for each supported data set.
    title_tags = {
        'hlife': 'lifetimes',
        'hnum': 'numbers',
        'merged': 'numbers + lifetimes',
    }

    try:
        if folder_name in ('hlife', 'hnum'):
            X_train, y_train, X_test, y_test = data_generator(
                des_df, nondes_df, test_sample_size=8, nondes_batch_size=38)
        elif folder_name == 'merged':
            X_train, y_train, X_test, y_test = df_generator(
                des_df, nondes_df, test_sample_size=8, nondes_batch_size=38)
        else:
            # Without a data set there is nothing to fit; fail with a clear
            # message instead of a NameError further down.
            raise ValueError(
                f"Check folder_type variable: expected 'hlife', 'hnum' or 'merged', got {folder_name!r}")

        clf = GridSearchCV(model, param_grid=p_grid,
                           verbose=1, scoring='roc_auc', cv=cv, refit=True, return_train_score=True)

        model = clf.fit(X_train, y_train)
        best_estimator = model.best_estimator_
        roc_auc_grid = []

        print(model.cv_results_)
        results = pd.DataFrame(model.cv_results_)

        # take the most relevant columns and sort (for readability)
        results = results[['rank_test_score', 'mean_test_score', 'params']].sort_values(
            by='rank_test_score', ascending=True)
        print(results.head(1))
        print(results, file=file_name)
        print('\n'*2, file=file_name)

        print('Best score', model.best_score_, file=file_name)
        print('Best params', model.best_params_, file=file_name)

        # roc_auc_score needs probabilities
        y_test_pred_proba = best_estimator.predict_proba(X_test)
        print(f'Prediction probabilities: {y_test_pred_proba}', file=file_name)

        y_test_pred = best_estimator.predict(X_test)
        print(f'Predictions: {y_test_pred}', file=file_name)
        target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
        print(metrics.classification_report(y_test, y_test_pred,
              target_names=target_names), file=file_name)
        roc_auc = metrics.roc_auc_score(y_test, y_test_pred_proba[:, 1])
        print(f"roc_auc_score: {roc_auc}", file=file_name)
        roc_auc_grid.append(roc_auc)
        print(best_estimator.coef_, file=file_name)
        print(f"intercept: {best_estimator.intercept_}", file=file_name)

        # Flatten the coefficient matrix to one value per feature.
        coefficients = np.ravel(best_estimator.coef_)
        if len(features) != len(coefficients):
            raise ValueError(
                f"features has {len(features)} names but the model has "
                f"{len(coefficients)} coefficients; pass the feature list that matches the data set")

        # One counter per feature; the feature with the largest coefficient gets a vote.
        coeff_grid = [0] * len(coefficients)
        model_feature_df = pd.DataFrame(
            {'Importance': coefficients, 'features': features})
        max_coeff_index = int(np.argmax(coefficients))
        print(
            f'max feature: {best_estimator.coef_.max()} at index {max_coeff_index} [{features[max_coeff_index]}]', file=file_name)
        coeff_grid[max_coeff_index] += 1
        print('\n', file=file_name)
        print(model_feature_df, file=file_name)
        print('\n'*2, file=file_name)

        print(f"roc_auc scores on test set: {roc_auc_grid}", file=file_name)
        print(
            f"Average roc_auc scores on test set: {np.average(roc_auc_grid)}", file=file_name)
        print(
            f"std dev of roc_auc scores on test set: {np.std(roc_auc_grid)}", file=file_name)
        print(
            f"Best roc_auc score on test set: {max(roc_auc_grid)} at index {roc_auc_grid.index(max(roc_auc_grid)) + 1}", file=file_name)

        print('\n', file=file_name)
        coeff_df = pd.DataFrame(
            {'Top features': coeff_grid, 'features': features})

        print(f"Coefficients: {coeff_df} \n", file=file_name)

        print(f'Best estimator: {best_estimator} \n', file=file_name)
        print(f'Best params: {model.best_params_} \n', file=file_name)
        print(f'Best estimator: {best_estimator} \n')
        print(f'Best params: {model.best_params_} \n')

        fig = plt.figure()
        fig_ax = fig.add_subplot(1, 1, 1)
        fig.set_size_inches(12, 8, forward=True)
        fig_ax.set_xlabel("Number of runs")
        fig_ax.set_ylabel("Cross-validation ROC-AUC score")
        fig_ax.set_ylim(0, 1.1)

        title_tag = title_tags[folder_name]
        plt.title(
            f'{model_name} (Repeated KFold) hbond {title_tag}', fontsize=12, weight='bold')

        # NOTE(review): the plotted values are ROC-AUC scores on the held-out
        # test split although the legend says "training"; the wording is left
        # untouched so figures stay comparable with the other kingmaker_* plots.
        legend_label = (
            f"avg training roc_auc: {round(np.average(roc_auc_grid),2)}\n "
            f"    std training roc_auc : {round(np.std(roc_auc_grid),2)}")
        fig_ax.plot(range(1, len(roc_auc_grid)+1), roc_auc_grid, '-o',
                    linewidth=2, markersize=8.0, label=legend_label)

        plt.legend(loc='best')
        dirmaker(f'./plots/roc-auc/{folder_name}/{xdate}')
        fig.savefig(f'plots/roc-auc/{folder_name}/{xdate}/{model_name}_{folder_name}_{n_repeat}grid-{xdate}_{rand_seed}.png',
                    dpi=500, facecolor='white', bbox_inches='tight')

        plt.show()
    finally:
        # Release the caller's log handle on success and on failure alike.
        file_name.close()


## hbond lifetimes
Train models using hlife data alone.

#### hlife house cleaning

In [None]:
# Drop the 'Non-DES' column and label every non-DES row with class 0.
nondes_df_hlife = nondes_hlife.drop(columns=['Non-DES']).assign(output=0)
# nondes_df_hlife

In [None]:
# Drop the 'DES' column and label every DES row with class 1.
des_df_hlife = des_hlife.drop(columns=['DES']).assign(output=1)
# des_df_hlife

## Training loops

#### XGB

In [None]:
# XGBClassifier: repeated training runs on the hbond-lifetime data.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
    'axes.linewidth':2.0,
})

models = {
    "XGB": XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.1,
              enable_categorical=False, eval_metric='auc', gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=2,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=10, n_jobs=12, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='hist',
              validate_parameters=1, verbosity=None
)
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hlife'

tag = "XGB"
model_type = models[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_type).split('(')[0]

print(f"{tag} training for {n_repeat} runs")

if "XGBClassifier" in model_name:
    model_name = "XGBClassifier"

dirmaker(f'./model-logs/training/{folder_type}/{xdate}')
# The context manager closes the log even when training raises; closing a
# handle that the training function already closed is a harmless no-op.
with open(
        f"./model-logs/training/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    train_xgb(model_type, des_df_hlife, nondes_df_hlife,
              file_name=file_name, folder_type=folder_type, num=n_repeat, rand_seed=rand_seed, features=features)


#### XGBRF

In [None]:

# XGBRF: repeated training runs on the hbond-lifetime data.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
    'axes.linewidth':2.0,
})

models = {
    "XGBRF": XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=0.4,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric='auc', feature_types=None, gamma=None, gpu_id=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.01, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=2, max_leaves=None,
                min_child_weight=None, monotone_constraints=None,
                n_estimators=10, n_jobs=12, num_parallel_tree=None,
                objective='binary:logistic', predictor=None, random_state=None)
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hlife'

tag = "XGBRF"
model_type = models[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_type).split('(')[0]

print(f"{tag} training for {n_repeat} runs")

if "XGBClassifier" in model_name:
    model_name = "XGBClassifier"

dirmaker(f'./model-logs/training/{folder_type}/{xdate}')
# The context manager closes the log even when training raises; closing a
# handle that the training function already closed is a harmless no-op.
with open(
        f"./model-logs/training/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    train_xgb(model_type, des_df_hlife, nondes_df_hlife,
              file_name=file_name, folder_type=folder_type, num=n_repeat, rand_seed=rand_seed, features=features)


#### XGB and XGBRF eval

In [None]:
# XGB / XGBRF evaluation run: both models are defined here and `tag` below
# selects the one that is actually trained (currently XGBRF).

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
    'axes.linewidth':2.0,
})

models = {
    "XGB": XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.1,
              enable_categorical=False, eval_metric='auc', gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=2,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=10, n_jobs=12, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='hist',
              validate_parameters=1, verbosity=None),
     "XGBRF": XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=0.4,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric='auc', feature_types=None, gamma=None, gpu_id=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.01, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=2, max_leaves=None,
                min_child_weight=None, monotone_constraints=None,
                n_estimators=10, n_jobs=12, num_parallel_tree=None,
                objective='binary:logistic', predictor=None, random_state=None)
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hlife'

tag = "XGBRF"
model_type = models[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_type).split('(')[0]

print(f"{tag} training for {n_repeat} runs")

if "XGBClassifier" in model_name:
    model_name = "XGBClassifier"

dirmaker(f'./model-logs/training/{folder_type}/{xdate}')
# The context manager closes the log even when training raises; closing a
# handle that the training function already closed is a harmless no-op.
with open(
        f"./model-logs/training/{folder_type}/{xdate}/{model_name}_eval_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    # Use this for getting validation metrics plotted alongside training metrics
    train_xgb_eval(model_type, des_df_hlife, nondes_df_hlife,
                   file_name=file_name, folder_type=folder_type, num=n_repeat, rand_seed=rand_seed, features=features)


In [None]:
# KNN old: repeated training runs with a fixed n_neighbors=15 configuration.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

models = {
    "KNN": KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=15,
                     weights='distance')
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hlife'

tag = "KNN"
model_type = models[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_type).split('(')[0]
print(model_name)
print(f"{tag} training for {n_repeat} runs")

dirmaker(f'./model-logs/training/{folder_type}/{xdate}')
# The context manager closes the log even when training raises; closing a
# handle that the training function already closed is a harmless no-op.
with open(
        f"./model-logs/training/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    train_knn(model_type, des_df_hlife, nondes_df_hlife,
              file_name=file_name, folder_type=folder_type, num=n_repeat, rand_seed=rand_seed, features=features, model_name=model_name)

#### Logistic Regression

In [None]:
# Logistic Regression: repeated training runs on the hbond-lifetime data.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

models = {
    "LR": LogisticRegression(max_iter=400)
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hlife'
# cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)

tag = "LR"
model_type = models[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_type).split('(')[0]
print(f"{tag} training for {n_repeat} runs")

dirmaker(f'./model-logs/training/{folder_type}/{xdate}')
# The context manager closes the log even when training raises; closing a
# handle that the training function already closed is a harmless no-op.
with open(
        f"./model-logs/training/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    train_lr(model_type, des_df_hlife, nondes_df_hlife,
             file_name=file_name, folder_type=folder_type, num=n_repeat, rand_seed=rand_seed, features=features, model_name=model_name)

#### AB, DT, EF, GB, RF

In [None]:
# AB, DT, EF, GB, RF: train every model in `models` in turn, one log file each.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

models = {
    "RF": RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=3,
                       min_samples_split=4, n_estimators=30),
    "EF": ExtraTreesClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=3,
                     min_samples_split=6, n_estimators=20),
    "GB": GradientBoostingClassifier(learning_rate=0.01, max_depth=4, max_features='log2',
                           n_estimators=20),
    "AB": AdaBoostClassifier(learning_rate=0.01, n_estimators=80),
    "DT": DecisionTreeClassifier(max_depth=10, max_features='log2', min_samples_leaf=3,
                       min_samples_split=6)
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hlife'

# The log directory does not depend on the model, so create it once.
dirmaker(f'./model-logs/training/{folder_type}/{xdate}')

for tag, model_type in models.items():
    # Class name only (text before the first "("); it is used in the log file name.
    model_name = str(model_type).split('(')[0]

    print(f"{tag} training for {n_repeat} runs")

    if "XGBClassifier" in model_name:
        model_name = "XGBClassifier"

    # The context manager closes each log even when training raises, so a
    # failing model does not leave its handle open for the rest of the session.
    with open(
            f"./model-logs/training/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
        train_ab_dt_ef_gb_rf(model_type, des_df_hlife, nondes_df_hlife,
                             file_name=file_name, folder_type=folder_type, num=n_repeat, rand_seed=rand_seed, features=features)

### KNN

In [None]:
# KNN: repeated training runs on the hbond-lifetime data.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

models = {
    "KNN": KNeighborsClassifier(metric='manhattan', n_jobs=-1, weights='distance')
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hlife'

tag = "KNN"
model_type = models[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_type).split('(')[0]
print(model_name)
print(f"{tag} training for {n_repeat} runs")

dirmaker(f'./model-logs/training/{folder_type}/{xdate}')
# The context manager closes the log even when training raises; closing a
# handle that the training function already closed is a harmless no-op.
with open(
        f"./model-logs/training/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    train_knn(model_type, des_df_hlife, nondes_df_hlife,
              file_name=file_name, folder_type=folder_type, num=n_repeat, rand_seed=rand_seed, features=features, model_name=model_name)

### SVC

In [None]:
# SVC: repeated training runs on the hbond-lifetime data.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

models = {
    "SVC": SVC(kernel='linear', probability=True)
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hlife'

tag = "SVC"
model_type = models[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_type).split('(')[0]
print(model_name)
print(f"{tag} training for {n_repeat} runs")

dirmaker(f'./model-logs/training/{folder_type}/{xdate}')
# The context manager closes the log even when training raises; closing a
# handle that the training function already closed is a harmless no-op.
with open(
        f"./model-logs/training/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    train_svc(model_type, des_df_hlife, nondes_df_hlife,
              file_name=file_name, folder_type=folder_type, num=n_repeat, rand_seed=rand_seed, features=features, model_name=model_name)

## Hyperparam optimisation

#### XGB

In [None]:

# XGB: grid search over tree depth, ensemble size, learning rate and column
# subsampling, scored with repeated K-fold CV.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 300.0,
    'axes.linewidth':2.0,
})

models = {
     "XGB": XGBClassifier(
            tree_method='hist',
            use_label_encoder=False,
            eval_metric='auc',
            objective='binary:logistic',
            n_jobs=multiprocessing.cpu_count())
}

params = { 'XGB':{'max_depth': [2, 4, 6, 8, 10],
             'n_estimators': [10, 20, 30, 40, 50, 100],
             'learning_rate': [0.01, 0.1],
             'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
             }
}

print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "XGB"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_generic).split('(')[0]
folder_type = 'hlife'

dirmaker(f'./model-logs/gridsearch/{folder_type}/{xdate}')
# The context manager closes the log even when the search raises; closing a
# handle that the search function already closed is a harmless no-op.
with open(
        f"./model-logs/gridsearch/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_hlife_xgb_repeatK:
    kingmaker_xgb(model_generic, p_repeatK, des_df_hlife, nondes_df_hlife, cv=cv,
                  file_name=file_hlife_xgb_repeatK, folder_type=folder_type, n_repeat=n_repeat, rand_seed=rand_seed)


#### XGBRF

In [None]:
# XGBRF: grid search scored with repeated K-fold CV.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 300.0,
    'axes.linewidth':2.0,
})

models = {
    "XGBRF": XGBRFClassifier(
            tree_method='hist',
            eval_metric='auc',
            objective='binary:logistic',
            n_jobs=multiprocessing.cpu_count())
}

params = {
           'XGBRF':{'max_depth': [2, 4, 6, 8, 10],
             'n_estimators': [10, 20, 30, 40, 50, 100],
             'learning_rate': [0.01, 0.1],
             'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
             }
}

print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "XGBRF"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_generic).split('(')[0]
print(model_name)
folder_type = 'hlife'

dirmaker(f'./model-logs/gridsearch/{folder_type}/{xdate}')
# The context manager closes the log even when the search raises; closing a
# handle that the search function already closed is a harmless no-op.
with open(
        f"./model-logs/gridsearch/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_hlife_xgb_repeatK:
    kingmaker_xgbrf(model_generic, p_repeatK, des_df_hlife, nondes_df_hlife, cv=cv, features=features,
                    file_name=file_hlife_xgb_repeatK, folder_type=folder_type, n_repeat=n_repeat, rand_seed=rand_seed)


#### LR

In [None]:
# LR: grid search over penalty, tolerance, iteration cap and solver.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

models = {
    "LR": LogisticRegression()
}

# NOTE(review): the grid is the full cross product, so it contains
# penalty/solver pairs that presumably cannot be fitted together; confirm
# that the failed fits are acceptable.
params = {
    "LR": {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
            'tol': [1e-4, 1e-6],
             'max_iter': [100, 200, 300, 400, 500],
             'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
             }
}


xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "LR"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Class name only (text before the first "("), as in the other grid-search
# cells; stripping parentheses would leave the arguments in the file name as
# soon as the estimator is built with any.
model_name = str(model_generic).split('(')[0]
folder_type = 'hlife'

# NOTE(review): the XGB, XGBRF, KNN and SVC grid-search cells log under
# ./model-logs/gridsearch/...; confirm whether this different location is intended.
dirmaker(f'./model-logs/{folder_type}/{xdate}')
# The context manager closes the log even when the search raises; closing a
# handle that kingmaker_lr already closed is a harmless no-op.
with open(
        f"./model-logs/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    kingmaker_lr(model_generic, p_repeatK, des_df_hlife, nondes_df_hlife, cv=cv,
                 file_name=file_name, folder_type=folder_type, n_repeat=n_repeat, rand_seed=rand_seed)




#### KNN

In [None]:
# KNN: grid search over neighbour count, weighting, metric and leaf size.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

models = {
    "KNN": KNeighborsClassifier(n_jobs=-1)
}

params = {
    "KNN": {'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan'],
               'leaf_size' : [30, 40, 50, 60]
               }
}


xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "KNN"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_generic).split('(')[0]
folder_type = 'hlife'

dirmaker(f'./model-logs/gridsearch/{folder_type}/{xdate}')
# The context manager closes the log even when the search raises; closing a
# handle that the search function already closed is a harmless no-op.
with open(
        f"./model-logs/gridsearch/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    kingmaker_knn(model_generic, p_repeatK, des_df_hlife, nondes_df_hlife, cv=cv,
                  file_name=file_name, folder_type=folder_type, n_repeat=n_repeat, rand_seed=rand_seed)


#### SVC

In [None]:
# SVC: grid search over gamma, tolerance, polynomial degree and kernel.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

# Only "SVC" is selected by `tag` below; "SVC1" has no entry in `params`.
models = {
    "SVC": SVC(probability=True),
    "SVC1": SVC(probability=True)
}

# 'gamma': ['scale', 'auto'],

params = {
    "SVC": {'gamma': ['scale', 'auto'],
            'tol': [1e-3, 1e-4, 1e-5, 1e-6],
             'degree': [3, 5, 7],
             'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']
             }
}


xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "SVC"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Class name only (text before the first "("); it is used in the log file name.
model_name = str(model_generic).split('(')[0]
folder_type = 'hlife'

dirmaker(f'./model-logs/gridsearch/{folder_type}/{xdate}')
# The context manager closes the log even when the search raises; closing a
# handle that the search function already closed is a harmless no-op.
with open(
        f"./model-logs/gridsearch/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    kingmaker_svc(model_generic, p_repeatK, des_df_hlife, nondes_df_hlife, cv=cv,
                  file_name=file_name, folder_type=folder_type, n_repeat=n_repeat, rand_seed=rand_seed)


#### The others

In [None]:

# RF / EF / GB / AB / DT: grid search for the model selected by `tag`.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

# NOTE: "XGB" has no entry in `params` below, so selecting tag = "XGB" in this
# cell fails with a KeyError; XGB has its own grid-search cell.
models = {
    "RF": RandomForestClassifier(),
    "EF": ExtraTreesClassifier(),
    "GB": GradientBoostingClassifier(),
    "AB": AdaBoostClassifier(),
    "DT": DecisionTreeClassifier(),
    "XGB": XGBClassifier(),
}

params = {
    "RF": {'max_depth': [2, 4, 6, 8, 10],
             'n_estimators': [10, 20, 30, 40, 50, 100],
             'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [2,4,6],
             'max_features': ["auto", "sqrt", "log2"],
             },
    "EF": {'max_depth': [2, 4, 6, 8, 10],
             'n_estimators': [10, 20, 30, 40, 50, 100],
             'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [2,4,6],
             'max_features': ["auto", "sqrt", "log2"],
             },
    "GB": {'max_depth': [2, 4, 6, 8, 10],
             'n_estimators': [10, 20, 30, 40, 50, 100],
             'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [2,4,6],
             'max_features': ["auto", "sqrt", "log2"],
             'learning_rate': [0.01, 0.1],
             },
    "AB": {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
             'learning_rate': [0.001, 0.01, 0.1, 1.0],
             },
    "DT": {'max_depth': [2, 4, 6, 8, 10],
             'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [2,4,6],
             'max_features': ["auto", "sqrt", "log2"],
             }
}


xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "RF"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Class name only (text before the first "("), as in the other grid-search
# cells; stripping parentheses would leave the arguments in the file name as
# soon as the estimator is built with any.
model_name = str(model_generic).split('(')[0]
folder_type = 'hlife'

if "XGBClassifier" in model_name:
    model_name = "XGBClassifier"

# NOTE(review): the XGB, XGBRF, KNN and SVC grid-search cells log under
# ./model-logs/gridsearch/...; confirm whether this different location is intended.
dirmaker(f'./model-logs/{folder_type}/{xdate}')
# The context manager closes the log even when the search raises; closing a
# handle that the search function already closed is a harmless no-op.
with open(
        f"./model-logs/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+") as file_name:
    kingmaker_generic(model_generic, p_repeatK, des_df_hlife, nondes_df_hlife, cv=cv,
                      file_name=file_name, folder_type=folder_type, n_repeat=n_repeat, rand_seed=rand_seed)


### XGBoost with Repeated KFold

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.metrics import roc_auc_score

# XGBoost grid search with repeated K-fold CV on the hbond-lifetime data.
# Logs the best model, its test-set metrics and the per-iteration AUC curves,
# then plots those curves and the feature importances.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})  # 'figure.figsize': [14.0, 10.0],


print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
xgb_roc_auc_repeatK = []
# One counter per feature; the most important feature gets a vote.
coeff_xgb_repeatK = [0] * len(features)
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=5, n_repeats=n_repeat, random_state=rand_seed)

dirmaker(f'./model-logs/hlife/{xdate}')
file_hlife_xgb_repeatK = open(
    f"./model-logs/hlife/{xdate}/XGB_hlife_{n_repeat}repeatKFold_{xdate}_{rand_seed}.txt", "w+")

xgb_class_hliferepeatK = XGBClassifier(
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='auc',
    objective='binary:logistic',
    n_jobs=multiprocessing.cpu_count() // 2)  # tree_method='gpu_hist', enable_categorical=True, predictor='gpu_predictor',

p_repeatK = {'max_depth': [2, 4, 6, 8, 10],
             'n_estimators': [10, 20, 30, 40, 50, 100],
             'learning_rate': [0.01, 0.1],
             # ,0.6,0.7,0.8,0.9,1.0
             'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
             }


X_train_hlife, y_train_hlife, X_test_hlife, y_test_hlife = data_generator(
    des_df_hlife, nondes_df_hlife, test_sample_size=8, nondes_batch_size=38)

xgb_clf = GridSearchCV(xgb_class_hliferepeatK,
                       param_grid=p_repeatK, verbose=1, scoring='roc_auc', cv=cv)

# NOTE(review): the test split is the last entry of eval_set, so it presumably
# drives early stopping for every candidate fit and the test scores reported
# below are then not fully independent of model selection. Confirm whether a
# separate validation split should be used here.
xgb_class_hlife_repeatK = xgb_clf.fit(X_train_hlife, y_train_hlife, eval_set=[(X_train_hlife, y_train_hlife), (
    X_test_hlife, y_test_hlife)], verbose=False, early_stopping_rounds=50)  # , early_stopping_rounds=50

best_xgb = xgb_class_hlife_repeatK.best_estimator_

print('Best score', xgb_class_hlife_repeatK.best_score_,
      file=file_hlife_xgb_repeatK)
print('Best params', xgb_class_hlife_repeatK.best_params_,
      file=file_hlife_xgb_repeatK)
y_pred_hlife_xgb = best_xgb.predict(X_test_hlife)
target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
print(metrics.classification_report(y_test_hlife, y_pred_hlife_xgb,
      target_names=target_names), file=file_hlife_xgb_repeatK)
# roc_auc_score needs a continuous score: use the predicted probability of the
# positive class (DES) rather than the hard 0/1 labels, which would collapse
# the ROC curve to a single operating point.
y_proba_hlife_xgb = best_xgb.predict_proba(X_test_hlife)[:, 1]
roc_auc_xgb = metrics.roc_auc_score(y_test_hlife, y_proba_hlife_xgb)
print(f"roc_auc_score: {roc_auc_xgb}", file=file_hlife_xgb_repeatK)
xgb_roc_auc_repeatK.append(roc_auc_xgb)
print(best_xgb.feature_importances_,
      file=file_hlife_xgb_repeatK)
# print(f"intercept: {xgb_class_hlife_repeatK.intercept_}", file=file_hlife_xgb_repeatK)

xgb_class_hlife_repeatK_feature_df = pd.DataFrame(
    {'Importance': best_xgb.feature_importances_, 'features': features})
max_coeff_index = list(best_xgb.feature_importances_).index(
    best_xgb.feature_importances_.max())
print(
    f'max feature: {best_xgb.feature_importances_.max()} at index {max_coeff_index} [{features[max_coeff_index]}]', file=file_hlife_xgb_repeatK)
coeff_xgb_repeatK[max_coeff_index] += 1
print('\n', file=file_hlife_xgb_repeatK)
print(xgb_class_hlife_repeatK_feature_df, file=file_hlife_xgb_repeatK)
print('\n'*2, file=file_hlife_xgb_repeatK)

# roc = [ z for z in range(1,num+1)]
print(
    f"roc_auc scores on test set: {xgb_roc_auc_repeatK}", file=file_hlife_xgb_repeatK)
print(
    f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_repeatK)}", file=file_hlife_xgb_repeatK)
print(
    f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_repeatK)}", file=file_hlife_xgb_repeatK)
print(
    f"Best roc_auc score on test set: {max(xgb_roc_auc_repeatK)} at index {xgb_roc_auc_repeatK.index(max(xgb_roc_auc_repeatK)) + 1}", file=file_hlife_xgb_repeatK)
print(
    f"Best model's roc_auc score from early stopping: {best_xgb.best_score}", file=file_hlife_xgb_repeatK)
print(
    f"Best model's iteration from early stopping: {best_xgb.best_iteration}", file=file_hlife_xgb_repeatK)
# print(f"model's eval_results: {xgb_class_hlife_repeatK.best_estimator_.evals_result()}", file=file_hlife_xgb_repeatK)
# validation_0 is the first eval_set entry (training split); each item is a
# (metric name, per-iteration scores) pair.
train_eval = list(best_xgb.evals_result()[
                  'validation_0'].items())
print(
    f'Number of training auc scores: {len(train_eval[0][1])}', file=file_hlife_xgb_repeatK)
print(
    f'auc scores of training set: {train_eval[0][1]}', file=file_hlife_xgb_repeatK)
print(f'Average and std-dev of auc scores of training set: {round(np.average(train_eval[0][1]),2)}, \
{round(np.std(train_eval[0][1]),2)} \n', file=file_hlife_xgb_repeatK)

# validation_1 is the second eval_set entry (test split).
val_eval = list(best_xgb.evals_result()[
                'validation_1'].items())
print(
    f'Number of testing auc scores: {len(val_eval[0][1])}', file=file_hlife_xgb_repeatK)
print(
    f'auc scores of testing set: {val_eval[0][1]}', file=file_hlife_xgb_repeatK)
print(f'Average and std-dev of auc scores of testing set: {round(np.average(val_eval[0][1]),2)}, \
{round(np.std(val_eval[0][1]),2)} \n', file=file_hlife_xgb_repeatK)

print('\n', file=file_hlife_xgb_repeatK)
coeff_df_xgb = pd.DataFrame(
    {'Top features': coeff_xgb_repeatK, 'features': features})
print(f"Coefficients: {coeff_df_xgb} \n", file=file_hlife_xgb_repeatK)

print(f'Best estimator: {best_xgb} \n',
      file=file_hlife_xgb_repeatK)
print(f'Best params: {xgb_class_hlife_repeatK.best_params_} \n',
      file=file_hlife_xgb_repeatK)
print(
    f"Best estimator's score from early stopping: {best_xgb.best_score} \n", file=file_hlife_xgb_repeatK)
# plotting roc_auc score
fig = plt.figure()
fig_ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(12, 8, forward=True)
fig_ax.set_xlabel("Number of runs")
fig_ax.set_ylabel("Cross-validation ROC-AUC score")
fig_ax.set_ylim(0, 1.1)
plt.title('XGBoost (Repeated KFold) hbond lifetime',
          fontsize=12, weight='bold')

fig_ax.plot(range(1, len(train_eval[0][1])+1), train_eval[0][1], '-o', linewidth=2, markersize=8.0, label=f"avg training roc_auc: {round(np.average(train_eval[0][1]),2)}\n \
std training roc_auc : {round(np.std(train_eval[0][1]),2)}")

fig_ax.plot(range(1, len(val_eval[0][1])+1), val_eval[0][1], '-o', linewidth=2, markersize=8.0, label=f"avg testing roc_auc: {round(np.average(val_eval[0][1]),2)}\n \
std testing roc_auc : {round(np.std(val_eval[0][1]),2)}")

plt.legend(loc='best')
file_hlife_xgb_repeatK.close()
dirmaker(f'./plots/roc-auc/hlife/{xdate}')
fig.savefig(f'plots/roc-auc/hlife/{xdate}/XGB_hlife_{n_repeat}repeatK-{xdate}_{rand_seed}.png',
            dpi=500, facecolor='white', bbox_inches='tight')

# plot_importance orders the bars by score and leaves out features the model
# never split on, so overwriting the tick labels with `features` in its
# original order would mislabel the bars. Translate each existing tick label
# instead: default names "f0", "f1", ... map to features[0], features[1], ...,
# and labels that are already real column names are kept unchanged.
importance_ax = xgb.plot_importance(best_xgb)
default_feature_names = {f'f{i}': name for i, name in enumerate(features)}
importance_ax.set_yticklabels(
    [default_feature_names.get(tick.get_text(), tick.get_text())
     for tick in importance_ax.get_yticklabels()])
plt.show()


### XGBoost with Repeated stratified KFold

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.metrics import roc_auc_score

# Grid-search XGBClassifier hyper-parameters on the hbond-lifetime features
# with repeated stratified K-fold cross-validation, write a text report for the
# refit best estimator, and plot its per-round AUC on the two eval sets.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})  # 'figure.figsize': [14.0, 10.0],


print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
xgb_roc_auc_stratK = []
# One counter per entry of `features`: how often it was the top-ranked feature.
coeff_xgb_stratK = [0, 0, 0, 0, 0]
rand_seed = 1000
n_repeat = 10
cv = RepeatedStratifiedKFold(
    n_splits=10, n_repeats=n_repeat, random_state=rand_seed)

xgb_class_hlifestratK = XGBClassifier(
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='auc',
    objective='binary:logistic',
    n_jobs=-1)  # tree_method='gpu_hist', enable_categorical=True, predictor='gpu_predictor',

p_stratK = {'max_depth': [2, 4, 6, 8, 10],
            'n_estimators': [10, 20, 30, 40, 50, 100],
            'learning_rate': [0.01, 0.1],
            # ,0.6,0.7,0.8,0.9,1.0
            'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
            }


X_train_hlife, y_train_hlife, X_test_hlife, y_test_hlife = data_generator(
    des_df_hlife, nondes_df_hlife, test_sample_size=8, nondes_batch_size=38)

xgb_clf = GridSearchCV(xgb_class_hlifestratK,
                       param_grid=p_stratK, verbose=1, scoring='roc_auc', cv=cv)

dirmaker(f'./model-logs/hlife/{xdate}')
# The context manager closes the log even if the search or a reporting step
# raises; the file is still created before the (long) search starts.
with open(
        f"./model-logs/hlife/{xdate}/XGB_hlife_{n_repeat}stratKFold_{xdate}_{rand_seed}.txt",
        "w+") as file_hlife_xgb_stratK:
    # eval_set[0] is the training data, eval_set[1] the held-out test data;
    # they show up as 'validation_0' / 'validation_1' in evals_result().
    xgb_class_hlife_stratK = xgb_clf.fit(
        X_train_hlife, y_train_hlife,
        eval_set=[(X_train_hlife, y_train_hlife),
                  (X_test_hlife, y_test_hlife)],
        verbose=False, early_stopping_rounds=50)
    best_model_stratK = xgb_class_hlife_stratK.best_estimator_

    print('Best score', xgb_class_hlife_stratK.best_score_,
          file=file_hlife_xgb_stratK)
    print('Best params', xgb_class_hlife_stratK.best_params_,
          file=file_hlife_xgb_stratK)
    y_pred_hlife_xgb = best_model_stratK.predict(X_test_hlife)
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
    print(metrics.classification_report(y_test_hlife, y_pred_hlife_xgb,
          target_names=target_names), file=file_hlife_xgb_stratK)
    # NOTE(review): this AUC is computed from hard class predictions, not from
    # predict_proba scores -- confirm that is the intended test metric.
    roc_auc_xgb = metrics.roc_auc_score(y_test_hlife, y_pred_hlife_xgb)
    print(f"roc_auc_score: {roc_auc_xgb}", file=file_hlife_xgb_stratK)
    xgb_roc_auc_stratK.append(roc_auc_xgb)

    importances_stratK = best_model_stratK.feature_importances_
    print(importances_stratK, file=file_hlife_xgb_stratK)

    xgb_class_hlife_stratK_feature_df = pd.DataFrame(
        {'Importance': importances_stratK, 'features': features})
    # Index of the (first) most important feature.
    max_coeff_index = int(np.argmax(importances_stratK))
    print(
        f'max feature: {importances_stratK.max()} at index {max_coeff_index} '
        f'[{features[max_coeff_index]}]', file=file_hlife_xgb_stratK)
    coeff_xgb_stratK[max_coeff_index] += 1
    print('\n', file=file_hlife_xgb_stratK)
    print(xgb_class_hlife_stratK_feature_df, file=file_hlife_xgb_stratK)
    print('\n'*2, file=file_hlife_xgb_stratK)

    print(
        f"roc_auc scores on test set: {xgb_roc_auc_stratK}", file=file_hlife_xgb_stratK)
    print(
        f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_stratK)}", file=file_hlife_xgb_stratK)
    print(
        f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_stratK)}", file=file_hlife_xgb_stratK)
    print(
        f"Best roc_auc score on test set: {max(xgb_roc_auc_stratK)} at index {xgb_roc_auc_stratK.index(max(xgb_roc_auc_stratK)) + 1}", file=file_hlife_xgb_stratK)
    print(
        f"Best model's roc_auc score from early stopping: {best_model_stratK.best_score}", file=file_hlife_xgb_stratK)
    print(
        f"Best model's iteration from early stopping: {best_model_stratK.best_iteration}", file=file_hlife_xgb_stratK)

    # evals_result() maps each eval set to {metric_name: [value per round]};
    # only one metric ('auc') is configured, so item 0 holds the AUC history.
    train_eval = list(best_model_stratK.evals_result()['validation_0'].items())
    train_auc = train_eval[0][1]
    print(
        f'Number of training auc scores: {len(train_auc)}', file=file_hlife_xgb_stratK)
    print(
        f'auc scores of training set: {train_auc}', file=file_hlife_xgb_stratK)
    print(f'Average and std-dev of auc scores of training set: {round(np.average(train_auc),2)}, '
          f'{round(np.std(train_auc),2)} \n', file=file_hlife_xgb_stratK)

    val_eval = list(best_model_stratK.evals_result()['validation_1'].items())
    val_auc = val_eval[0][1]
    print(
        f'Number of testing auc scores: {len(val_auc)}', file=file_hlife_xgb_stratK)
    print(
        f'auc scores of testing set: {val_auc}', file=file_hlife_xgb_stratK)
    print(f'Average and std-dev of auc scores of testing set: {round(np.average(val_auc),2)}, '
          f'{round(np.std(val_auc),2)} \n', file=file_hlife_xgb_stratK)

    print('\n', file=file_hlife_xgb_stratK)
    coeff_df_xgb = pd.DataFrame(
        {'Top features': coeff_xgb_stratK, 'features': features})
    print(f"Coefficients: {coeff_df_xgb} \n", file=file_hlife_xgb_stratK)

    print(f'Best estimator: {best_model_stratK} \n',
          file=file_hlife_xgb_stratK)
    print(f'Best params: {xgb_class_hlife_stratK.best_params_} \n',
          file=file_hlife_xgb_stratK)
    print(
        f"Best estimator's score from early stopping: {best_model_stratK.best_score} \n", file=file_hlife_xgb_stratK)

# plotting roc_auc score
fig = plt.figure()
fig_ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(12, 8, forward=True)
fig_ax.set_xlabel("Number of runs")
fig_ax.set_ylabel("Cross-validation ROC-AUC score")
fig_ax.set_ylim(0, 1.1)
plt.title('XGBoost (Repeated stratified KFold) hbond lifetime',
          fontsize=12, weight='bold')

fig_ax.plot(range(1, len(train_auc)+1), train_auc, '-o', linewidth=2, markersize=8.0,
            label=f"avg training roc_auc: {round(np.average(train_auc),2)}\n "
                  f"std training roc_auc : {round(np.std(train_auc),2)}")

fig_ax.plot(range(1, len(val_auc)+1), val_auc, '-o', linewidth=2, markersize=8.0,
            label=f"avg testing roc_auc: {round(np.average(val_auc),2)}\n "
                  f"std testing roc_auc : {round(np.std(val_auc),2)}")

plt.legend(loc='best')
dirmaker(f'./plots/roc-auc/hlife/{xdate}')
fig.savefig(f'plots/roc-auc/hlife/{xdate}/XGB_hlife_{n_repeat}stratKsearch-{xdate}_{rand_seed}.png',
            dpi=500, facecolor='white', bbox_inches='tight')

xgb.plot_importance(best_model_stratK).set_yticklabels(features)
plt.show()


### Nested and non-Nested CVs
Generates X_train and y_train that can then be split into train/test by the model.
Useful for CV.

In [None]:
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np

# Compare non-nested and nested cross-validation of an RBF-kernel SVC on the
# hbond-lifetime data over NUM_TRIALS differently seeded K-fold splits.

# Number of random trials
NUM_TRIALS = 30

# One metric for model selection AND evaluation. Without it GridSearchCV
# falls back to the estimator's default scorer (accuracy for SVC) while the
# nested scores are ROC-AUC, making the score difference meaningless.
SCORING = "roc_auc"

# Load the dataset
X_train, y_train = data_crossval(des_df_hlife, nondes_df_hlife, batch_size=38)

# Set up possible values of parameters to optimize over
p_grid = {"C": [1, 10, 100, 1000], "gamma": [0.0001, 0.001, 0.01, 0.1]}
# p_grid = {"C": [1, 10, 100, 1000]}

# p_grid = [
#   {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
#   {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
#  ]

# We will use a Support Vector Classifier with "rbf" kernel
svm = SVC(kernel="rbf")
# svm = SVC(kernel="linear")
# svm = SVC()

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Loop for each trial
for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=svm, param_grid=p_grid,
                       cv=outer_cv, scoring=SCORING)
    clf.fit(X_train, y_train)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization: the grid search runs on the
    # inner folds of every outer training split.
    clf = GridSearchCV(estimator=svm, param_grid=p_grid,
                       cv=inner_cv, scoring=SCORING)
    nested_score = cross_val_score(
        clf, X=X_train, y=y_train, cv=outer_cv, scoring=SCORING)
    nested_scores[i] = nested_score.mean()

score_difference = non_nested_scores - nested_scores

print(
    "Average difference of {:6f} with std. dev. of {:6f}.".format(
        score_difference.mean(), score_difference.std()
    )
)

# Plot scores on each trial for nested and non-nested CV
plt.figure(figsize=(10, 10))
plt.subplot(211)
(non_nested_scores_line,) = plt.plot(non_nested_scores, color="r")
(nested_line,) = plt.plot(nested_scores, color="b")
plt.ylabel("score", fontsize="14")
plt.legend(
    [non_nested_scores_line, nested_line],
    ["Non-Nested CV", "Nested CV"],
    bbox_to_anchor=(0, 0.4, 0.5, 0),
)
# The data plotted here is the hbond-lifetime set, not the Iris data of the
# scikit-learn example this cell was adapted from.
plt.title(
    "Non-Nested and Nested Cross Validation on hbond lifetime dataset",
    x=0.5,
    y=1.1,
    fontsize="15",
)

# Plot bar chart of the difference.
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend(
    [difference_plot],
    ["Non-Nested CV - Nested CV score"],
    bbox_to_anchor=(0, 1, 0.8, 0),
)
plt.ylabel("score difference", fontsize="14")

plt.show()


In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.metrics import roc_auc_score

# Nested cross-validation of an XGBoost grid search on the hbond-lifetime
# features, followed by a report/plot for the search refit on the training data.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})  # 'figure.figsize': [14.0, 10.0],


print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
xgb_roc_auc_repeatK_nestedCV = []
# One counter per entry of `features`: how often it was the top-ranked feature.
coeff_xgb_repeatK_nestedCV = [0, 0, 0, 0, 0]
rand_seed = 100
n_repeat = 10
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeat, random_state=rand_seed)
outer_cv = RepeatedKFold(n_splits=4, n_repeats=n_repeat, random_state=rand_seed)

xgb_class_hliferepeatK_nestedCV = XGBClassifier(
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='auc',
    objective='binary:logistic',
    n_jobs=-1)

p_repeatK_nestedCV = {'max_depth': [2, 4, 6, 8, 10],
                      'n_estimators': [10, 20, 30, 40, 50, 100],
                      'learning_rate': [0.01, 0.1],
                      'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
                      }


X_train_hlife, y_train_hlife, X_test_hlife, y_test_hlife = data_generator(
    des_df_hlife, nondes_df_hlife, test_sample_size=8, nondes_batch_size=38)

xgb_clf = GridSearchCV(xgb_class_hliferepeatK_nestedCV,
                       param_grid=p_repeatK_nestedCV, verbose=1, scoring='roc_auc', cv=inner_cv, refit=True)

dirmaker(f'./model-logs/hlife/{xdate}')
# The context manager closes the log even if a step below raises.
with open(
        f"./model-logs/hlife/{xdate}/XGB_hlife_{n_repeat}repeatK_nestedCV_{xdate}_{rand_seed}.txt",
        "w+") as file_hlife_xgb_repeatK_nestedCV:
    # Nested CV: cross_val_score returns only an array with one ROC-AUC value
    # per outer split; it works on clones and leaves `xgb_clf` itself unfitted.
    xgb_class_hlife_repeatK_nestedCV = cross_val_score(
        xgb_clf, X_train_hlife, y_train_hlife, scoring='roc_auc', cv=outer_cv, n_jobs=-1)
    # report performance
    print('Nested CV ROC-AUC: %.3f (%.3f)' % (np.mean(xgb_class_hlife_repeatK_nestedCV),
                                              np.std(xgb_class_hlife_repeatK_nestedCV)))
    print(f"Nested CV roc_auc scores: {xgb_class_hlife_repeatK_nestedCV}",
          file=file_hlife_xgb_repeatK_nestedCV)
    print(f"Average and std-dev of nested CV roc_auc scores: {np.mean(xgb_class_hlife_repeatK_nestedCV)}, "
          f"{np.std(xgb_class_hlife_repeatK_nestedCV)} \n", file=file_hlife_xgb_repeatK_nestedCV)

    # best_score_/best_params_/best_estimator_ live on a *fitted* search, not
    # on the score array, so fit the search explicitly. eval_set and early
    # stopping mirror the non-nested XGBoost cells so that evals_result(),
    # best_score and best_iteration are available for the report below.
    xgb_search_hlife_repeatK_nestedCV = xgb_clf.fit(
        X_train_hlife, y_train_hlife,
        eval_set=[(X_train_hlife, y_train_hlife),
                  (X_test_hlife, y_test_hlife)],
        verbose=False, early_stopping_rounds=50)
    best_model_nestedCV = xgb_search_hlife_repeatK_nestedCV.best_estimator_

    print('Best score', xgb_search_hlife_repeatK_nestedCV.best_score_,
          file=file_hlife_xgb_repeatK_nestedCV)
    print('Best params', xgb_search_hlife_repeatK_nestedCV.best_params_,
          file=file_hlife_xgb_repeatK_nestedCV)
    y_pred_hlife_xgb = best_model_nestedCV.predict(X_test_hlife)
    target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
    print(metrics.classification_report(y_test_hlife, y_pred_hlife_xgb,
          target_names=target_names), file=file_hlife_xgb_repeatK_nestedCV)
    # NOTE(review): this AUC is computed from hard class predictions, not from
    # predict_proba scores -- confirm that is the intended test metric.
    roc_auc_xgb = metrics.roc_auc_score(y_test_hlife, y_pred_hlife_xgb)
    print(f"roc_auc_score: {roc_auc_xgb}", file=file_hlife_xgb_repeatK_nestedCV)
    xgb_roc_auc_repeatK_nestedCV.append(roc_auc_xgb)

    importances_nestedCV = best_model_nestedCV.feature_importances_
    print(importances_nestedCV, file=file_hlife_xgb_repeatK_nestedCV)

    xgb_class_hlife_repeatK_nestedCV_feature_df = pd.DataFrame(
        {'Importance': importances_nestedCV, 'features': features})
    # Index of the (first) most important feature.
    max_coeff_index = int(np.argmax(importances_nestedCV))
    print(
        f'max feature: {importances_nestedCV.max()} at index {max_coeff_index} '
        f'[{features[max_coeff_index]}]', file=file_hlife_xgb_repeatK_nestedCV)
    coeff_xgb_repeatK_nestedCV[max_coeff_index] += 1
    print('\n', file=file_hlife_xgb_repeatK_nestedCV)
    print(xgb_class_hlife_repeatK_nestedCV_feature_df,
          file=file_hlife_xgb_repeatK_nestedCV)
    print('\n'*2, file=file_hlife_xgb_repeatK_nestedCV)

    print(
        f"roc_auc scores on test set: {xgb_roc_auc_repeatK_nestedCV}", file=file_hlife_xgb_repeatK_nestedCV)
    print(
        f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_repeatK_nestedCV)}", file=file_hlife_xgb_repeatK_nestedCV)
    print(
        f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_repeatK_nestedCV)}", file=file_hlife_xgb_repeatK_nestedCV)
    print(
        f"Best roc_auc score on test set: {max(xgb_roc_auc_repeatK_nestedCV)} at index {xgb_roc_auc_repeatK_nestedCV.index(max(xgb_roc_auc_repeatK_nestedCV)) + 1}", file=file_hlife_xgb_repeatK_nestedCV)
    print(
        f"Best model's roc_auc score from early stopping: {best_model_nestedCV.best_score}", file=file_hlife_xgb_repeatK_nestedCV)
    print(
        f"Best model's iteration from early stopping: {best_model_nestedCV.best_iteration}", file=file_hlife_xgb_repeatK_nestedCV)

    # evals_result() maps each eval set to {metric_name: [value per round]};
    # only one metric ('auc') is configured, so item 0 holds the AUC history.
    train_eval = list(best_model_nestedCV.evals_result()['validation_0'].items())
    train_auc = train_eval[0][1]
    print(
        f'Number of training auc scores: {len(train_auc)}', file=file_hlife_xgb_repeatK_nestedCV)
    print(
        f'auc scores of training set: {train_auc}', file=file_hlife_xgb_repeatK_nestedCV)
    print(f'Average and std-dev of auc scores of training set: {round(np.average(train_auc),2)}, '
          f'{round(np.std(train_auc),2)} \n', file=file_hlife_xgb_repeatK_nestedCV)

    val_eval = list(best_model_nestedCV.evals_result()['validation_1'].items())
    val_auc = val_eval[0][1]
    print(
        f'Number of testing auc scores: {len(val_auc)}', file=file_hlife_xgb_repeatK_nestedCV)
    print(
        f'auc scores of testing set: {val_auc}', file=file_hlife_xgb_repeatK_nestedCV)
    print(f'Average and std-dev of auc scores of testing set: {round(np.average(val_auc),2)}, '
          f'{round(np.std(val_auc),2)} \n', file=file_hlife_xgb_repeatK_nestedCV)

    print('\n', file=file_hlife_xgb_repeatK_nestedCV)
    coeff_df_xgb = pd.DataFrame(
        {'Top features': coeff_xgb_repeatK_nestedCV, 'features': features})
    print(f"Coefficients: {coeff_df_xgb} \n", file=file_hlife_xgb_repeatK_nestedCV)

    print(f'Best estimator: {best_model_nestedCV} \n',
          file=file_hlife_xgb_repeatK_nestedCV)
    print(f'Best params: {xgb_search_hlife_repeatK_nestedCV.best_params_} \n',
          file=file_hlife_xgb_repeatK_nestedCV)
    print(
        f"Best estimator's score from early stopping: {best_model_nestedCV.best_score} \n", file=file_hlife_xgb_repeatK_nestedCV)

# plotting roc_auc score
fig = plt.figure()
fig_ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(12, 8, forward=True)
fig_ax.set_xlabel("Number of runs")
fig_ax.set_ylabel("Cross-validation ROC-AUC score")
fig_ax.set_ylim(0, 1.1)
plt.title('XGBoost (Repeated KFold) hbond lifetime',
          fontsize=12, weight='bold')

fig_ax.plot(range(1, len(train_auc)+1), train_auc, '-o', linewidth=2, markersize=8.0,
            label=f"avg training roc_auc: {round(np.average(train_auc),2)}\n "
                  f"std training roc_auc : {round(np.std(train_auc),2)}")

fig_ax.plot(range(1, len(val_auc)+1), val_auc, '-o', linewidth=2, markersize=8.0,
            label=f"avg testing roc_auc: {round(np.average(val_auc),2)}\n "
                  f"std testing roc_auc : {round(np.std(val_auc),2)}")

plt.legend(loc='best')
dirmaker(f'./plots/roc-auc/hlife/{xdate}')
fig.savefig(f'plots/roc-auc/hlife/{xdate}/XGB_hlife_{n_repeat}repeatK_nestedCV-{xdate}_{rand_seed}.png',
            dpi=500, facecolor='white', bbox_inches='tight')

xgb.plot_importance(best_model_nestedCV).set_yticklabels(features)
plt.show()


In [None]:
# Display the best estimator held by the most recent GridSearchCV (`xgb_clf`).
# NOTE(review): `best_estimator_` exists only after `xgb_clf.fit(...)` has run
# with refit enabled -- confirm a fitted search is in scope before this cell.
xgb_clf.best_estimator_

## hnumber only

#### non-des hnum

In [None]:
# Build the non-DES hbond-number frame: remove the 'DES' column and label every
# row with class 0 in a new 'output' column.
# (original note: should change this to non-DES)
nondes_df_hnum = nondes_hnum.drop(columns=['DES']).assign(output=0)
# nondes_df_hnum

#### des hnum

In [None]:
# Build the DES hbond-number frame: remove the 'DES' column and label every
# row with class 1 in a new 'output' column.
# (original note: should change this to DES)
des_df_hnum = des_hnum.drop(columns=['DES']).assign(output=1)
# des_df_hnum

## Hyperparameter optimization
Using gridsearch cv to find the best model params

#### Generic GridSearch function

#### XGB

In [None]:

# Grid search over XGBClassifier hyper-parameters on the hbond-number (hnum)
# data; the search itself is delegated to kingmaker_xgb.
plot_style = {
    'font.weight': 'bold',
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 300.0,
    'axes.linewidth': 2.0,
}
plt.rcParams.update(plot_style)

# Candidate estimators and their search spaces, keyed by a short tag.
models = {
    "XGB": XGBClassifier(
        tree_method='hist',
        eval_metric='auc',
        objective='binary:logistic',
        n_jobs=multiprocessing.cpu_count(),
        early_stopping_rounds=50,
    ),
}

params = {
    'XGB': {
        'max_depth': [2, 4, 6, 8, 10],
        'n_estimators': [10, 20, 30, 40, 50, 100],
        'learning_rate': [0.01, 0.1],
        'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
    },
}

print("Parallel Parameter optimization")

# Run configuration
tag = "XGB"
folder_type = 'hnum'
rand_seed = 100
n_repeat = 10
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Everything before the first '(' of the estimator's string form.
model_name = str(model_generic).partition('(')[0]

# Log file for this search, grouped by data type and date.
log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
file_hnum_xgb_repeatK = open(
    f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+")

kingmaker_xgb(
    model_generic, p_repeatK, des_df_hnum, nondes_df_hnum,
    cv=cv,
    file_name=file_hnum_xgb_repeatK,
    folder_type=folder_type,
    n_repeat=n_repeat,
    rand_seed=rand_seed,
)


#### XGBRF

In [None]:
# XGBRF
# Grid search over XGBRFClassifier hyper-parameters on the hbond-number (hnum)
# data; the search itself is delegated to kingmaker_xgbrf.
plot_style = {
    'font.weight': 'bold',
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 300.0,
    'axes.linewidth': 2.0,
}
plt.rcParams.update(plot_style)

# Candidate estimators and their search spaces, keyed by a short tag.
models = {
    "XGBRF": XGBRFClassifier(
        tree_method='hist',
        eval_metric='auc',
        objective='binary:logistic',
        n_jobs=multiprocessing.cpu_count(),
    ),
}

params = {
    'XGBRF': {
        'max_depth': [2, 4, 6, 8, 10],
        'n_estimators': [10, 20, 30, 40, 50, 100],
        'learning_rate': [0.01, 0.1],
        'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
    },
}

print("Parallel Parameter optimization")

# Run configuration
tag = "XGBRF"
folder_type = 'hnum'
rand_seed = 100
n_repeat = 10
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Everything before the first '(' of the estimator's string form.
model_name = str(model_generic).partition('(')[0]
print(model_name)

# Log file for this search, grouped by data type and date.
log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
file_hnum_xgb_repeatK = open(
    f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+")

kingmaker_xgbrf(
    model_generic, p_repeatK, des_df_hnum, nondes_df_hnum,
    cv=cv,
    features=features,
    file_name=file_hnum_xgb_repeatK,
    folder_type=folder_type,
    n_repeat=n_repeat,
    rand_seed=rand_seed,
)


#### LR

In [None]:
# LR
# Grid search over LogisticRegression hyper-parameters on the hbond-number
# (hnum) data; the search itself is delegated to kingmaker_lr.
plot_style = {
    'font.weight': 'bold',
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
}
plt.rcParams.update(plot_style)

# Candidate estimators and their search spaces, keyed by a short tag.
models = {"LR": LogisticRegression()}

params = {
    "LR": {
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'tol': [1e-4, 1e-6],
        'max_iter': [100, 200, 300, 400, 500],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
}

# Run configuration
tag = "LR"
folder_type = 'hnum'
rand_seed = 100
n_repeat = 10
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# String form of the estimator with leading/trailing parentheses removed.
model_name = str(model_generic).strip('()')

# Log file for this search, grouped by data type and date.
log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
file_name = open(
    f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+")

kingmaker_lr(
    model_generic, p_repeatK, des_df_hnum, nondes_df_hnum,
    cv=cv,
    file_name=file_name,
    folder_type=folder_type,
    n_repeat=n_repeat,
    rand_seed=rand_seed,
)


#### The others

In [None]:

# Grid search for the remaining tree/ensemble classifiers on the hbond-number
# (hnum) data: one kingmaker_generic run and one log file per model.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

# Candidate estimators and their search spaces, keyed by a short tag.
models = {
    "RF": RandomForestClassifier(),
    "EF": ExtraTreesClassifier(),
    "GB": GradientBoostingClassifier(),
    "AB": AdaBoostClassifier(),
    "DT": DecisionTreeClassifier()
}

params = {
    "RF": {'max_depth': [2, 4, 6, 8, 10],
           'n_estimators': [10, 20, 30, 40, 50, 100],
           'min_samples_leaf': [1, 2, 3],
           'min_samples_split': [2, 4, 6],
           'max_features': ["sqrt", "log2"],
           },
    "EF": {'max_depth': [2, 4, 6, 8, 10],
           'n_estimators': [10, 20, 30, 40, 50, 100],
           'min_samples_leaf': [1, 2, 3],
           'min_samples_split': [2, 4, 6],
           'max_features': ["sqrt", "log2"],
           },
    "GB": {'max_depth': [2, 4, 6, 8, 10],
           'n_estimators': [10, 20, 30, 40, 50, 100],
           'min_samples_leaf': [1, 2, 3],
           'min_samples_split': [2, 4, 6],
           'max_features': ["sqrt", "log2"],
           'learning_rate': [0.01, 0.1],
           },
    "AB": {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
           'learning_rate': [0.001, 0.01, 0.1, 1.0],
           },
    "DT": {'max_depth': [2, 4, 6, 8, 10],
           'min_samples_leaf': [1, 2, 3],
           'min_samples_split': [2, 4, 6],
           'max_features': ["sqrt", "log2"],
           }
}


xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
folder_type = 'hnum'

for tag, model_generic in models.items():
    # A fresh splitter per model, as in the original loop.
    cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)

    print(f"{tag} Parallel Parameter optimization")

    p_repeatK = params[tag]
    # Take the text before the first '(' of the estimator's string form, as the
    # single-model cells do. strip('()') only trims the outer parentheses and
    # leaves the parameter list in the file name whenever the repr is not the
    # bare "Name()"; this also makes the old "XGBClassifier" special case
    # unnecessary.
    model_name = str(model_generic).split('(')[0]

    dirmaker(f'./model-logs/gridsearch/{folder_type}/{xdate}')
    # One log per model. The context manager closes each file before the next
    # iteration opens another one, even if kingmaker_generic raises; closing
    # again is harmless should the helper already have closed it.
    with open(
            f"./model-logs/gridsearch/{folder_type}/{xdate}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt",
            "w+") as file_name:
        kingmaker_generic(model_generic, p_repeatK, des_df_hnum, nondes_df_hnum, cv=cv,
                          file_name=file_name, folder_type=folder_type, n_repeat=n_repeat, rand_seed=rand_seed)


#### KNN

In [None]:
# KNN
# Grid search over KNeighborsClassifier hyper-parameters on the hbond-number
# (hnum) data; the search itself is delegated to kingmaker_knn.
plot_style = {
    'font.weight': 'bold',
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
}
plt.rcParams.update(plot_style)

# Candidate estimators and their search spaces, keyed by a short tag.
models = {"KNN": KNeighborsClassifier(n_jobs=-1)}

params = {
    "KNN": {
        'n_neighbors': [5, 7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'euclidean', 'manhattan'],
        'leaf_size': [30, 40, 50, 60],
    },
}

# Run configuration
tag = "KNN"
folder_type = 'hnum'
rand_seed = 100
n_repeat = 10
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Everything before the first '(' of the estimator's string form.
model_name = str(model_generic).partition('(')[0]

# Log file for this search, grouped by data type and date.
log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
file_name = open(
    f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+")

kingmaker_knn(
    model_generic, p_repeatK, des_df_hnum, nondes_df_hnum,
    cv=cv,
    file_name=file_name,
    folder_type=folder_type,
    n_repeat=n_repeat,
    rand_seed=rand_seed,
)


#### SVC

In [None]:
# SVC
# Grid search over SVC hyper-parameters on the hbond-number (hnum) data; the
# search itself is delegated to kingmaker_svc.
plot_style = {
    'font.weight': 'bold',
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
}
plt.rcParams.update(plot_style)

# Candidate estimators keyed by a short tag; only the entry selected by `tag`
# below is used in this cell.
models = {
    "SVC": SVC(probability=True),
    "SVC1": SVC(probability=True),
}

# Search-space options that were tried and left disabled:
# 'gamma': ['scale', 'auto'],
# 'gamma': [0.001, 0.0001],
# "C": [1, 10, 100, 1000]
params = {
    "SVC": {
        'degree': [3, 5, 7],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'tol': [1e-3, 1e-4, 1e-5, 1e-6],
        "C": [1, 10],
    },
}

# Run configuration
tag = "SVC"
folder_type = 'hnum'
rand_seed = 100
n_repeat = 10
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Everything before the first '(' of the estimator's string form.
model_name = str(model_generic).partition('(')[0]

# Log file for this search, grouped by data type and date.
log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
file_name = open(
    f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+")

kingmaker_svc(
    model_generic, p_repeatK, des_df_hnum, nondes_df_hnum,
    cv=cv,
    file_name=file_name,
    folder_type=folder_type,
    n_repeat=n_repeat,
    rand_seed=rand_seed,
)


In [None]:
# Scratch check: display the string form of a default SVC with leading and
# trailing parentheses stripped -- the same expression some grid-search cells
# use to derive `model_name` for their log-file names.
SVC().__str__().strip('()')

## Training loops

#### XGB

In [None]:

# XGB
# Train the selected XGBClassifier configuration on the hbond-number (hnum)
# data for n_repeat runs; the training loop itself lives in train_xgb.

plot_style = {
    'font.weight': 'bold',
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
    'axes.linewidth': 2.0,
}
plt.rcParams.update(plot_style)

# Constructor arguments of the two stored configurations.
xgb_old_kwargs = dict(
    base_score=0.5, booster='gbtree', colsample_bylevel=1,
    colsample_bynode=1, colsample_bytree=0.1,
    enable_categorical=False, eval_metric='auc', gamma=0, gpu_id=-1,
    importance_type=None, interaction_constraints='',
    learning_rate=0.1, max_delta_step=0, max_depth=8,
    min_child_weight=1, monotone_constraints='()',
    n_estimators=100, n_jobs=12, num_parallel_tree=1,
    predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
    scale_pos_weight=1, subsample=1, tree_method='hist',
    validate_parameters=1, verbosity=None,
)
xgb_kwargs = dict(
    base_score=None, booster=None, callbacks=None,
    colsample_bylevel=None, colsample_bynode=None,
    colsample_bytree=0.4,
    enable_categorical=False, eval_metric='auc', feature_types=None,
    gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
    interaction_constraints=None, learning_rate=0.1, max_bin=None,
    max_cat_threshold=None, max_cat_to_onehot=None,
    max_delta_step=None, max_depth=2, max_leaves=None,
    min_child_weight=None, monotone_constraints=None,
    n_estimators=50, n_jobs=12, num_parallel_tree=None,
    predictor=None, random_state=None,
)

models = {
    "XGB-old": XGBClassifier(**xgb_old_kwargs),
    "XGB": XGBClassifier(**xgb_kwargs),
}

# Run configuration
tag = "XGB"
folder_type = 'hnum'
rand_seed = 100
n_repeat = 100
xdate = datetime.datetime.now().strftime("%m-%d-%Y")

model_type = models[tag]
# Everything before the first '(' of the estimator's string form.
model_name = str(model_type).partition('(')[0]

print(f"{tag} training for {n_repeat} runs")

if "XGBClassifier" in model_name:
    model_name = "XGBClassifier"

# Log file for this training run, grouped by data type and date.
log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
file_name = open(
    f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+")

train_xgb(
    model_type, des_df_hnum, nondes_df_hnum,
    file_name=file_name,
    folder_type=folder_type,
    num=n_repeat,
    rand_seed=rand_seed,
    features=features,
)


#### XGBRF

In [None]:

# XGBRF
# Train the selected XGBRFClassifier configuration on the hbond-number (hnum)
# data for n_repeat runs; the training loop itself lives in train_xgb.

plot_style = {
    'font.weight': 'bold',
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
    'axes.linewidth': 2.0,
}
plt.rcParams.update(plot_style)

# Constructor arguments shared by both stored configurations; they differ
# only in colsample_bytree and max_depth.
xgbrf_shared_kwargs = dict(
    base_score=None, booster=None, callbacks=None,
    colsample_bylevel=None,
    early_stopping_rounds=None, enable_categorical=False,
    eval_metric='auc', feature_types=None, gamma=None, gpu_id=None,
    grow_policy=None, importance_type=None,
    interaction_constraints=None, learning_rate=0.01, max_bin=None,
    max_cat_threshold=None, max_cat_to_onehot=None,
    max_delta_step=None, max_leaves=None,
    min_child_weight=None, monotone_constraints=None,
    n_estimators=100, n_jobs=12, num_parallel_tree=None,
    objective='binary:logistic', predictor=None, random_state=None,
)

models = {
    "XGBRF-old": XGBRFClassifier(
        colsample_bytree=0.1, max_depth=6, **xgbrf_shared_kwargs),
    "XGBRF": XGBRFClassifier(
        colsample_bytree=0.4, max_depth=8, **xgbrf_shared_kwargs),
}

# Run configuration
tag = "XGBRF"
folder_type = 'hnum'
rand_seed = 100
n_repeat = 100
xdate = datetime.datetime.now().strftime("%m-%d-%Y")

model_type = models[tag]
# Everything before the first '(' of the estimator's string form.
model_name = str(model_type).partition('(')[0]

print(f"{tag} training for {n_repeat} runs")

if "XGBClassifier" in model_name:
    model_name = "XGBClassifier"

# Log file for this training run, grouped by data type and date.
log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
file_name = open(
    f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt", "w+")

train_xgb(
    model_type, des_df_hnum, nondes_df_hnum,
    file_name=file_name,
    folder_type=folder_type,
    num=n_repeat,
    rand_seed=rand_seed,
    features=features,
)


#### XGB and XGBRF eval

In [None]:
# XGBClassifier

# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
    'axes.linewidth': 2.0,
})

# Candidate boosted-tree models, keyed by the short tag selected below.
models = {
    "XGB": XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.4,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, monotone_constraints=None,
              n_estimators=50, n_jobs=12, num_parallel_tree=None,
              predictor=None, random_state=None),
    "XGBRF": XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=0.4,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric='auc', feature_types=None, gamma=None, gpu_id=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.01, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=8, max_leaves=None,
                min_child_weight=None, monotone_constraints=None,
                n_estimators=100, n_jobs=12, num_parallel_tree=None,
                objective='binary:logistic', predictor=None, random_state=None),
}

# Run configuration.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hnum'

tag = "XGB"  # set to "XGBRF" to evaluate the other entry of `models`
model_type = models[tag]
# The estimator's printed form starts with its class name; keep only that part.
model_name = str(model_type).split('(')[0]

print(f"{tag} training for {n_repeat} runs")

if "XGBClassifier" in model_name:
    model_name = "XGBClassifier"

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_eval_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# Use this for getting validation metrics plotted alongside training metrics.
# The context manager guarantees the log is flushed and closed even when the
# run raises; closing again is harmless if train_xgb_eval closes it itself.
with open(log_path, "w+") as file_name:
    train_xgb_eval(model_type, des_df_hnum, nondes_df_hnum,
                   file_name=file_name, folder_type=folder_type, num=n_repeat,
                   rand_seed=rand_seed, features=features)


#### Logistic Regression

In [None]:
# Logistic Regression

# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
})

# Logistic-regression variants, keyed by the short tag selected below.
models = {
    "LR-old": LogisticRegression(max_iter=100, penalty='l2', solver='newton-cg', tol=0.0001),
    "LR": LogisticRegression(solver='newton-cg'),
}

# Run configuration.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hnum'

tag = "LR"
model_type = models[tag]
# The estimator's printed form starts with its class name; keep only that part.
model_name = str(model_type).split('(')[0]
print(f"{tag} training for {n_repeat} runs")

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed even when
# training raises; closing again is harmless if train_lr closes it itself.
with open(log_path, "w+") as file_name:
    train_lr(model_type, des_df_hnum, nondes_df_hnum,
             file_name=file_name, folder_type=folder_type, num=n_repeat,
             rand_seed=rand_seed, features=features, model_name=model_name)

#### AB, DT, EF, GB, RF

In [None]:
# AB, DT, EF, GB, RF

# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
})

# Tree-based classifiers with fixed hyper-parameters, keyed by a short tag.
models = {
    "RF": RandomForestClassifier(max_depth=2, max_features='log2', min_samples_leaf=3,
                       min_samples_split=4, n_estimators=40),
    "EF": ExtraTreesClassifier(max_depth=4, max_features='log2', min_samples_leaf=2,
                     n_estimators=40),
    "GB": GradientBoostingClassifier(learning_rate=0.01, max_depth=2, max_features='sqrt',
                           min_samples_leaf=3, n_estimators=10),
    "AB": AdaBoostClassifier(learning_rate=0.01, n_estimators=100),
    "DT": DecisionTreeClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=3,
                       min_samples_split=4),
}

# Run configuration shared by every model in the loop.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hnum'

# The output directory does not depend on the model, so create it once.
log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)

for tag, model_type in models.items():
    # The estimator's printed form starts with its class name; keep only that part.
    model_name = str(model_type).split('(')[0]

    print(f"{tag} training for {n_repeat} runs")

    if "XGBClassifier" in model_name:
        model_name = "XGBClassifier"

    log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

    # One log per model; the context manager closes it before the next
    # iteration opens another, so handles no longer pile up across the loop.
    with open(log_path, "w+") as file_name:
        train_ab_dt_ef_gb_rf(model_type, des_df_hnum, nondes_df_hnum,
                             file_name=file_name, folder_type=folder_type,
                             num=n_repeat, rand_seed=rand_seed, features=features)

#### KNN

In [None]:
# KNN

# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
})

# k-nearest-neighbour variants, keyed by the short tag selected below.
models = {
    "KNN-old": KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=15,
                     weights='distance'),
    "KNN": KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=7,
                     weights='distance'),
}

# Run configuration.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hnum'

tag = "KNN"
model_type = models[tag]
# The estimator's printed form starts with its class name; keep only that part.
model_name = str(model_type).split('(')[0]
print(model_name)
print(f"{tag} training for {n_repeat} runs")

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed even when
# training raises; closing again is harmless if train_knn closes it itself.
with open(log_path, "w+") as file_name:
    train_knn(model_type, des_df_hnum, nondes_df_hnum,
              file_name=file_name, folder_type=folder_type, num=n_repeat,
              rand_seed=rand_seed, features=features, model_name=model_name)

#### SVC

In [None]:
# SVC

# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
})

# Support-vector classifier with probability estimates enabled.
models = {
    "SVC": SVC(C=1, probability=True),
}

# Run configuration.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'hnum'

tag = "SVC"
model_type = models[tag]
# The estimator's printed form starts with its class name; keep only that part.
model_name = str(model_type).split('(')[0]
print(f'{model_name}')
print(f"{tag} training for {n_repeat} runs")

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed even when
# training raises; closing again is harmless if train_svc closes it itself.
with open(log_path, "w+") as file_name:
    train_svc(model_type, des_df_hnum, nondes_df_hnum,
              file_name=file_name, folder_type=folder_type, num=n_repeat,
              rand_seed=rand_seed, features=features, model_name=model_name)

### XGBoost with Repeated KFold

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.metrics import roc_auc_score

# Figure styling applied to the plots produced later in this cell.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})  # 'figure.figsize': [14.0, 10.0],


print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
xgb_roc_auc_repeatK = []
# One counter per feature; incremented for the feature with the top importance.
coeff_xgb_repeatK = [0] * len(features)
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=100)

dirmaker(f'./model-logs/hnum/{xdate}')
# Closed further down in this cell, once the report has been written.
file_hnum_xgb_repeatK = open(
    f"./model-logs/hnum/{xdate}/XGB_hnum_repeatKFold_{xdate}_100.txt", "w+")

# Early stopping is configured on the estimator (as in the GridSearch cells
# further down) because passing it to fit() is deprecated/removed in recent
# xgboost releases; `use_label_encoder` is dropped for the same reason.
xgb_class_hnumrepeatK = XGBClassifier(
    tree_method='hist',
    eval_metric='auc',
    objective='binary:logistic',
    early_stopping_rounds=50,
    # Half of the cores, but never fewer than one.
    n_jobs=max(1, multiprocessing.cpu_count() // 2))

# Hyper-parameter grid explored by the search.
p_repeatK = {'max_depth': [2, 4, 6, 8, 10],
             'n_estimators': [10, 20, 30, 40, 50, 100],
             'learning_rate': [0.01, 0.1],
             'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
             }


X_train_hnum, y_train_hnum, X_test_hnum, y_test_hnum = data_generator(
    des_df_hnum, nondes_df_hnum, test_sample_size=8, nondes_batch_size=38)

xgb_clf = GridSearchCV(xgb_class_hnumrepeatK,
                       param_grid=p_repeatK, verbose=1, scoring='roc_auc', cv=cv)

# eval_set order matters: 'validation_0' is the training data and
# 'validation_1' the held-out test data in evals_result() below.
# NOTE(review): the test split also drives early stopping here, so the test
# metrics reported later are not fully independent of model selection.
xgb_class_hnum_repeatK = xgb_clf.fit(
    X_train_hnum, y_train_hnum,
    eval_set=[(X_train_hnum, y_train_hnum), (X_test_hnum, y_test_hnum)],
    verbose=False)

# Write the grid-search summary and the test-set metrics to the log file
# opened earlier in this cell.
best_xgb_repeatK = xgb_class_hnum_repeatK.best_estimator_
importances_repeatK = best_xgb_repeatK.feature_importances_

print('Best score', xgb_class_hnum_repeatK.best_score_, file=file_hnum_xgb_repeatK)
print('Best params', xgb_class_hnum_repeatK.best_params_,
      file=file_hnum_xgb_repeatK)
y_pred_hnum_xgb = best_xgb_repeatK.predict(X_test_hnum)
target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
print(metrics.classification_report(y_test_hnum, y_pred_hnum_xgb,
      target_names=target_names), file=file_hnum_xgb_repeatK)

# ROC-AUC is a ranking metric: score it on the predicted probability of the
# positive class (DES = 1). Feeding hard 0/1 predictions collapses the curve
# to a single operating point and is inconsistent with the 'roc_auc' scorer
# used by the grid search.
y_score_hnum_xgb = best_xgb_repeatK.predict_proba(X_test_hnum)[:, 1]
roc_auc_xgb = metrics.roc_auc_score(y_test_hnum, y_score_hnum_xgb)
print(f"roc_auc_score: {roc_auc_xgb}", file=file_hnum_xgb_repeatK)
xgb_roc_auc_repeatK.append(roc_auc_xgb)
print(importances_repeatK, file=file_hnum_xgb_repeatK)
# print(f"intercept: {xgb_class_hnum_repeatK.intercept_}", file=file_hnum_xgb_repeatK)

# Per-feature importances and a tally of which feature ranked first.
xgb_class_hnum_repeatK_feature_df = pd.DataFrame(
    {'Importance': importances_repeatK, 'features': features})
max_coeff_index = int(np.argmax(importances_repeatK))
print(
    f'max feature: {importances_repeatK.max()} at index {max_coeff_index} [{features[max_coeff_index]}]', file=file_hnum_xgb_repeatK)
coeff_xgb_repeatK[max_coeff_index] += 1
print('\n', file=file_hnum_xgb_repeatK)
print(xgb_class_hnum_repeatK_feature_df, file=file_hnum_xgb_repeatK)
print('\n'*2, file=file_hnum_xgb_repeatK)

# Aggregate statistics over the collected test-set scores.
print(
    f"roc_auc scores on test set: {xgb_roc_auc_repeatK}", file=file_hnum_xgb_repeatK)
print(
    f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_repeatK)}", file=file_hnum_xgb_repeatK)
print(
    f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_repeatK)}", file=file_hnum_xgb_repeatK)
print(
    f"Best roc_auc score on test set: {max(xgb_roc_auc_repeatK)} at index {xgb_roc_auc_repeatK.index(max(xgb_roc_auc_repeatK)) + 1}", file=file_hnum_xgb_repeatK)
print(
    f"Best model's roc_auc score from early stopping: {best_xgb_repeatK.best_score}", file=file_hnum_xgb_repeatK)
print(
    f"Best model's iteration from early stopping: {best_xgb_repeatK.best_iteration}", file=file_hnum_xgb_repeatK)

# Per-iteration metric history recorded for each entry of eval_set.
# train_eval / val_eval keep the [(metric_name, scores), ...] layout because
# the plotting code below indexes them as [0][1].
eval_history_repeatK = best_xgb_repeatK.evals_result()
train_eval = list(eval_history_repeatK['validation_0'].items())
print(
    f'Number of training auc scores: {len(train_eval[0][1])}', file=file_hnum_xgb_repeatK)
print(
    f'auc scores of training set: {train_eval[0][1]}', file=file_hnum_xgb_repeatK)
print(f'Average and std-dev of auc scores of training set: {round(np.average(train_eval[0][1]),2)}, '
      f'{round(np.std(train_eval[0][1]),2)} \n', file=file_hnum_xgb_repeatK)

val_eval = list(eval_history_repeatK['validation_1'].items())
print(
    f'Number of testing auc scores: {len(val_eval[0][1])}', file=file_hnum_xgb_repeatK)
print(
    f'auc scores of testing set: {val_eval[0][1]}', file=file_hnum_xgb_repeatK)
print(f'Average and std-dev of auc scores of testing set: {round(np.average(val_eval[0][1]),2)}, '
      f'{round(np.std(val_eval[0][1]),2)} \n', file=file_hnum_xgb_repeatK)

print('\n', file=file_hnum_xgb_repeatK)
coeff_df_xgb = pd.DataFrame(
    {'Top features': coeff_xgb_repeatK, 'features': features})
print(f"Coefficients: {coeff_df_xgb} \n", file=file_hnum_xgb_repeatK)

print(f'Best estimator: {best_xgb_repeatK} \n',
      file=file_hnum_xgb_repeatK)
print(f'Best params: {xgb_class_hnum_repeatK.best_params_} \n',
      file=file_hnum_xgb_repeatK)
print(
    f"Best estimator's score from early stopping: {best_xgb_repeatK.best_score} \n", file=file_hnum_xgb_repeatK)
# plotting roc_auc score
# Plot the per-iteration AUC recorded on the training and test eval sets.
fig = plt.figure()
fig_ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(12, 8, forward=True)
fig_ax.set_xlabel("Number of runs")
fig_ax.set_ylabel("ROC-AUC score")
fig_ax.set_ylim(0, 1.1)
plt.title('XGBoost (Repeated KFold) hbond number', fontsize=12, weight='bold')

train_auc_curve = train_eval[0][1]
val_auc_curve = val_eval[0][1]

fig_ax.plot(range(1, len(train_auc_curve) + 1), train_auc_curve, '-o',
            linewidth=2, markersize=8.0,
            label=f"avg training roc_auc: {round(np.average(train_auc_curve),2)}\n "
                  f"std training roc_auc : {round(np.std(train_auc_curve),2)}")

fig_ax.plot(range(1, len(val_auc_curve) + 1), val_auc_curve, '-o',
            linewidth=2, markersize=8.0,
            label=f"avg testing roc_auc: {round(np.average(val_auc_curve),2)}\n "
                  f"std testing roc_auc : {round(np.std(val_auc_curve),2)}")

plt.legend(loc='best')
file_hnum_xgb_repeatK.close()
dirmaker(f'./plots/roc-auc/hnum/{xdate}')
fig.savefig(f'plots/roc-auc/hnum/{xdate}/XGB_hnum_repeatK-{xdate}_100.png',
            dpi=500, facecolor='white', bbox_inches='tight')

# plot_importance sorts the bars by score and leaves out features that were
# never used in a split, so overwriting the tick labels with `features` in
# its original order attaches names to the wrong bars (or fails when the
# counts differ). Rename the score keys instead: xgboost's default names
# "f<i>" are mapped to features[i]; any other key is already a real name.
raw_importance_repeatK = xgb_class_hnum_repeatK.best_estimator_.get_booster().get_score(
    importance_type='weight')
named_importance_repeatK = {
    (features[int(key[1:])] if key[:1] == 'f' and key[1:].isdigit() else key): score
    for key, score in raw_importance_repeatK.items()
}
xgb.plot_importance(named_importance_repeatK)
plt.show()


In [None]:
# X_train_hnum, y_train_hnum, X_test_hnum, y_test_hnum
# len(y_test_hnum)
# Display the (metric name, per-iteration scores) pairs recorded for the first
# eval set of the best RepeatedKFold estimator.
[*xgb_class_hnum_repeatK.best_estimator_.evals_result()['validation_0'].items()]


### XGBoost with Repeated stratified KFold

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.metrics import roc_auc_score

# Figure styling applied to the plots produced later in this cell.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})  # 'figure.figsize': [14.0, 10.0],


print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
xgb_roc_auc_stratK = []
# One counter per feature; incremented for the feature with the top importance.
coeff_xgb_stratK = [0] * len(features)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1000)

dirmaker(f'./model-logs/hnum/{xdate}')
# Closed further down in this cell, once the report has been written.
file_hnum_xgb_stratK = open(
    f"./model-logs/hnum/{xdate}/XGB_hnum_stratKFold_{xdate}.txt", "w+")

# Early stopping is configured on the estimator (as in the GridSearch cells
# further down) because passing it to fit() is deprecated/removed in recent
# xgboost releases; `use_label_encoder` is dropped for the same reason.
xgb_class_hnumstratK = XGBClassifier(
    tree_method='hist',
    eval_metric='auc',
    objective='binary:logistic',
    early_stopping_rounds=50,
    # Half of the cores, but never fewer than one.
    n_jobs=max(1, multiprocessing.cpu_count() // 2))

# Hyper-parameter grid explored by the search.
p_stratK = {'max_depth': [2, 4, 6, 8, 10],
            'n_estimators': [10, 20, 30, 40, 50, 100],
            'learning_rate': [0.01, 0.1],
            'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
            }


X_train_hnum, y_train_hnum, X_test_hnum, y_test_hnum = data_generator(
    des_df_hnum, nondes_df_hnum, test_sample_size=8, nondes_batch_size=38)

xgb_clf = GridSearchCV(
    xgb_class_hnumstratK, param_grid=p_stratK, verbose=1, scoring='roc_auc', cv=cv)

# eval_set order matters: 'validation_0' is the training data and
# 'validation_1' the held-out test data in evals_result() below.
# NOTE(review): the test split also drives early stopping here, so the test
# metrics reported later are not fully independent of model selection.
xgb_class_hnum_stratK = xgb_clf.fit(
    X_train_hnum, y_train_hnum,
    eval_set=[(X_train_hnum, y_train_hnum), (X_test_hnum, y_test_hnum)],
    verbose=False)

# Write the grid-search summary and the test-set metrics to the log file
# opened earlier in this cell.
best_xgb_stratK = xgb_class_hnum_stratK.best_estimator_
importances_stratK = best_xgb_stratK.feature_importances_

print('Best score', xgb_class_hnum_stratK.best_score_, file=file_hnum_xgb_stratK)
print('Best params', xgb_class_hnum_stratK.best_params_, file=file_hnum_xgb_stratK)
y_pred_hnum_xgb = best_xgb_stratK.predict(X_test_hnum)
target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1
print(metrics.classification_report(y_test_hnum, y_pred_hnum_xgb,
      target_names=target_names), file=file_hnum_xgb_stratK)

# ROC-AUC is a ranking metric: score it on the predicted probability of the
# positive class (DES = 1). Feeding hard 0/1 predictions collapses the curve
# to a single operating point and is inconsistent with the 'roc_auc' scorer
# used by the grid search.
y_score_hnum_xgb = best_xgb_stratK.predict_proba(X_test_hnum)[:, 1]
roc_auc_xgb = metrics.roc_auc_score(y_test_hnum, y_score_hnum_xgb)
print(f"roc_auc_score: {roc_auc_xgb}", file=file_hnum_xgb_stratK)
xgb_roc_auc_stratK.append(roc_auc_xgb)
print(importances_stratK, file=file_hnum_xgb_stratK)
# print(f"intercept: {xgb_class_hnum_stratK.intercept_}", file=file_hnum_xgb_stratK)

# Per-feature importances and a tally of which feature ranked first.
xgb_class_hnum_stratK_feature_df = pd.DataFrame(
    {'Importance': importances_stratK, 'features': features})
max_coeff_index = int(np.argmax(importances_stratK))
print(
    f'max feature: {importances_stratK.max()} at index {max_coeff_index} [{features[max_coeff_index]}]', file=file_hnum_xgb_stratK)
coeff_xgb_stratK[max_coeff_index] += 1
print('\n', file=file_hnum_xgb_stratK)
print(xgb_class_hnum_stratK_feature_df, file=file_hnum_xgb_stratK)
print('\n'*2, file=file_hnum_xgb_stratK)

# Aggregate statistics over the collected test-set scores.
print(
    f"roc_auc scores on test set: {xgb_roc_auc_stratK}", file=file_hnum_xgb_stratK)
print(
    f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_stratK)}", file=file_hnum_xgb_stratK)
print(
    f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_stratK)}", file=file_hnum_xgb_stratK)
print(
    f"Best roc_auc score on test set: {max(xgb_roc_auc_stratK)} at index {xgb_roc_auc_stratK.index(max(xgb_roc_auc_stratK)) + 1}", file=file_hnum_xgb_stratK)
print(
    f"Best model's roc_auc score from early stopping: {best_xgb_stratK.best_score}", file=file_hnum_xgb_stratK)
print(
    f"Best model's iteration from early stopping: {best_xgb_stratK.best_iteration}", file=file_hnum_xgb_stratK)

# Per-iteration metric history recorded for each entry of eval_set.
# train_eval / val_eval keep the [(metric_name, scores), ...] layout because
# the plotting code below indexes them as [0][1].
eval_history_stratK = best_xgb_stratK.evals_result()
train_eval = list(eval_history_stratK['validation_0'].items())
print(
    f'Number of training auc scores: {len(train_eval[0][1])}', file=file_hnum_xgb_stratK)
print(
    f'auc scores of training set: {train_eval[0][1]}', file=file_hnum_xgb_stratK)
print(f'Average and std-dev of auc scores of training set: {round(np.average(train_eval[0][1]),2)}, '
      f'{round(np.std(train_eval[0][1]),2)} \n', file=file_hnum_xgb_stratK)

val_eval = list(eval_history_stratK['validation_1'].items())
print(
    f'Number of testing auc scores: {len(val_eval[0][1])}', file=file_hnum_xgb_stratK)
print(
    f'auc scores of testing set: {val_eval[0][1]}', file=file_hnum_xgb_stratK)
print(f'Average and std-dev of auc scores of testing set: {round(np.average(val_eval[0][1]),2)}, '
      f'{round(np.std(val_eval[0][1]),2)} \n', file=file_hnum_xgb_stratK)

print('\n', file=file_hnum_xgb_stratK)
coeff_df_xgb = pd.DataFrame(
    {'Top features': coeff_xgb_stratK, 'features': features})
print(f"Coefficients: {coeff_df_xgb} \n", file=file_hnum_xgb_stratK)

print(f'Best estimator: {best_xgb_stratK} \n',
      file=file_hnum_xgb_stratK)
print(f'Best params: {xgb_class_hnum_stratK.best_params_} \n',
      file=file_hnum_xgb_stratK)
print(
    f"Best estimator's score from early stopping: {best_xgb_stratK.best_score} \n", file=file_hnum_xgb_stratK)
# plotting roc_auc score
# Plot the per-iteration AUC recorded on the training and test eval sets.
fig = plt.figure()
fig_ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(12, 8, forward=True)
fig_ax.set_xlabel("Number of runs")
fig_ax.set_ylabel("ROC-AUC score")
fig_ax.set_ylim(0, 1.1)
plt.title('XGBoost (Repeated stratified KFold) hbond number',
          fontsize=12, weight='bold')

train_auc_curve = train_eval[0][1]
val_auc_curve = val_eval[0][1]

fig_ax.plot(range(1, len(train_auc_curve) + 1), train_auc_curve, '-o',
            linewidth=2, markersize=8.0,
            label=f"avg training roc_auc: {round(np.average(train_auc_curve),2)}\n "
                  f"std training roc_auc : {round(np.std(train_auc_curve),2)}")

fig_ax.plot(range(1, len(val_auc_curve) + 1), val_auc_curve, '-o',
            linewidth=2, markersize=8.0,
            label=f"avg testing roc_auc: {round(np.average(val_auc_curve),2)}\n "
                  f"std testing roc_auc : {round(np.std(val_auc_curve),2)}")

plt.legend(loc='best')
file_hnum_xgb_stratK.close()
dirmaker(f'./plots/roc-auc/hnum/{xdate}')
fig.savefig(f'plots/roc-auc/hnum/{xdate}/XGB_hnum_stratKsearch-{xdate}.png',
            dpi=500, facecolor='white', bbox_inches='tight')

# plot_importance sorts the bars by score and leaves out features that were
# never used in a split, so overwriting the tick labels with `features` in
# its original order attaches names to the wrong bars (or fails when the
# counts differ). Rename the score keys instead: xgboost's default names
# "f<i>" are mapped to features[i]; any other key is already a real name.
raw_importance_stratK = xgb_class_hnum_stratK.best_estimator_.get_booster().get_score(
    importance_type='weight')
named_importance_stratK = {
    (features[int(key[1:])] if key[:1] == 'f' and key[1:].isdigit() else key): score
    for key, score in raw_importance_stratK.items()
}
xgb.plot_importance(named_importance_stratK)
plt.show()


In [None]:
# Display the full evaluation history recorded during fitting of the best
# stratified-KFold estimator (one entry per eval_set passed to fit).
xgb_class_hnum_stratK.best_estimator_.evals_result()


## Hbond number + lifetime
Models are trained on merged hbond number and lifetime data.

#### des

In [None]:
# Tag every DES lifetime column with an "_l" suffix so it cannot collide with
# the hbond-number columns when the two tables are concatenated.
des_df_hlife_edited = des_df_hlife.rename(
    columns={
        'A-A': 'A-A_l',
        'B-B': 'B-B_l',
        'A-B': 'A-B_l',
        'A-A/B-B': 'A-A_l/B-B_l',
        'A-B/(A-A + B-B)': 'A-B_l/(A-A_l + B-B_l)',
        'output': 'output_l',
    },
)
# des_df_hlife_edited

In [None]:
# Tag every DES hbond-number column with an "_n" suffix so it cannot collide
# with the lifetime columns when the two tables are concatenated.
des_df_hnum_edited = des_df_hnum.rename(
    columns={
        'A-A': 'A-A_n',
        'B-B': 'B-B_n',
        'A-B': 'A-B_n',
        'A-A/B-B': 'A-A_n/B-B_n',
        'A-B/(A-A + B-B)': 'A-B_n/(A-A_n + B-B_n)',
        'output': 'output_n',
    },
)
# des_df_hnum_edited

#### nondes

In [None]:
# Tag every non-DES lifetime column with an "_l" suffix so it cannot collide
# with the hbond-number columns when the two tables are concatenated.
nondes_df_hlife_edited = nondes_df_hlife.rename(
    columns={
        'A-A': 'A-A_l',
        'B-B': 'B-B_l',
        'A-B': 'A-B_l',
        'A-A/B-B': 'A-A_l/B-B_l',
        'A-B/(A-A + B-B)': 'A-B_l/(A-A_l + B-B_l)',
        'output': 'output_l',
    },
)
# nondes_df_hlife_edited

In [None]:
# Tag every non-DES hbond-number column with an "_n" suffix so it cannot
# collide with the lifetime columns when the two tables are concatenated.
nondes_df_hnum_edited = nondes_df_hnum.rename(
    columns={
        'A-A': 'A-A_n',
        'B-B': 'B-B_n',
        'A-B': 'A-B_n',
        'A-A/B-B': 'A-A_n/B-B_n',
        'A-B/(A-A + B-B)': 'A-B_n/(A-A_n + B-B_n)',
        'output': 'output_n',
    },
)
# nondes_df_hnum_edited

#### merged

In [None]:
# Place the DES hbond-number columns and lifetime columns side by side
# (column-wise concatenation, rows aligned on the index).
des_df_merged_list = [
    des_df_hnum_edited,
    des_df_hlife_edited,
]
des_df_merged = pd.concat(des_df_merged_list, axis="columns")
# des_df_merged

In [None]:
# Place the non-DES hbond-number columns and lifetime columns side by side
# (column-wise concatenation, rows aligned on the index).
nondes_df_merged_list = [
    nondes_df_hnum_edited,
    nondes_df_hlife_edited,
]
nondes_df_merged = pd.concat(nondes_df_merged_list, axis="columns")
# nondes_df_merged

## Hyperparam opt

### GridSearch

#### XGB

In [None]:

# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 300.0,
    'axes.linewidth': 2.0,
})

# Base estimator for the search; early stopping is set on the constructor.
models = {
    "XGB": XGBClassifier(
        tree_method='hist',
        eval_metric='auc',
        objective='binary:logistic',
        n_jobs=multiprocessing.cpu_count(), early_stopping_rounds=50),
}

# Hyper-parameter grid, keyed by the same tag as `models`.
params = {
    'XGB': {'max_depth': [2, 4, 6, 8, 10],
            'n_estimators': [10, 20, 30, 40, 50, 100],
            'learning_rate': [0.01, 0.1],
            'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
            },
}

print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "XGB"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# The estimator's printed form starts with its class name; keep only that part.
model_name = str(model_generic).split('(')[0]
folder_type = 'merged'

log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed even when the
# search raises; closing again is harmless if kingmaker_xgb closes it itself.
with open(log_path, "w+") as file_merged_xgb_repeatK:
    kingmaker_xgb(model_generic, p_repeatK, des_df_merged, nondes_df_merged, cv=cv,
                  file_name=file_merged_xgb_repeatK, folder_type=folder_type,
                  n_repeat=n_repeat, rand_seed=rand_seed, features=features_merged)


#### XGBRF

In [None]:
# XGBRF
# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 300.0,
    'axes.linewidth': 2.0,
})

# Base estimator for the search.
models = {
    "XGBRF": XGBRFClassifier(
        tree_method='hist',
        eval_metric='auc',
        objective='binary:logistic',
        n_jobs=multiprocessing.cpu_count()),
}

# Hyper-parameter grid, keyed by the same tag as `models`.
params = {
    'XGBRF': {'max_depth': [2, 4, 6, 8, 10],
              'n_estimators': [10, 20, 30, 40, 50, 100],
              'learning_rate': [0.01, 0.1],
              'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
              },
}

print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "XGBRF"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# The estimator's printed form starts with its class name; keep only that part.
model_name = str(model_generic).split('(')[0]
print(model_name)
folder_type = 'merged'

log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed even when the
# search raises; closing again is harmless if kingmaker_xgbrf closes it itself.
with open(log_path, "w+") as file_merged_xgb_repeatK:
    kingmaker_xgbrf(model_generic, p_repeatK, des_df_merged, nondes_df_merged, cv=cv,
                    features=features_merged, file_name=file_merged_xgb_repeatK,
                    folder_type=folder_type, n_repeat=n_repeat, rand_seed=rand_seed)


#### LR

In [None]:
# LR
# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

# Base estimator for the search.
models = {
    "LR": LogisticRegression(),
}

# Hyper-parameter grid, keyed by the same tag as `models`.
# NOTE(review): not every penalty/solver pair is supported by
# LogisticRegression, and the 'none' spelling is version-dependent in
# scikit-learn — confirm how kingmaker_lr handles candidates that fail to fit.
params = {
    "LR": {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
           'tol': [1e-4, 1e-6],
           'max_iter': [100, 200, 300, 400, 500],
           'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
           },
}


xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "LR"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# Keep only the class name, exactly as the other cells do. strip('()') only
# worked because this estimator has no arguments; with any argument it would
# leave the whole parameter list in the log filename.
model_name = str(model_generic).split('(')[0]
folder_type = 'merged'

log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed even when the
# search raises; closing again is harmless if kingmaker_lr closes it itself.
with open(log_path, "w+") as file_name:
    kingmaker_lr(model_generic, p_repeatK, des_df_merged, nondes_df_merged, cv=cv,
                 file_name=file_name, folder_type=folder_type, n_repeat=n_repeat,
                 rand_seed=rand_seed, features=features_merged)


#### The others

In [None]:

# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

# Base estimators for the search, keyed by a short tag.
models = {
    "RF": RandomForestClassifier(),
    "EF": ExtraTreesClassifier(),
    "GB": GradientBoostingClassifier(),
    "AB": AdaBoostClassifier(),
    "DT": DecisionTreeClassifier(),
}

# Hyper-parameter grids, keyed by the same tags as `models`.
# "auto" is intentionally absent from max_features: for these classifiers it
# was only an alias of "sqrt" (so it duplicated grid points) and newer
# scikit-learn releases reject it.
params = {
    "RF": {'max_depth': [2, 4, 6, 8, 10],
           'n_estimators': [10, 20, 30, 40, 50, 100],
           'min_samples_leaf': [1, 2, 3],
           'min_samples_split': [2, 4, 6],
           'max_features': ["sqrt", "log2"],
           },
    "EF": {'max_depth': [2, 4, 6, 8, 10],
           'n_estimators': [10, 20, 30, 40, 50, 100],
           'min_samples_leaf': [1, 2, 3],
           'min_samples_split': [2, 4, 6],
           'max_features': ["sqrt", "log2"],
           },
    "GB": {'max_depth': [2, 4, 6, 8, 10],
           'n_estimators': [10, 20, 30, 40, 50, 100],
           'min_samples_leaf': [1, 2, 3],
           'min_samples_split': [2, 4, 6],
           'max_features': ["sqrt", "log2"],
           'learning_rate': [0.01, 0.1],
           },
    "AB": {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
           'learning_rate': [0.001, 0.01, 0.1, 1.0],
           },
    "DT": {'max_depth': [2, 4, 6, 8, 10],
           'min_samples_leaf': [1, 2, 3],
           'min_samples_split': [2, 4, 6],
           'max_features': ["sqrt", "log2"],
           },
}


# Run configuration shared by every model in the loop.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
folder_type = 'merged'

# The output directory does not depend on the model, so create it once.
log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)

for tag, model_generic in models.items():
    print(f"{tag} Parallel Parameter optimization")

    p_repeatK = params[tag]
    # The estimator's printed form starts with its class name; keep only that part.
    model_name = str(model_generic).split('(')[0]

    log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

    # One log per model; the context manager closes it before the next
    # iteration opens another, so handles no longer pile up across the loop.
    with open(log_path, "w+") as file_name:
        kingmaker_generic(model_generic, p_repeatK, des_df_merged, nondes_df_merged, cv=cv,
                          file_name=file_name, folder_type=folder_type, n_repeat=n_repeat,
                          rand_seed=rand_seed, features=features_merged)


#### SVC

In [None]:
# SVC
# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

# Base estimators; only the "SVC" entry has a grid in `params` below.
models = {
    "SVC": SVC(probability=True),
    "SVC1": SVC(probability=True),
}

# Other options that were considered for the grid:
# 'gamma': ['scale', 'auto'],
# 'gamma': [0.001, 0.0001],
# "C": [1, 10, 100, 1000]
params = {
    "SVC": {
        'degree': [3, 5, 7],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'tol': [1e-3, 1e-4, 1e-5, 1e-6],
        "C": [1, 10],
    },
}


xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "SVC"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# The estimator's printed form starts with its class name; keep only that part.
model_name = str(model_generic).split('(')[0]
folder_type = 'merged'

log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed even when the
# search raises; closing again is harmless if kingmaker_svc closes it itself.
with open(log_path, "w+") as file_name:
    kingmaker_svc(model_generic, p_repeatK, des_df_merged, nondes_df_merged, cv=cv,
                  file_name=file_name, folder_type=folder_type, n_repeat=n_repeat,
                  rand_seed=rand_seed, features=features_merged)

#### KNN

In [None]:
# KNN
# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})

# Base estimator for the search.
models = {
    "KNN": KNeighborsClassifier(n_jobs=-1),
}

# Hyper-parameter grid, keyed by the same tag as `models`.
params = {
    "KNN": {'n_neighbors': [5, 7, 9, 11, 13, 15],
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'euclidean', 'manhattan'],
            'leaf_size': [30, 40, 50, 60],
            },
}


xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 10
cv = RepeatedKFold(n_splits=6, n_repeats=n_repeat, random_state=rand_seed)
tag = "KNN"
print(f"{tag} Parallel Parameter optimization")

model_generic = models[tag]
p_repeatK = params[tag]
# The estimator's printed form starts with its class name; keep only that part.
model_name = str(model_generic).split('(')[0]
folder_type = 'merged'

log_dir = f'./model-logs/gridsearch/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed even when the
# search raises; closing again is harmless if kingmaker_knn closes it itself.
with open(log_path, "w+") as file_name:
    kingmaker_knn(model_generic, p_repeatK, des_df_merged, nondes_df_merged, cv=cv,
                  file_name=file_name, folder_type=folder_type, n_repeat=n_repeat,
                  rand_seed=rand_seed, features=features_merged)


### Training loops

#### Logistic Regression

In [None]:
# Logistic Regression

# Figure styling applied to any plots produced while this cell runs.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
})


# Logistic-regression variants, keyed by the short tag selected below.
models = {
    "LR": LogisticRegression(penalty='l1', solver='saga'),
    "LR2": LogisticRegression(max_iter=100, penalty='l2', solver='newton-cg', tol=0.0001),  # old hnum data with errors
}

# Run configuration.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type = 'merged'

tag = "LR"
model_type = models[tag]
# The estimator's printed form starts with its class name; keep only that part.
model_name = str(model_type).split('(')[0]
print(f"{tag} training for {n_repeat} runs")

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed even when
# training raises; closing again is harmless if train_lr closes it itself.
with open(log_path, "w+") as file_name:
    train_lr(model_type, des_df_merged, nondes_df_merged,
             file_name=file_name, folder_type=folder_type, num=n_repeat,
             rand_seed=rand_seed, features=features_merged, model_name=model_name)

#### AB, DT, EF, GB, RF

In [None]:
# AB, DT, EF, GB, RF: repeated training runs, one log file per model.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
})

# max_features='sqrt' replaces the former 'auto': for classifiers 'auto' meant
# sqrt(n_features), and 'auto' was deprecated in scikit-learn 1.1 and removed
# in 1.3, so 'sqrt' keeps the same model while working on current releases.
models = {
    "RF": RandomForestClassifier(max_depth=2, max_features='sqrt', min_samples_leaf=3,
                       min_samples_split=6, n_estimators=50),
    "EF": ExtraTreesClassifier(max_depth=2, max_features='log2', min_samples_leaf=3,
                     n_estimators=20),
    "GB": GradientBoostingClassifier(learning_rate=0.01, max_depth=2, max_features='log2',
                           min_samples_split=4, n_estimators=10),
    "AB": AdaBoostClassifier(learning_rate=0.01, n_estimators=80),
    "DT": DecisionTreeClassifier(max_depth=2, max_features='sqrt', min_samples_leaf=2)
}



xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type='merged'

# The output directory is the same for every model, so create it once.
log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)

for tag, model_type in models.items():
    # Take the name from the class itself instead of parsing the estimator's repr.
    model_name = type(model_type).__name__

    print(f"{tag} training for {n_repeat} runs")

    log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

    # One log per model; closing it here avoids leaking a handle per iteration.
    with open(log_path, "w+") as file_name:
        train_ab_dt_ef_gb_rf(model_type, des_df_merged, nondes_df_merged,
                             file_name=file_name, folder_type=folder_type, num=n_repeat,
                             rand_seed=rand_seed, features=features_merged)

#### SVC

In [None]:
# SVC: repeated training runs logged to a dated text file.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
})


models = {
    "SVC": SVC(C=1, probability=True)
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type='merged'

tag = "SVC"
model_type = models[tag]
# Take the name from the class itself instead of parsing the estimator's repr.
model_name = type(model_type).__name__
print(f"{tag} training for {n_repeat} runs")

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed, even if
# training raises part-way through.
with open(log_path, "w+") as file_name:
    train_svc(model_type, des_df_merged, nondes_df_merged,
              file_name=file_name, folder_type=folder_type, num=n_repeat,
              rand_seed=rand_seed, features=features_merged, model_name=model_name)

#### KNN

In [None]:
# KNN: repeated training runs logged to a dated text file.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
})


models = {
    "KNN-old": KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=13,
                     weights='distance'),
    "KNN": KNeighborsClassifier(n_jobs=-1, n_neighbors=9, weights='distance')
}

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type='merged'

tag = "KNN"
model_type = models[tag]
# Take the name from the class itself instead of parsing the estimator's repr.
model_name = type(model_type).__name__
print(model_name)
print(f"{tag} training for {n_repeat} runs")

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed, even if
# training raises part-way through.
with open(log_path, "w+") as file_name:
    train_knn(model_type, des_df_merged, nondes_df_merged,
              file_name=file_name, folder_type=folder_type, num=n_repeat,
              rand_seed=rand_seed, features=features_merged, model_name=model_name)

#### XGB

In [None]:


# XGB training loop: repeated training runs logged to a dated text file.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
    'axes.linewidth':2.0,
})

# "XGB-old" is kept for reference; only the entry selected by `tag` is trained.
models = {    
    "XGB-old": XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3,
              enable_categorical=False, eval_metric='auc', gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=40, n_jobs=12, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None),
    "XGB": XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, 
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, monotone_constraints=None,
              n_estimators=20, n_jobs=12, num_parallel_tree=None,
              predictor=None, random_state=None)           
}



xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type='merged'

tag = "XGB"
model_type = models[tag]
# The class name is always exactly "XGBClassifier" here, so the earlier
# repr parsing and its substring-based clean-up are no longer needed.
model_name = type(model_type).__name__

print(f"{tag} training for {n_repeat} runs")

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed, even if
# training raises part-way through.
with open(log_path, "w+") as file_name:
    train_xgb(model_type, des_df_merged, nondes_df_merged,
              file_name=file_name, folder_type=folder_type, num=n_repeat,
              rand_seed=rand_seed, features=features_merged)


#### XGBRF

In [None]:

# XGBRF: repeated training runs logged to a dated text file.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
    'axes.linewidth':2.0,
})

# "XGBRF-old" is kept for reference; only the entry selected by `tag` is trained.
models = {    
    "XGBRF-old": XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=0.5,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric='auc', feature_types=None, gamma=None, gpu_id=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.01, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=2, max_leaves=None,
                min_child_weight=None, monotone_constraints=None,
                n_estimators=100, n_jobs=12, num_parallel_tree=None,
                objective='binary:logistic', predictor=None, random_state=None),
    "XGBRF": XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=0.5,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric='auc', feature_types=None, gamma=None, gpu_id=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.01, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=2, max_leaves=None,
                min_child_weight=None, monotone_constraints=None,
                n_estimators=30, n_jobs=12, num_parallel_tree=None,
                objective='binary:logistic', predictor=None, random_state=None) 
}



xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type='merged'

tag = "XGBRF"
model_type = models[tag]
# Take the name from the class itself instead of parsing the estimator's repr.
model_name = type(model_type).__name__

print(f"{tag} training for {n_repeat} runs")

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# The context manager guarantees the log is flushed and closed, even if
# training raises part-way through.
with open(log_path, "w+") as file_name:
    train_xgb(model_type, des_df_merged, nondes_df_merged,
              file_name=file_name, folder_type=folder_type, num=n_repeat,
              rand_seed=rand_seed, features=features_merged)


#### XGB and XGBRF eval

In [None]:
# XGB / XGBRF evaluation run; the model is selected with `tag` below.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
    'axes.linewidth':2.0,
})

models = {    
    "XGB": XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, 
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, monotone_constraints=None,
              n_estimators=20, n_jobs=12, num_parallel_tree=None,
              predictor=None, random_state=None),
     "XGBRF": XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=0.5,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric='auc', feature_types=None, gamma=None, gpu_id=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.01, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=2, max_leaves=None,
                min_child_weight=None, monotone_constraints=None,
                n_estimators=30, n_jobs=12, num_parallel_tree=None,
                objective='binary:logistic', predictor=None, random_state=None) 
}



xdate = datetime.datetime.now().strftime("%m-%d-%Y")
rand_seed = 100
n_repeat = 100
folder_type='merged'

tag = "XGBRF"
model_type = models[tag]
# The class name is correct for both XGBClassifier and XGBRFClassifier. The
# previous substring check for "XGBClassifier" could never match the XGBRF
# model selected here.
model_name = type(model_type).__name__

print(f"{tag} training for {n_repeat} runs")

log_dir = f'./model-logs/training/{folder_type}/{xdate}'
dirmaker(log_dir)
log_path = f"{log_dir}/{model_name}_eval_{folder_type}_{n_repeat}_{xdate}_{rand_seed}.txt"

# Use this for getting validation metrics plotted alongside training metrics.
# The context manager guarantees the log is flushed and closed, even if
# training raises part-way through.
with open(log_path, "w+") as file_name:
    train_xgb_eval(model_type, des_df_merged, nondes_df_merged,
                   file_name=file_name, folder_type=folder_type, num=n_repeat,
                   rand_seed=rand_seed, features=features_merged)


#### XGBoost with GridSearchCV
This repeats the grid search in a for-loop, calling `df_generator` for a new train/test split on each iteration.

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Repeated XGBoost grid search. Each of the `num` runs requests a train/test
# split from df_generator, runs GridSearchCV on the training part, and logs the
# best estimator's test-set metrics and feature importances to a text file.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})  # 'figure.figsize': [14.0, 10.0],


print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
xgb_roc_auc_grid = []
# One counter per feature: how many runs ranked that feature as most important.
# Sized from the feature list so it cannot drift out of sync with it.
coeff_xgb_grid = [0] * len(features_merged)
num = 10

xgb_class_mergedgrid = XGBClassifier(
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='auc',
    objective='binary:logistic',
    n_jobs=multiprocessing.cpu_count() // 2)  # tree_method='gpu_hist', enable_categorical=True, predictor='gpu_predictor',

p_grid = {'max_depth': [2, 4, 6, 8, 10],
          'n_estimators': [10, 20, 30, 40, 50, 100],
          'learning_rate': [0.01, 0.1],
          'colsample_bytree': [0.1, 0.2, 0.3],
          }

target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1

# The context manager closes the log even if a run raises part-way through.
with open(f"./model-logs/XGB_merged_gridsearch_{num}_{xdate}.txt", "w+") as file_merged_xgb_grid:
    for run_idx in trange(num):
        X_train_merged, y_train_merged, X_test_merged, y_test_merged = df_generator(
            des_df_merged, nondes_df_merged, test_sample_size=8, nondes_batch_size=38)
        xgb_clf = GridSearchCV(xgb_class_mergedgrid,
                               param_grid=p_grid, verbose=1, scoring='roc_auc')

        # NOTE(review): the held-out test split is also the early-stopping
        # eval_set, so it influences training - confirm this is intended.
        xgb_class_merged_grid = xgb_clf.fit(X_train_merged, y_train_merged, eval_set=[(
            X_test_merged, y_test_merged)], verbose=False, early_stopping_rounds=50)

        # Look these up once per run instead of on every use.
        best_xgb_grid = xgb_class_merged_grid.best_estimator_
        importances_grid = best_xgb_grid.feature_importances_

        print('Best score', xgb_class_merged_grid.best_score_,
              file=file_merged_xgb_grid)
        print('Best params', xgb_class_merged_grid.best_params_,
              file=file_merged_xgb_grid)
        y_pred_merged_xgb = best_xgb_grid.predict(X_test_merged)
        print(metrics.classification_report(y_test_merged, y_pred_merged_xgb,
              target_names=target_names), file=file_merged_xgb_grid)
        # NOTE(review): ROC-AUC is computed from hard class predictions rather
        # than predicted probabilities - confirm this is the intended metric.
        roc_auc_xgb = metrics.roc_auc_score(y_test_merged, y_pred_merged_xgb)
        print(f"roc_auc_score: {roc_auc_xgb}", file=file_merged_xgb_grid)
        xgb_roc_auc_grid.append(roc_auc_xgb)
        print(importances_grid, file=file_merged_xgb_grid)

        xgb_class_merged_grid_feature_df = pd.DataFrame(
            {'Importance': importances_grid, 'Features_merged': features_merged})
        # Index of the first maximum, as a plain int usable as a list index.
        max_coeff_index = int(np.argmax(importances_grid))
        print(
            f'max feature: {importances_grid.max()} at index {max_coeff_index} [{features_merged[max_coeff_index]}]',
            file=file_merged_xgb_grid)
        coeff_xgb_grid[max_coeff_index] += 1
        print('\n', file=file_merged_xgb_grid)
        print(xgb_class_merged_grid_feature_df, file=file_merged_xgb_grid)
        print('\n'*2, file=file_merged_xgb_grid)

    # Summary over all runs. The "Best model" lines below describe the
    # estimator from the final run only, which is the one still in scope.
    print(
        f"roc_auc scores on test set: {xgb_roc_auc_grid}", file=file_merged_xgb_grid)
    print(
        f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_grid)}", file=file_merged_xgb_grid)
    print(
        f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_grid)}", file=file_merged_xgb_grid)
    print(
        f"Best roc_auc score on test set: {max(xgb_roc_auc_grid)} at index {xgb_roc_auc_grid.index(max(xgb_roc_auc_grid)) + 1}", file=file_merged_xgb_grid)
    print(
        f"Best model's roc_auc score: {best_xgb_grid.best_score}", file=file_merged_xgb_grid)
    print(
        f"Best model's iteration: {best_xgb_grid.best_iteration}", file=file_merged_xgb_grid)
    print(
        f"model's eval_results: {best_xgb_grid.evals_result()}", file=file_merged_xgb_grid)
    print('\n', file=file_merged_xgb_grid)
    coeff_df_xgb = pd.DataFrame(
        {'Top Features_merged': coeff_xgb_grid, 'Features_merged': features_merged})
    print(f"Coefficients: {coeff_df_xgb}", file=file_merged_xgb_grid)


# plotting roc_auc score
fig = plt.figure()
fig_ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(12, 8, forward=True)
fig_ax.set_xlabel("Number of runs")  # , fontsize=14, weight='bold')
fig_ax.set_ylabel("ROC-AUC score")  # , fontsize=14, weight='bold')
fig_ax.set_ylim(0, 1.0)
fig_ax.set_xlim(0, num+2)
plt.title('XGBoost hbond number + lifetime', fontsize=12, weight='bold')

roc = list(range(1, num + 1))
fig_ax.plot(roc, xgb_roc_auc_grid, '-o', linewidth=2, markersize=8.0,
            label=(f"avg roc_auc: {round(np.average(xgb_roc_auc_grid),2)} \n "
                   f"std roc_auc : {round(np.std(xgb_roc_auc_grid),2)}"))
plt.legend(loc='upper left')
dirmaker(f'./plots/roc-auc/merged/{xdate}')
# fig.savefig(f'plots/roc-auc/merged/XGB_merged_gridsearch_{num}-{xdate}.png', dpi=500,facecolor='white', bbox_inches='tight')
# NOTE(review): plot_importance appears to order its bars by importance, so
# relabelling the ticks with features_merged in list order may mislabel
# them - confirm the labels line up.
xgb.plot_importance(best_xgb_grid).set_yticklabels(features_merged)
plt.show()


In [None]:
# Display the hyper-parameter names accepted by a default XGBClassifier.
XGBClassifier().get_params().keys()


In [None]:
# Display the best hyper-parameters of the grid search currently held in
# xgb_class_merged_grid.
xgb_class_merged_grid.best_params_


### XGBoost GridSearchCV with RepeatedKFold
Using `RepeatedKFold` as the cross-validation splitter supplies the repetitions, so no explicit for-loop is needed.

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.metrics import roc_auc_score

# XGBoost grid search with RepeatedKFold cross-validation on a single
# train/test split. The best estimator's test-set metrics, feature importances
# and per-round AUC history are logged to a text file and plotted.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})  # 'figure.figsize': [14.0, 10.0],


print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
xgb_roc_auc_repeatK = []
# One counter per feature, sized from the feature list so it cannot drift out
# of sync with it.
coeff_xgb_repeatK = [0] * len(features_merged)
# num = 10
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1000)

xgb_class_mergedrepeatK = XGBClassifier(
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='auc',
    objective='binary:logistic',
    n_jobs=multiprocessing.cpu_count() // 2)  # tree_method='gpu_hist', enable_categorical=True, predictor='gpu_predictor',

p_repeatK = {'max_depth': [2, 4, 6, 8, 10],
             'n_estimators': [10, 20, 30, 40, 50, 100],
             'learning_rate': [0.01, 0.1],
             # ,0.6,0.7,0.8,0.9,1.0
             'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
             }


X_train_merged, y_train_merged, X_test_merged, y_test_merged = df_generator(
    des_df_merged, nondes_df_merged, test_sample_size=8, nondes_batch_size=38)

xgb_clf = GridSearchCV(xgb_class_mergedrepeatK,
                       param_grid=p_repeatK, verbose=1, scoring='roc_auc', cv=cv)

target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1

dirmaker(f'./model-logs/merged/{xdate}')
# The log is opened before fitting so path problems surface early; the context
# manager closes it even if the search or the reporting raises.
with open(f"./model-logs/merged/{xdate}/XGB_merged_repeatKFold_{xdate}.txt", "w+") as file_merged_xgb_repeatK:
    # NOTE(review): the held-out test split is part of the early-stopping
    # eval_set, so it influences training - confirm this is intended.
    xgb_class_merged_repeatK = xgb_clf.fit(X_train_merged, y_train_merged, eval_set=[(X_train_merged, y_train_merged), (
        X_test_merged, y_test_merged)], verbose=False, early_stopping_rounds=50)

    # Look these up once instead of on every use.
    best_xgb_repeatK = xgb_class_merged_repeatK.best_estimator_
    importances_repeatK = best_xgb_repeatK.feature_importances_

    print('Best score', xgb_class_merged_repeatK.best_score_,
          file=file_merged_xgb_repeatK)
    print('Best params', xgb_class_merged_repeatK.best_params_,
          file=file_merged_xgb_repeatK)
    y_pred_merged_xgb = best_xgb_repeatK.predict(X_test_merged)
    print(metrics.classification_report(y_test_merged, y_pred_merged_xgb,
          target_names=target_names), file=file_merged_xgb_repeatK)
    # NOTE(review): ROC-AUC is computed from hard class predictions rather
    # than predicted probabilities - confirm this is the intended metric.
    roc_auc_xgb = metrics.roc_auc_score(y_test_merged, y_pred_merged_xgb)
    print(f"roc_auc_score: {roc_auc_xgb}", file=file_merged_xgb_repeatK)
    xgb_roc_auc_repeatK.append(roc_auc_xgb)
    print(importances_repeatK, file=file_merged_xgb_repeatK)

    xgb_class_merged_repeatK_feature_df = pd.DataFrame(
        {'Importance': importances_repeatK, 'Features_merged': features_merged})
    # Index of the first maximum, as a plain int usable as a list index.
    max_coeff_index = int(np.argmax(importances_repeatK))
    print(
        f'max feature: {importances_repeatK.max()} at index {max_coeff_index} [{features_merged[max_coeff_index]}]',
        file=file_merged_xgb_repeatK)
    coeff_xgb_repeatK[max_coeff_index] += 1
    print('\n', file=file_merged_xgb_repeatK)
    print(xgb_class_merged_repeatK_feature_df, file=file_merged_xgb_repeatK)
    print('\n'*2, file=file_merged_xgb_repeatK)

    # roc = [ z for z in range(1,num+1)]
    print(
        f"roc_auc scores on test set: {xgb_roc_auc_repeatK}", file=file_merged_xgb_repeatK)
    print(
        f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_repeatK)}", file=file_merged_xgb_repeatK)
    print(
        f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_repeatK)}", file=file_merged_xgb_repeatK)
    print(
        f"Best roc_auc score on test set: {max(xgb_roc_auc_repeatK)} at index {xgb_roc_auc_repeatK.index(max(xgb_roc_auc_repeatK)) + 1}", file=file_merged_xgb_repeatK)
    print(
        f"Best model's roc_auc score from early stopping: {best_xgb_repeatK.best_score}", file=file_merged_xgb_repeatK)
    print(
        f"Best model's iteration from early stopping: {best_xgb_repeatK.best_iteration}", file=file_merged_xgb_repeatK)
    # print(f"model's eval_results: {xgb_class_merged_repeatK.best_estimator_.evals_result()}", file=file_merged_xgb_repeatK)

    # evals_result() is keyed by eval_set position: 'validation_0' is the
    # training pair and 'validation_1' the test pair passed to fit() above.
    # The first (metric, scores) item of each is used below.
    evals_repeatK = best_xgb_repeatK.evals_result()
    train_eval = list(evals_repeatK['validation_0'].items())
    train_auc_repeatK = train_eval[0][1]
    print(
        f'Number of training auc scores: {len(train_auc_repeatK)}', file=file_merged_xgb_repeatK)
    print(
        f'auc scores of training set: {train_auc_repeatK}', file=file_merged_xgb_repeatK)
    print(f'Average and std-dev of auc scores of training set: {round(np.average(train_auc_repeatK),2)}, '
          f'{round(np.std(train_auc_repeatK),2)} \n', file=file_merged_xgb_repeatK)

    val_eval = list(evals_repeatK['validation_1'].items())
    val_auc_repeatK = val_eval[0][1]
    print(
        f'Number of testing auc scores: {len(val_auc_repeatK)}', file=file_merged_xgb_repeatK)
    print(
        f'auc scores of testing set: {val_auc_repeatK}', file=file_merged_xgb_repeatK)
    print(f'Average and std-dev of auc scores of testing set: {round(np.average(val_auc_repeatK),2)}, '
          f'{round(np.std(val_auc_repeatK),2)} \n', file=file_merged_xgb_repeatK)

    print('\n', file=file_merged_xgb_repeatK)
    coeff_df_xgb = pd.DataFrame(
        {'Top Features_merged': coeff_xgb_repeatK, 'Features_merged': features_merged})
    print(f"Coefficients: {coeff_df_xgb} \n", file=file_merged_xgb_repeatK)

    print(f'Best estimator: {best_xgb_repeatK} \n',
          file=file_merged_xgb_repeatK)
    print(f'Best params: {xgb_class_merged_repeatK.best_params_} \n',
          file=file_merged_xgb_repeatK)
    print(
        f"Best estimator's score from early stopping: {best_xgb_repeatK.best_score} \n", file=file_merged_xgb_repeatK)

# plotting roc_auc score
fig = plt.figure()
fig_ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(12, 8, forward=True)
fig_ax.set_xlabel("Number of runs")
fig_ax.set_ylabel("ROC-AUC score")
fig_ax.set_ylim(0, 1.1)
plt.title('XGBoost (Repeated KFold) hbond number + lifetime',
          fontsize=12, weight='bold')

fig_ax.plot(range(1, len(train_auc_repeatK)+1), train_auc_repeatK, '-o', linewidth=2, markersize=8.0,
            label=(f"avg training roc_auc: {round(np.average(train_auc_repeatK),2)}\n "
                   f"std training roc_auc : {round(np.std(train_auc_repeatK),2)}"))

fig_ax.plot(range(1, len(val_auc_repeatK)+1), val_auc_repeatK, '-o', linewidth=2, markersize=8.0,
            label=(f"avg testing roc_auc: {round(np.average(val_auc_repeatK),2)}\n "
                   f"std testing roc_auc : {round(np.std(val_auc_repeatK),2)}"))

plt.legend(loc='best')
dirmaker(f'./plots/roc-auc/merged/{xdate}')
fig.savefig(f'plots/roc-auc/merged/{xdate}/XGB_merged_repeatKsearch-{xdate}.png',
            dpi=500, facecolor='white', bbox_inches='tight')

# NOTE(review): plot_importance appears to order its bars by importance, so
# relabelling the ticks with features_merged in list order may mislabel
# them - confirm the labels line up.
xgb.plot_importance(best_xgb_repeatK).set_yticklabels(features_merged)
plt.show()


### XGBoost GridSearchCV with Stratified RepeatedKFold
Using `RepeatedStratifiedKFold` as the cross-validation splitter supplies the repetitions, so no explicit for-loop is needed.

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.metrics import roc_auc_score

# XGBoost grid search with RepeatedStratifiedKFold cross-validation on a single
# train/test split. The best estimator's test-set metrics, feature importances
# and per-round AUC history are logged to a text file and plotted.

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    'font.size': 18,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
})  # 'figure.figsize': [14.0, 10.0],


print("Parallel Parameter optimization")
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
xgb_roc_auc_stratK = []
# One counter per feature, sized from the feature list so it cannot drift out
# of sync with it.
coeff_xgb_stratK = [0] * len(features_merged)
# num = 10
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1000)

xgb_class_mergedstratK = XGBClassifier(
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='auc',
    objective='binary:logistic',
    n_jobs=multiprocessing.cpu_count() // 2)  # tree_method='gpu_hist', enable_categorical=True, predictor='gpu_predictor',

p_stratK = {'max_depth': [2, 4, 6, 8, 10],
            'n_estimators': [10, 20, 30, 40, 50, 100],
            'learning_rate': [0.01, 0.1],
            # ,0.6,0.7,0.8,0.9,1.0
            'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
            }


X_train_merged, y_train_merged, X_test_merged, y_test_merged = df_generator(
    des_df_merged, nondes_df_merged, test_sample_size=8, nondes_batch_size=38)

xgb_clf = GridSearchCV(xgb_class_mergedstratK,
                       param_grid=p_stratK, verbose=1, scoring='roc_auc', cv=cv)

target_names = ['non-DES', 'DES']  # non-DES is 0, DES is 1

dirmaker(f'./model-logs/merged/{xdate}')
# The log is opened before fitting so path problems surface early; the context
# manager closes it even if the search or the reporting raises.
with open(f"./model-logs/merged/{xdate}/XGB_merged_stratKFold_{xdate}.txt", "w+") as file_merged_xgb_stratK:
    # NOTE(review): the held-out test split is part of the early-stopping
    # eval_set, so it influences training - confirm this is intended.
    xgb_class_merged_stratK = xgb_clf.fit(X_train_merged, y_train_merged, eval_set=[(X_train_merged, y_train_merged), (
        X_test_merged, y_test_merged)], verbose=False, early_stopping_rounds=50)

    # Look these up once instead of on every use.
    best_xgb_stratK = xgb_class_merged_stratK.best_estimator_
    importances_stratK = best_xgb_stratK.feature_importances_

    print('Best score', xgb_class_merged_stratK.best_score_,
          file=file_merged_xgb_stratK)
    print('Best params', xgb_class_merged_stratK.best_params_,
          file=file_merged_xgb_stratK)
    y_pred_merged_xgb = best_xgb_stratK.predict(X_test_merged)
    print(metrics.classification_report(y_test_merged, y_pred_merged_xgb,
          target_names=target_names), file=file_merged_xgb_stratK)
    # NOTE(review): ROC-AUC is computed from hard class predictions rather
    # than predicted probabilities - confirm this is the intended metric.
    roc_auc_xgb = metrics.roc_auc_score(y_test_merged, y_pred_merged_xgb)
    print(f"roc_auc_score: {roc_auc_xgb}", file=file_merged_xgb_stratK)
    xgb_roc_auc_stratK.append(roc_auc_xgb)
    print(importances_stratK, file=file_merged_xgb_stratK)

    xgb_class_merged_stratK_feature_df = pd.DataFrame(
        {'Importance': importances_stratK, 'Features_merged': features_merged})
    # Index of the first maximum, as a plain int usable as a list index.
    max_coeff_index = int(np.argmax(importances_stratK))
    print(
        f'max feature: {importances_stratK.max()} at index {max_coeff_index} [{features_merged[max_coeff_index]}]',
        file=file_merged_xgb_stratK)
    coeff_xgb_stratK[max_coeff_index] += 1
    print('\n', file=file_merged_xgb_stratK)
    print(xgb_class_merged_stratK_feature_df, file=file_merged_xgb_stratK)
    print('\n'*2, file=file_merged_xgb_stratK)

    # roc = [ z for z in range(1,num+1)]
    print(
        f"roc_auc scores on test set: {xgb_roc_auc_stratK}", file=file_merged_xgb_stratK)
    print(
        f"Average roc_auc scores on test set: {np.average(xgb_roc_auc_stratK)}", file=file_merged_xgb_stratK)
    print(
        f"std dev of roc_auc scores on test set: {np.std(xgb_roc_auc_stratK)}", file=file_merged_xgb_stratK)
    print(
        f"Best roc_auc score on test set: {max(xgb_roc_auc_stratK)} at index {xgb_roc_auc_stratK.index(max(xgb_roc_auc_stratK)) + 1}", file=file_merged_xgb_stratK)
    print(
        f"Best model's roc_auc score from early stopping: {best_xgb_stratK.best_score}", file=file_merged_xgb_stratK)
    print(
        f"Best model's iteration from early stopping: {best_xgb_stratK.best_iteration} \n", file=file_merged_xgb_stratK)
    # print(f"model's eval_results: {xgb_class_merged_stratK.best_estimator_.evals_result()}", file=file_merged_xgb_stratK)

    # evals_result() is keyed by eval_set position: 'validation_0' is the
    # training pair and 'validation_1' the test pair passed to fit() above.
    # The first (metric, scores) item of each is used below.
    evals_stratK = best_xgb_stratK.evals_result()
    train_eval = list(evals_stratK['validation_0'].items())
    train_auc_stratK = train_eval[0][1]
    print(
        f'Number of training auc scores: {len(train_auc_stratK)}', file=file_merged_xgb_stratK)
    print(
        f'auc scores of training set: {train_auc_stratK}', file=file_merged_xgb_stratK)
    print(f'Average and std-dev of auc scores of training set: {round(np.average(train_auc_stratK),2)}, '
          f'{round(np.std(train_auc_stratK),2)} \n', file=file_merged_xgb_stratK)

    val_eval = list(evals_stratK['validation_1'].items())
    val_auc_stratK = val_eval[0][1]
    print(
        f'Number of testing auc scores: {len(val_auc_stratK)}', file=file_merged_xgb_stratK)
    print(
        f'auc scores of testing set: {val_auc_stratK}', file=file_merged_xgb_stratK)
    print(f'Average and std-dev of auc scores of testing set: {round(np.average(val_auc_stratK),2)}, '
          f'{round(np.std(val_auc_stratK),2)} \n', file=file_merged_xgb_stratK)

    print('\n', file=file_merged_xgb_stratK)
    coeff_df_xgb = pd.DataFrame(
        {'Top Features_merged': coeff_xgb_stratK, 'Features_merged': features_merged})
    print(f"Coefficients: {coeff_df_xgb}", file=file_merged_xgb_stratK)

    print(f'Best estimator: {best_xgb_stratK} \n',
          file=file_merged_xgb_stratK)
    print(f'Best params: {xgb_class_merged_stratK.best_params_} \n',
          file=file_merged_xgb_stratK)
    print(
        f"Best estimator's score from early stopping: {best_xgb_stratK.best_score} \n", file=file_merged_xgb_stratK)

# plotting roc_auc score
fig = plt.figure()
fig_ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(12, 8, forward=True)
fig_ax.set_xlabel("Number of runs")  # , fontsize=14, weight='bold')
fig_ax.set_ylabel("ROC-AUC score")  # , fontsize=14, weight='bold')
fig_ax.set_ylim(0, 1.1)
# fig_ax.set_xlim(0, num+2)
plt.title('XGBoost hbond number + lifetime', fontsize=12, weight='bold')
fig_ax.plot(range(1, len(train_auc_stratK)+1), train_auc_stratK, '-o', linewidth=2, markersize=8.0,
            label=(f"avg training roc_auc: {round(np.average(train_auc_stratK),2)}\n "
                   f"std training roc_auc : {round(np.std(train_auc_stratK),2)}"))

fig_ax.plot(range(1, len(val_auc_stratK)+1), val_auc_stratK, '-o', linewidth=2, markersize=8.0,
            label=(f"avg testing roc_auc: {round(np.average(val_auc_stratK),2)}\n "
                   f"std testing roc_auc : {round(np.std(val_auc_stratK),2)}"))

plt.legend(loc='best')
dirmaker(f'./plots/roc-auc/merged/{xdate}')
fig.savefig(f'plots/roc-auc/merged/{xdate}/XGB_merged_stratKsearch-{xdate}.png',
            dpi=500, facecolor='white', bbox_inches='tight')

# NOTE(review): plot_importance appears to order its bars by importance, so
# relabelling the ticks with features_merged in list order may mislabel
# them - confirm the labels line up.
xgb.plot_importance(best_xgb_stratK).set_yticklabels(features_merged)
plt.show()


## Boxplots

In [None]:
# Summary statistics (pandas describe) for each of the four data frames; the
# boxplot cells below read their quartile rows.
des_hlife_sum, des_hnum_sum, nondes_hlife_sum, nondes_hnum_sum = (
    frame.describe()
    for frame in (des_df_hlife, des_df_hnum, nondes_df_hlife, nondes_df_hnum)
)

In [None]:
# Display one summary table; swap which line is uncommented to inspect another.
des_hlife_sum
# des_hnum_sum
# nondes_hlife_sum
# nondes_hnum_sum

In [None]:
# Choose which feature columns are box-plotted. Three selections are possible;
# edit the tuple of dropped names below to switch between them:
#   * all tags:              drop only 'output'
#   * excluding the ratios:  drop 'output', 'A-A/B-B', 'A-B/(A-A + B-B)'
#   * ratios only (active):  drop 'output', 'A-A', 'B-B', 'A-B'
taggsss = list(nondes_df_hnum.columns)
for dropped_tag in ('output', 'A-A', 'B-B', 'A-B'):
    taggsss.remove(dropped_tag)

In [None]:
def df_boxplot(df, list_of_tags):
    '''Convert summary statistics into the dictionaries that ``Axes.bxp`` draws.

    Parameters
    ----------
    df : table whose rows are labelled 'min', '25%', '50%', '75%' and 'max'
        (the layout produced by ``DataFrame.describe()``).
    list_of_tags : column names of ``df`` to convert, in plotting order.

    Returns
    -------
    list of dict, one per tag, with the keys 'label', 'whislo', 'q1', 'med',
    'q3', 'whishi' and an empty 'fliers' list. The whiskers are the column
    minimum and maximum, so no outliers are reported.
    '''
    def _stats_for(tag):
        column = df[tag]
        return {
            'label': tag,                   # hbond feature name
            'whislo': column.loc['min'],    # bottom whisker
            'q1': column.loc['25%'],        # first quartile
            'med': column.loc['50%'],       # median
            'q3': column.loc['75%'],        # third quartile
            'whishi': column.loc['max'],    # top whisker
            'fliers': [],                   # no outliers drawn
        }

    return [_stats_for(tag) for tag in list_of_tags]

In [None]:
# bxp-ready statistics for each summary table, restricted to the columns in taggsss.
deshnum_list, nondeshnum_list, deshlife_list, nondeshlife_list = (
    df_boxplot(summary, taggsss)
    for summary in (des_hnum_sum, nondes_hnum_sum, des_hlife_sum, nondes_hlife_sum)
)

In [None]:

def boxplotter(df_list, title_tag, fig_title, data_type='hnum'):
    '''Draw a box-and-whisker figure from precomputed statistics and save it as a TIFF.

    Parameters
    ----------
    df_list : list of statistics dictionaries in the form accepted by ``Axes.bxp``.
    title_tag : figure title text; currently not drawn because the ``set_title``
        call below is disabled.
    fig_title : stem of the output file name.
    data_type : 'hlife' caps the y axis at 10; any other value caps it at 105.

    The image goes to ``./plots/boxplots/<date>/<fig_title>_<date>.tiff`` where
    ``<date>`` is today's date as MM-DD-YYYY, and the figure is then shown.
    '''
    xdate = datetime.datetime.now().strftime("%m-%d-%Y")
    line_style = dict(linestyle='-', linewidth=3.5)

    fig_1, ax_1 = plt.subplots(1, 1)
    fig_1.set_size_inches(12, 10, forward=True)

    ax_1.bxp(
        df_list,
        showfliers=False,
        boxprops=dict(line_style),
        flierprops=dict(linestyle='-', linewidth=5.5),
        medianprops=dict(line_style),
        whiskerprops=dict(line_style),
        capprops=dict(line_style),
    )

    # 'hnum' and every unrecognised data_type share the same upper limit.
    upper = 10 if data_type == 'hlife' else 105
    ax_1.set_ylim(bottom=0, top=upper)
    # ax_1.set_title(f'{title_tag}', weight='bold')

    dirmaker(f'./plots/boxplots/{xdate}')
    # plt.savefig(f"./plots/boxplots/{xdate}/{fig_title}_ratios_{xdate}.tiff", facecolor="white", bbox_inches="tight", dpi=350)
    plt.savefig(f"./plots/boxplots/{xdate}/{fig_title}_{xdate}.tiff", facecolor="white", bbox_inches="tight", dpi=350)
    plt.show()

In [None]:

def boxplotter_ratios(df_list, title_tag, fig_title, data_type='hnum'):
    '''Draw a box-and-whisker figure for the ratio features and save it as a TIFF.

    Parameters
    ----------
    df_list : list of statistics dictionaries in the form accepted by ``Axes.bxp``.
    title_tag : figure title text; currently not drawn because the ``set_title``
        call below is disabled.
    fig_title : stem of the output file name.
    data_type : 'hnum' caps the y axis at 60, 'hlife' at 31, anything else at 105.

    The image goes to ``./plots/boxplots/<date>/<fig_title>_ratios_<date>.tiff``
    where ``<date>`` is today's date as MM-DD-YYYY, and the figure is then shown.
    '''
    xdate = datetime.datetime.now().strftime("%m-%d-%Y")
    line_style = dict(linestyle='-', linewidth=3.5)

    fig_1, ax_1 = plt.subplots(1, 1)
    fig_1.set_size_inches(12, 10, forward=True)

    ax_1.bxp(
        df_list,
        showfliers=False,
        boxprops=dict(line_style),
        flierprops=dict(linestyle='-', linewidth=5.5),
        medianprops=dict(line_style),
        whiskerprops=dict(line_style),
        capprops=dict(line_style),
    )

    # Upper y limit depends on which quantity the ratios were computed from.
    upper = 60 if data_type == 'hnum' else (31 if data_type == 'hlife' else 105)
    ax_1.set_ylim(bottom=0, top=upper)
    # ax_1.set_title(f'{title_tag}', weight='bold')

    dirmaker(f'./plots/boxplots/{xdate}')
    plt.savefig(f"./plots/boxplots/{xdate}/{fig_title}_ratios_{xdate}.tiff", facecolor="white", bbox_inches="tight", dpi=350)
    plt.show()

In [None]:
# Global matplotlib settings for the boxplot figures; this update stays in
# effect for later cells until they call rcParams.update again.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 30,
    "ytick.labelsize": 30,
    'font.size': 34,
    'axes.labelweight': 'bold',
    'figure.dpi': 350.0,
    'axes.linewidth':2.0,
})

# Title text per data set, keyed '<system>_<datatype>'.
titles = {
        'des_hnum':'DES hydrogen bond numbers per molecule',
        'nondes_hnum':'non-DES hydrogen bond numbers per molecule',
        'des_hlife':'DES hydrogen bond lifetimes per molecule',
        'nondes_hlife':'non-DES hydrogen bond lifetimes per molecule',
    }

# bxp statistics per data set, using the same keys as `titles`.
df_lists = {
        'des_hnum':deshnum_list,
        'nondes_hnum':nondeshnum_list,
        'des_hlife':deshlife_list,
        'nondes_hlife':nondeshlife_list,
    }

# One figure per data set. The part of the key after '_' ('hnum' / 'hlife')
# is passed as data_type and the key itself becomes the output file stem.
for key in df_lists.keys():
    titletag = key
    dtype = titletag.split('_')[1]
    # boxplotter(df_lists[titletag], titles[titletag], titletag, data_type=dtype)
    boxplotter_ratios(df_lists[titletag], titles[titletag], titletag, data_type=dtype)

## Scatter plots

In [None]:
# Lifetime frames without the three raw hbond columns and the class label.
_hlife_dropped = ['A-A', 'B-B', 'A-B', 'output']
des_df_hlife_scatter = des_df_hlife.drop(columns=_hlife_dropped)
nondes_df_hlife_scatter = nondes_df_hlife.drop(columns=_hlife_dropped)

In [None]:
# Display the non-DES lifetime frame after the column drop above.
nondes_df_hlife_scatter

## hlife

In [None]:
# ratio of inter-component hlife to intra-component hlife vs ratio of intra-component hlife
# x axis: 'A-A/B-B' column, y axis: 'A-B/(A-A + B-B)' column, for both frames.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 30,
    "ytick.labelsize": 30,
    'font.size': 34,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
    'axes.linewidth':2.0,
})

# Today's date (MM-DD-YYYY) names the output folder and file.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")

fig_2, ax_2 = plt.subplots(1,1)
fig_2.set_size_inches(12,10, forward=True)  

# DES points are opaque; non-DES points are drawn second at 50% alpha.
ax_2.scatter(des_df_hlife_scatter['A-A/B-B'], des_df_hlife_scatter['A-B/(A-A + B-B)'], s=200, label='DES', edgecolors='black')
ax_2.scatter(nondes_df_hlife_scatter['A-A/B-B'], nondes_df_hlife_scatter['A-B/(A-A + B-B)'], s=200, label='non-DES', alpha=0.5, edgecolors='black')
ax_2.set_xlabel('\n A-A/B-B')
ax_2.set_ylabel('A-B/(A-A + B-B) \n')
# Fixed limits: y in [0, 2.6]; only the right x limit (61) is pinned.
ax_2.set_ylim(top=2.6, bottom=0)
ax_2.set_xlim(right=61)
ax_2.legend(loc='upper right')
dirmaker(f'./plots/scatterplots/{xdate}')
plt.savefig(f"./plots/scatterplots/{xdate}/hlife_ratios_inter_vs_intra_{xdate}_alpha5.tiff", facecolor="white", bbox_inches="tight", dpi=350)

In [None]:
# ratio of intra-component hlife vs ratio of inter-component hlife to intra-component hlife
# Same data as the previous hlife scatter with the axes swapped:
# x axis 'A-B/(A-A + B-B)', y axis 'A-A/B-B'.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 30,
    "ytick.labelsize": 30,
    'font.size': 34,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
    'axes.linewidth':2.0,
})

# Today's date (MM-DD-YYYY) names the output folder and file.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")

fig_2, ax_2 = plt.subplots(1,1)
fig_2.set_size_inches(12,10, forward=True)  

# DES points are opaque; non-DES points are drawn second at 50% alpha.
ax_2.scatter(des_df_hlife_scatter['A-B/(A-A + B-B)'], des_df_hlife_scatter['A-A/B-B'], s=200, label='DES', edgecolors='black')  # c='blue', 
ax_2.scatter(nondes_df_hlife_scatter['A-B/(A-A + B-B)'], nondes_df_hlife_scatter['A-A/B-B'], s=200, label='non-DES', alpha=0.5, edgecolors='black')  # c='red', 
ax_2.set_ylabel('A-A/B-B \n')
ax_2.set_xlabel('\n A-B/(A-A + B-B) ')
ax_2.legend(loc='upper right')

# Only the upper limits are pinned here; the lower limits are left to matplotlib.
ax_2.set_xlim(right=2.6)
ax_2.set_ylim(top=61)
dirmaker(f'./plots/scatterplots/{xdate}')
plt.savefig(f"./plots/scatterplots/{xdate}/hlife_ratios_intra_vs_inter_{xdate}_alpha5.tiff", facecolor="white", bbox_inches="tight", dpi=350)

## hnum

In [None]:
# Hbond-number frames without the three raw hbond columns and the class label.
_hnum_dropped = ['A-A', 'B-B', 'A-B', 'output']
des_df_hnum_scatter = des_df_hnum.drop(columns=_hnum_dropped)
nondes_df_hnum_scatter = nondes_df_hnum.drop(columns=_hnum_dropped)

In [None]:
# ratio of inter-component hnum to intra-component hnum vs ratio of intra-component hnum
# x axis: 'A-A/B-B' column, y axis: 'A-B/(A-A + B-B)' column, for both frames.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 30,
    "ytick.labelsize": 30,
    'font.size': 34,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
    'axes.linewidth':2.0,
})

# Today's date (MM-DD-YYYY) names the output folder and file.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")

fig_2, ax_2 = plt.subplots(1,1)
fig_2.set_size_inches(12,10, forward=True)  

# DES points are opaque; non-DES points are drawn second at 50% alpha.
ax_2.scatter(des_df_hnum_scatter['A-A/B-B'], des_df_hnum_scatter['A-B/(A-A + B-B)'], s=200, label='DES', edgecolors='black')
ax_2.scatter(nondes_df_hnum_scatter['A-A/B-B'], nondes_df_hnum_scatter['A-B/(A-A + B-B)'], s=200, label='non-DES', alpha=0.5, edgecolors='black')
ax_2.set_xlabel('\n A-A/B-B')
ax_2.set_ylabel('A-B/(A-A + B-B) \n')
ax_2.legend(loc='upper right')
# Same axis limits as the hlife version of this plot.
ax_2.set_ylim(top=2.6, bottom=0)
ax_2.set_xlim(right=61)

dirmaker(f'./plots/scatterplots/{xdate}')
plt.savefig(f"./plots/scatterplots/{xdate}/hnum_ratios_inter_vs_intra_{xdate}_alpha5.tiff", facecolor="white", bbox_inches="tight", dpi=350)

In [None]:
# ratio of intra-component hnum vs ratio of inter-component hnum to intra-component hnum
# Same data as the previous hnum scatter with the axes swapped:
# x axis 'A-B/(A-A + B-B)', y axis 'A-A/B-B'.
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 30,
    "ytick.labelsize": 30,
    'font.size': 34,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
    'axes.linewidth':2.0,
})

# Today's date (MM-DD-YYYY) names the output folder and file.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")

fig_2, ax_2 = plt.subplots(1,1)
fig_2.set_size_inches(12,10, forward=True)  

# DES points are opaque; non-DES points are drawn second at 50% alpha.
ax_2.scatter(des_df_hnum_scatter['A-B/(A-A + B-B)'], des_df_hnum_scatter['A-A/B-B'], s=200, label='DES', edgecolors='black') # c='blue', 
ax_2.scatter(nondes_df_hnum_scatter['A-B/(A-A + B-B)'], nondes_df_hnum_scatter['A-A/B-B'], s=200, label='non-DES', alpha=0.5, edgecolors='black') # c='red', 
ax_2.set_ylabel('A-A/B-B \n')
ax_2.set_xlabel('\n A-B/(A-A + B-B)')
ax_2.legend(loc='upper right')
# Only the upper limits are pinned here; the lower limits are left to matplotlib.
ax_2.set_ylim(top=61)
ax_2.set_xlim(right=2.6)

dirmaker(f'./plots/scatterplots/{xdate}')
plt.savefig(f"./plots/scatterplots/{xdate}/hnum_ratios_intra_vs_inter_{xdate}_alpha5.tiff", facecolor="white", bbox_inches="tight", dpi=350)

## histogram

In [None]:
# hnum histogram
# Grouped 5-bin histogram of the two ratio columns, DES and non-DES side by side
# (four series in total).
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 30,
    "ytick.labelsize": 30,
    'font.size': 28,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
    'axes.linewidth':2.0,
})

# Today's date (MM-DD-YYYY) names the output folder and file.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")

fig_2, ax_2 = plt.subplots(1,1)
fig_2.set_size_inches(12,10, forward=True)  

# The order of the data list must match the order of `label`.
ax_2.hist([des_df_hnum_scatter['A-A/B-B'], nondes_df_hnum_scatter['A-A/B-B'], 
        des_df_hnum_scatter['A-B/(A-A + B-B)'], nondes_df_hnum_scatter['A-B/(A-A + B-B)']], bins=5,
         label=['A-A/B-B des', 'A-A/B-B non-des', 'A-B/(A-A + B-B) des', 'A-B/(A-A + B-B) non-des'])

ax_2.legend(loc='upper right', prop={'weight':'bold'})
# ax_2.hist(nondes_df_hnum_scatter, bins=10)
# NOTE(review): the plotted columns are ratios, while the axis label says
# "Average hydrogen bond numbers" -- confirm the intended wording.
ax_2.set_xlabel('\n Average hydrogen bond numbers')
ax_2.set_xlim(right=61)
ax_2.set_ylabel('Number of systems \n')
dirmaker(f'./plots/hist/{xdate}')
plt.savefig(f"./plots/hist/{xdate}/hnum_hist_{xdate}.tiff", facecolor="white", bbox_inches="tight", dpi=350)

In [None]:
# hlife histogram 
# Grouped 5-bin histogram of the two ratio columns, DES and non-DES side by side
# (four series in total).
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 30,
    "ytick.labelsize": 30,
    'font.size': 28,
    'axes.labelweight': 'bold',
    'figure.dpi': 150.0,
    'axes.linewidth':2.0,
})

# Today's date (MM-DD-YYYY) names the output folder and file.
xdate = datetime.datetime.now().strftime("%m-%d-%Y")

fig_2, ax_2 = plt.subplots(1,1)
fig_2.set_size_inches(12,10, forward=True)  

# The order of the data list must match the order of `label`.
# NOTE(review): rwidth is documented as a fraction of the bin width, so 20.9
# looks like a typo (0.9?) -- the hnum histogram passes no rwidth at all. Confirm.
ax_2.hist([des_df_hlife_scatter['A-A/B-B'], nondes_df_hlife_scatter['A-A/B-B'], 
        des_df_hlife_scatter['A-B/(A-A + B-B)'], nondes_df_hlife_scatter['A-B/(A-A + B-B)']], bins=5,
         label=['A-A/B-B des', 'A-A/B-B non-des', 'A-B/(A-A + B-B) des', 'A-B/(A-A + B-B) non-des'], rwidth=20.9) #, histtype='stepfilled')

ax_2.legend(loc='upper right', prop={'weight':'bold'})

# NOTE(review): the plotted columns are ratios, while the axis label says
# "Average hydrogen bond lifetimes" -- confirm the intended wording.
ax_2.set_xlabel('Average hydrogen bond lifetimes')
ax_2.set_ylabel('Number of systems')
ax_2.set_xlim(right=61)
dirmaker(f'./plots/hist/{xdate}')
plt.savefig(f"./plots/hist/{xdate}/hlife_hist_{xdate}.tiff", facecolor="white", bbox_inches="tight", dpi=350)

In [None]:
# non-DES alone
# NOTE(review): despite the comment above, both the non-DES and the DES frame
# are drawn on the same axes. The second call passes its own title, so
# presumably only 'DES KDE' remains visible, and both frames contribute curves
# with the same column names -- confirm this is the intended figure.
fig,ax = plt.subplots(1,1, sharey=True)
fig.set_size_inches(14,8)
# ax1.set_ylabel("Density", weight="bold")
nondes_df_hlife_scatter.plot.kde(ax=ax, title='non-DES KDE')
des_df_hlife_scatter.plot.kde(ax=ax, title='DES KDE')

In [None]:
# non-overlapping histo NON-DES
# Grouped 10-bin histogram of the raw 'A-A', 'B-B' and 'A-B' columns for the
# non-DES systems, saved as a PNG in the current working directory (unlike the
# other figures, no ./plots sub-folder is used).
xdate = datetime.datetime.now().strftime("%m-%d-%Y")
non_des_fig_scatter = plt.figure()
non_des_fig_scatter.set_size_inches(12, 8, forward=True)
non_des_ax_scatter = non_des_fig_scatter.add_subplot(1,1,1)
non_des_ax_scatter.set_xlabel("Hydrogen bond number", fontsize=24, weight='bold')
non_des_ax_scatter.set_ylabel("Number of systems", fontsize=24, weight='bold')
# Tick positions cover 0-38 (y) and 0-80 (x); the y axis is then clipped to [0, 12].
ytick = np.arange(0,40, 2)
xtick = np.arange(0,90, 10)
plt.yticks(ytick,fontsize=22, weight='bold')
plt.xticks(xtick,fontsize=22, weight='bold')
plt.title('Non-DES', fontsize=22, weight='bold')
plt.ylim([0,12])
# non_des_hist = non_des[['AA', 'BB', 'AB']]
# non_des_hist.plot.hist(bins=20, alpha=0.5, ylim=[0,22], ax =non_des_ax_scatter) # ylim=[0,22], 
# NOTE(review): `nondes_hnum_scatter` is not defined in the nearby cells. The
# similarly named `nondes_df_hnum_scatter` has had 'A-A', 'B-B' and 'A-B'
# dropped, so it cannot be the intended source either -- confirm which frame
# (possibly nondes_df_hnum) should be used here.
plt.hist([nondes_hnum_scatter['A-A'], nondes_hnum_scatter['B-B'], nondes_hnum_scatter['A-B']], bins=10, label=['A-A', 'B-B', 'A-B'])
plt.legend(loc='upper right', prop={'weight':'bold'})
non_des_fig_scatter.savefig(f'nondes_hnum_scatter_nonoverlap_{xdate}.png', dpi=350,facecolor='white', bbox_inches='tight')
plt.show()


## Feature importance

### hnum or hlife
Single plots: one bar chart per model log file.

In [None]:
import matplotlib as mpl

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 34,
    "ytick.labelsize": 40,
    'font.size': 44,
    'axes.labelweight': 'bold',
    'axes.labelsize': 48,
    'figure.dpi': 350.0,
    'axes.linewidth':2.0,
})

pathway = Path()

# One bar chart per hnum training log. Each log is expected to contain a line
# with 'Top ' followed by rows of the form "<index> <count> <feature name>".
# NOTE: `xdate` is not set in this cell; it must exist from an earlier cell.
for file in pathway.glob('./model-logs/training/hnum/01-23-2023/*.txt'):
    # Logs whose path mentions KNeighbors are not plotted.
    if 'KNeighbors' in str(file):
        continue

    # Counts collected for this log, keyed by feature name.
    model_tags = {
        'A-A' : [],
        'B-B' : [],
        'A-B' : [],
        'A-A/B-B' : [],
        'A-B/(A-A + B-B)' : []
    }

    # The log is only read, so open it read-only and read it once.
    with open(file, 'r') as r:
        lines = r.readlines()

    # The table starts on the line after the first one containing 'Top '.
    header_line = next(
        (l_no for l_no, line in enumerate(lines) if 'Top ' in line), None)
    if header_line is None:
        print(f"No 'Top ' header found in {file}; skipping")
        continue

    for line in lines[header_line + 1:]:
        splitted = line.strip().split()
        # Blank lines (e.g. at the end of the log) carry no entry.
        if not splitted:
            continue

        # Row '4' is 'A-B/(A-A + B-B)': its name contains spaces, so it does
        # not come out of split() as a single token and is matched by index.
        if splitted[0] == '4':
            model_tags['A-B/(A-A + B-B)'].append(int(splitted[1]))
        else:
            model_tags[splitted[2]].append(int(splitted[1]))

    print(model_tags)

    fig = plt.figure()
    fig.set_size_inches(24, 12, forward=True)
    test = fig.add_subplot(1,1,1)
    test.set_ylim([0, 100])
    # test.set_xlim([0, 8.0])
    bar_width=0.5
    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']

    names = list(model_tags.keys())
    # Flatten the per-feature lists; this assumes one count per feature so that
    # the number of bars equals the number of names.
    values = list(model_tags.values())
    values = list(np.concatenate(values).flat)
    test.bar(range(len(model_tags)), values, tick_label=names, color=colors, width=bar_width)
    test.set_xlabel("\nHydrogen bond types")
    test.set_ylabel('Frequency\n')
    # File stems are '<model>_<type>...'; both parts go into the output name.
    figname = str(file.stem).split('_')[0]
    figtype = str(file.stem).split('_')[1]
    # savefig does not create missing folders.
    Path(f'./plots/top-coeffs/{figtype}').mkdir(parents=True, exist_ok=True)
    fig.savefig(f'./plots/top-coeffs/{figtype}/{figname}_{figtype}_{xdate}.tiff', dpi=350, facecolor='white', bbox_inches='tight')
    # fig.savefig(f'{folder}/{figname}.pdf', dpi=fig.dpi, facecolor='white', bbox_inches='tight')
    plt.show()

       

In [None]:
# Display labels for the ten merged features (_n = hbond number, _l = hbond
# lifetime). Ratio features are written as mathtext fractions, so every '$'
# must be paired; the raw features are plain text.
features_merged_ = ['A-A_n', 'B-B_n', 'A-B_n', r'$\frac{A-A_n}{B-B_n}$', r'$\frac{A-B_n}{A-A_n + B-B_n}$',
                   'A-A_l', 'B-B_l', 'A-B_l', r'$\frac{A-A_l}{B-B_l}$',
                   r'$\frac{A-B_l}{A-A_l + B-B_l}$']

In [None]:
# merged

import matplotlib as mpl

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 34,
    "ytick.labelsize": 40,
    'font.size': 44,
    'axes.labelweight': 'bold',
    'axes.labelsize': 48,
    'figure.dpi': 350.0,
    'axes.linewidth':2.0,
})

pathway = Path()

# One bar chart per merged training log. Each log is expected to contain a line
# with 'Top ' followed by rows of the form "<index> <count> <feature name>".
# NOTE: `xdate` is not set in this cell; it must exist from an earlier cell.
for file in pathway.glob('./model-logs/training/merged/01-23-2023/*.txt'):
    print(file)
    # Logs whose path mentions KNeighbors are not plotted.
    if 'KNeighbors' in str(file):
        continue

    # Counts collected for this log, keyed by feature name.
    model_tags_merged = {
        'A-A_n' : [],
        'B-B_n' : [],
        'A-B_n' : [],
        'A-A_n/B-B_n' : [],
        'A-B_n/(A-A_n + B-B_n)' : [],
        'A-A_l' : [],
        'B-B_l' : [],
        'A-B_l' : [],
        'A-A_l/B-B_l' : [],
        'A-B_l/(A-A_l + B-B_l)' : []
    }

    # The log is only read, so open it read-only and read it once.
    with open(file, 'r') as r:
        lines = r.readlines()

    # The table starts on the line after the first one containing 'Top '.
    header_line = next(
        (l_no for l_no, line in enumerate(lines) if 'Top ' in line), None)
    if header_line is None:
        print(f"No 'Top ' header found in {file}; skipping")
        continue

    for line in lines[header_line + 1:]:
        splitted = line.strip().split()
        # Blank lines (e.g. at the end of the log) carry no entry.
        if not splitted:
            continue

        # Rows '4' and '9' are the two features whose names contain spaces, so
        # they do not come out of split() as one token and are matched by index.
        if splitted[0] == '4':
            model_tags_merged['A-B_n/(A-A_n + B-B_n)'].append(int(splitted[1]))
        elif splitted[0] == '9':
            model_tags_merged['A-B_l/(A-A_l + B-B_l)'].append(int(splitted[1]))
        else:
            model_tags_merged[splitted[2]].append(int(splitted[1]))

    print(model_tags_merged)

    fig = plt.figure()
    fig.set_size_inches(30, 20, forward=True)
    test = fig.add_subplot(1,1,1)
    test.set_ylim([0, 100])
    # test.set_xlim([0, 8.0])
    bar_width=0.5
    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']

    # Tick labels in the same order as the keys of model_tags_merged, with the
    # ratio features typeset as mathtext fractions.
    # names = list(model_tags_merged.keys())
    names = ['A-A_n', 'B-B_n', 'A-B_n', r'$\frac{A-A_n}{B-B_n}$', r'$\frac{A-B_n}{A-A_n + B-B_n}$',
                   'A-A_l', 'B-B_l', 'A-B_l', r'$\frac{A-A_l}{B-B_l}$',
                   r'$\frac{A-B_l}{A-A_l + B-B_l}$']
    # Flatten the per-feature lists; this assumes one count per feature so that
    # the number of bars equals the number of names.
    values = list(model_tags_merged.values())
    values = list(np.concatenate(values).flat)
    test.bar(range(len(model_tags_merged)), values, tick_label=names, color=colors, width=bar_width)
    test.set_xlabel("\nHydrogen bond types")
    test.set_ylabel('Frequency\n')
    # File stems are '<model>_<type>...'; both parts go into the output name.
    figname = str(file.stem).split('_')[0]
    figtype = str(file.stem).split('_')[1]
    # savefig does not create missing folders.
    Path(f'./plots/top-coeffs/{figtype}').mkdir(parents=True, exist_ok=True)
    fig.savefig(f'./plots/top-coeffs/{figtype}/{figname}_{figtype}_{xdate}.tiff', dpi=350, facecolor='white', bbox_inches='tight')
    # fig.savefig(f'{folder}/{figname}.pdf', dpi=fig.dpi, facecolor='white', bbox_inches='tight')
    plt.show()

       

### hnum or hlife
Combined plots: counts summed over all model log files into a single bar chart.

In [None]:
import matplotlib as mpl

plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 34,
    "ytick.labelsize": 40,
    'font.size': 44,
    'axes.labelweight': 'bold',
    'axes.labelsize': 48,
    'figure.dpi': 350.0,
    'axes.linewidth':2.0,
})

pathway = Path()
# Counts accumulated over ALL hlife logs, keyed by feature name.
model_tags = {
        'A-A' : [],
        'B-B' : [],
        'A-B' : [],
        'A-A/B-B' : [],
        'A-B/(A-A + B-B)' : []
    }

# Each log is expected to contain a line with 'Top ' followed by rows of the
# form "<index> <count> <feature name>".
for file in pathway.glob('./model-logs/training/hlife/01-23-2023/*.txt'):
    # Logs whose path mentions KNeighbors are not counted.
    if 'KNeighbors' in str(file):
        continue

    # The log is only read, so open it read-only and read it once.
    with open(file, 'r') as r:
        lines = r.readlines()

    # The table starts on the line after the first one containing 'Top '.
    header_line = next(
        (l_no for l_no, line in enumerate(lines) if 'Top ' in line), None)
    if header_line is None:
        print(f"No 'Top ' header found in {file}; skipping")
        continue

    for line in lines[header_line + 1:]:
        splitted = line.strip().split()
        # Blank lines (e.g. at the end of the log) carry no entry.
        if not splitted:
            continue

        # Row '4' is 'A-B/(A-A + B-B)': its name contains spaces, so it does
        # not come out of split() as a single token and is matched by index.
        if splitted[0] == '4':
            model_tags['A-B/(A-A + B-B)'].append(int(splitted[1]))
        else:
            model_tags[splitted[2]].append(int(splitted[1]))

# Per-feature totals over all logs (computed once; printed for inspection).
print(list(model_tags.values()))
total_values = list(model_tags.values())
total = [sum(x) for x in total_values]

print(total)

fig = plt.figure()
fig.set_size_inches(24, 12, forward=True)
test = fig.add_subplot(1,1,1)
test.set_ylim([0, 600])
# test.set_xlim([0, 8.0])
bar_width=0.5
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

names = list(model_tags.keys())

test.bar(range(len(model_tags)), total, tick_label=names, color=colors, width=bar_width)
test.set_xlabel("\nHydrogen bond types")
test.set_ylabel('Frequency\n')
figname = 'combined_plots'
# The type tag is taken from the stem of the last file the loop visited.
figtype = str(file.stem).split('_')[1]
# fig.savefig(f'./plots/top-coeffs/{figtype}/{figname}_{figtype}_{xdate}.tiff', dpi=350, facecolor='white', bbox_inches='tight')
# fig.savefig(f'{folder}/{figname}.pdf', dpi=fig.dpi, facecolor='white', bbox_inches='tight')
plt.show()

       

In [None]:
# merged
import matplotlib as mpl

xdate = datetime.datetime.now().strftime("%m-%d-%Y")
plt.rcParams.update({
    "font.weight": "bold",
    "xtick.labelsize": 40,
    "ytick.labelsize": 48,
    'font.size': 54,
    'axes.labelweight': 'bold',
    'axes.labelsize': 48,
    'figure.dpi': 350.0,
    'axes.linewidth':2.0,
    'mathtext.default': 'regular',
    #'mathtext.bf' : 'regular:bold'
})

pathway = Path()
# Counts accumulated over ALL merged logs, keyed by feature name.
model_tags_merged = {
        'A-A_n' : [],
        'B-B_n' : [],
        'A-B_n' : [],
        'A-A_n/B-B_n' : [],
        'A-B_n/(A-A_n + B-B_n)' : [],
        'A-A_l' : [],
        'B-B_l' : [],
        'A-B_l' : [],
        'A-A_l/B-B_l' : [],
        'A-B_l/(A-A_l + B-B_l)' : []
    }

# Each log is expected to contain a line with 'Top ' followed by rows of the
# form "<index> <count> <feature name>".
for file in pathway.glob('./model-logs/training/merged/01-23-2023/*.txt'):
    # Logs whose path mentions KNeighbors are not counted.
    if 'KNeighbors' in str(file):
        continue

    # The log is only read, so open it read-only and read it once.
    with open(file, 'r') as r:
        lines = r.readlines()

    # The table starts on the line after the first one containing 'Top '.
    header_line = next(
        (l_no for l_no, line in enumerate(lines) if 'Top ' in line), None)
    if header_line is None:
        print(f"No 'Top ' header found in {file}; skipping")
        continue

    for line in lines[header_line + 1:]:
        splitted = line.strip().split()
        # Blank lines (e.g. at the end of the log) carry no entry.
        if not splitted:
            continue

        # Rows '4' and '9' are the two features whose names contain spaces, so
        # they do not come out of split() as one token and are matched by index.
        if splitted[0] == '4':
            model_tags_merged['A-B_n/(A-A_n + B-B_n)'].append(int(splitted[1]))
        elif splitted[0] == '9':
            model_tags_merged['A-B_l/(A-A_l + B-B_l)'].append(int(splitted[1]))
        else:
            model_tags_merged[splitted[2]].append(int(splitted[1]))

# Per-feature totals over all logs (computed once; printed for inspection).
print(list(model_tags_merged.values()))
total_values = list(model_tags_merged.values())
total = [sum(x) for x in total_values]

print(total)

fig = plt.figure()
fig.set_size_inches(40, 20, forward=True)
test = fig.add_subplot(1,1,1)
test.set_ylim([0, 600])
# test.set_xlim([0, 8.0])
bar_width=0.5
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

# Mathtext tick labels, in the same order as the keys of model_tags_merged.
names = [r'A-A$_{\_n}$', r'B-B$_{\_n}$', r'A-B$_{\_n}$', r"$\frac{A-A_{\_n}}{B-B_{\_n}}$", r"$\frac{A-B_{\_n}}{A-A_{\_n} + B-B_{\_n}}$",
                   r'A-A$_{\_l}$', r'B-B$_{\_l}$', r'A-B$_{\_l}$', r'$\frac{A-A_{\_l}}{B-B_{\_l}}$',
                   r'$\frac{A-B_{\_l}}{A-A_{\_l} + B-B_{\_l}}$']

test.bar(range(len(model_tags_merged)), total, tick_label=names, color=colors, width=bar_width)
test.set_xlabel("\nHydrogen bond types")
test.set_ylabel('Frequency\n')
figname = 'combined_plots'
# The type tag is taken from the stem of the last file the loop visited.
figtype = str(file.stem).split('_')[1]
# savefig does not create missing folders.
Path(f'./plots/top-coeffs/{figtype}').mkdir(parents=True, exist_ok=True)
fig.savefig(f'./plots/top-coeffs/{figtype}/{figname}_{figtype}_{xdate}.tiff', dpi=350, facecolor='white', bbox_inches='tight')
plt.show()

       

## Confusion Matrices visualization

#### hnum

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrices for every fitted hnum classifier, evaluated on the hnum
# test split: one with raw counts and one with normalize="true".

# 'normal' is a font weight/style value, not a font family; asking for it makes
# matplotlib warn and fall back to its default font. Name a real family instead.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 22}
plt.rc('font', **font)

# Short display name -> fitted estimator.
models = {
    'KNN': knn_class_hnum,
    'SVC': sv_class_hnum,
    'LR': reg_log_hnum,
    'DT': dt_class_hnum,
    'RF': rf_class_hnum,
    'XG': xg_boost_hnum,
}

class_names = target_names

# savefig does not create missing folders.
Path('./plots/classifiers').mkdir(parents=True, exist_ok=True)

for name, model in models.items():
    # (title, normalize) pairs; the title is also used as the output file name.
    titles_options = [
        (f"{name} confusion matrix (not normalized)_hnum", None),
        (f"{name} confusion matrix (normalized)_hnum", "true"), ]

    for title, normalize in titles_options:
        disp = ConfusionMatrixDisplay.from_estimator(
            model,
            X_test_hnum,
            y_test_hnum,
            display_labels=class_names,
            cmap=plt.cm.Blues,
            normalize=normalize,
        )

        disp.figure_.set_size_inches(14, 10)
        disp.ax_.set_title(title, fontsize=30, weight='bold')
        disp.ax_.set_xlabel("Predicted label", fontsize=24, weight='bold')
        disp.ax_.set_ylabel("True label", fontsize=24, weight='bold')

        print(title)
        print(disp.confusion_matrix)
        # The display's figure is the current figure at this point.
        plt.savefig(
            f"./plots/classifiers/{title}.png", dpi=400, facecolor='white')

    # Show both matrices of this model together.
    plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrices for every fitted hlife classifier, evaluated on the hlife
# test split: one with raw counts and one with normalize="true".

# 'normal' is a font weight/style value, not a font family; asking for it makes
# matplotlib warn and fall back to its default font. Name a real family instead.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 22}
plt.rc('font', **font)

# Short display name -> fitted estimator.
models = {
    'KNN': knn_class_hlife,
    'SVC': sv_class_hlife,
    'LR': reg_log_hlife,
    'DT': dt_class_hlife,
    'RF': rf_class_hlife,
    'XG': xg_boost_hlife,
}

class_names = target_names

# savefig does not create missing folders.
Path('./plots/classifiers').mkdir(parents=True, exist_ok=True)

for name, model in models.items():
    # (title, normalize) pairs; the title is also used as the output file name.
    titles_options = [
        (f"{name} confusion matrix (not normalized)_hlife", None),
        (f"{name} confusion matrix (normalized)_hlife", "true"), ]

    for title, normalize in titles_options:
        disp = ConfusionMatrixDisplay.from_estimator(
            model,
            X_test_hlife,
            y_test_hlife,
            display_labels=class_names,
            cmap=plt.cm.Blues,
            normalize=normalize,
        )

        disp.figure_.set_size_inches(14, 10)
        disp.ax_.set_title(title, fontsize=30, weight='bold')
        disp.ax_.set_xlabel("Predicted label", fontsize=24, weight='bold')
        disp.ax_.set_ylabel("True label", fontsize=24, weight='bold')

        print(title)
        print(disp.confusion_matrix)
        # The display's figure is the current figure at this point.
        plt.savefig(
            f"./plots/classifiers/{title}.png", dpi=400, facecolor='white')

    # Show both matrices of this model together.
    plt.show()


In [None]:
# Rebinds `models` to the hnum estimators (the previous cell pointed it at the
# hlife ones) and prints each entry for inspection.
models = {
    'KNN': knn_class_hnum,
    'SVC': sv_class_hnum,
    'LR': reg_log_hnum,
    'DT': dt_class_hnum,
    'RF': rf_class_hnum,
    'XG': xg_boost_hnum,
}
for name, model in models.items():
    print(name, model)
