### Import library

Example 2 is inbalanced data set; ~2200 in PD and ~1100 in SNP
    Goal is to predict if mutation is SNP or PD
    XG Boost
        
    Total samples: 3368
    2254 PD samples
    1111 SNP samples
    3 NA samples

In [187]:
""" Imports the required libraries and packages """

import pandas as pd                                                              # Data manipulation in dataframes
import numpy as np                                                               # Array manipulation
import xgboost as xgb                                                            # Gradient boosting package

import random as rd                                                              # Random seed generation
import time                                                                      # Time program run time
import sys

import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

from sklearn.metrics import(
    matthews_corrcoef,                                                           # MCC for evaluation
    # balanced_accuracy_score, #hyperparameter evaluation
    # f1_score,  #hyperparameter evaluation
    confusion_matrix,                                                            # Confusion matrix for classification evalutation
    classification_report                                                        # Return the F1, precision, and recall of a prediction
    )

from sklearn.model_selection import(
    train_test_split,                                                            # Splits data frame into the training set and testing set
    # GridSearchCV,  # Searches all hyperparameters
    # RandomizedSearchCV, # Searches random range of hyperparameters
    GroupKFold                                                                   # K-fold CV with as groups
        )

from sklearn.utils import shuffle
# from sklearn.ensemble import RandomForestClassifier                              # SK learn API for classificastion random forests

np.set_printoptions(precision = 3,threshold=np.inf, suppress=True)               # Full array printing

### Split dataset into training and validation sets

In [188]:
def Train_Test_Split(file):
    """      
    Input:      file             Pre-processed dataset done by PDB2AC script

    Returns:    Training_Set     80% training set split
                Testing_Set      20% testing set split
                
    80% training and 20% testing split. Splits are shuffled randomly and index reset
    """
    AC_dataset                  = pd.read_csv(file, index_col=0)  
    Training_Set, Testing_Set   = train_test_split(AC_dataset,train_size = 0.8)
        
    Training_Set                = Training_Set.sample(frac = 1) #Shuffle data after splitting
    Testing_Set                 = Testing_Set.sample(frac = 1)
    
    Training_Set.reset_index(drop=True, inplace = True) #Drop index to avoid training on index values
    Testing_Set.reset_index(drop=True, inplace = True)  #Reset index after splitting for compatability with group fold CV
    
    return Training_Set, Testing_Set

In [189]:
def data(Training_Set, Testing_Set):
    """      
    Input:      Training_Set     80% training set split
                Testing_Set      20% testing set split

    Returns:    train_features   Features for training
                train labels     Class lables for training
                test_features    Features for testing
                test_labels      Class labels for testing
                
    Creates the datasets needed for GBC model training and predictions
    """
    
    train_features     = Training_Set.drop(['AC Code','dataset'], axis =1)      
    train_labels       = Training_Set['dataset']                                  
    groups             = Training_Set['AC Code'].to_list()                        #List of proteins for grouping
    
    test_features     = Testing_Set.drop(['AC Code','dataset'], axis =1)         
    test_labels       = Testing_Set['dataset']                                  
        
    return(train_features, train_labels, test_features, test_labels)

### Initial evaluation

In [190]:
def test(inData, classData, ValData, Vallabel):
    """ 
    Input:  Training_Set   Training data
            Testing_Set    Testing data

    Evaluate training data before CV and balancing. Gradient boosting for prediction on the test data. 
    True values are testing data class labels
    """    
    d_train = xgb.DMatrix(inData, classData)
    d_test = xgb.DMatrix(ValData, Vallabel)

    params = {
    'booster': 'gbtree',
    'objective': 'binary:hinge', 
    }
    XGB_initial = xgb.train(params, d_train)
    
    Output_pred = XGB_initial.predict(d_test)
    print(f"              **Initial Evaluation**")
    print(f"Confusion Matrix:\n {confusion_matrix(Vallabel, Output_pred)}")
    print(f"MCC              {matthews_corrcoef(Vallabel, Output_pred)}\n")


## Group K-fold CV (outer loop)

In [191]:
# def CV(Training_Set):
#     """      
#     Input:      Training_Set     80% training set split
            
#     Returns:    IT_list         List of training features for each fold
#                 LT_list         List of training class labels for each fold
#                 IV_list         List of validation features for each fold
#                 LV_list         List of validation class labels for each fold

#     K-fold CV with protein groups separated between training and validation sets for each fold. Creates 5 folds.
#     """
    
#     features     = Training_Set.drop(['AC Code','dataset'], axis =1)         #Features for training
#     labels       = Training_Set['dataset']                         #Class labels for training
#     groups       = Training_Set['AC Code'].to_list()               #List of proteins for grouping
    
#     params = {
#     'booster': 'gbtree',
#     'objective': 'binary:hinge', 
#     }
    
#     d_train = xgb.DMatrix(features, labels)
#     xgb_model = xgb.train(params, d_train)
        
#     CV = xgb.cv(
#         params=params,
#         dtrain = d_train,
#         nfold = 5,
#         early_stopping_rounds= 20,
#         metrics='error',
#         as_pandas=True,
#     )
    
#     print(CV)
        
#     # CV             = GroupKFold(n_splits = 5)                           #Creates 5 splits
    
#     # IT_list = []
#     # LT_list = []
#     # IV_list = []
#     # LV_list = []
    
#     # for train_idx, val_idx in CV.split(Input_CV, Output_CV, Protein_Groups): #Generates the indices to be used for a training and validation split. Indicies are unique to train/ val sets
        
#     #     Rd = np.random.randint(time.time())                                  #Random number from 1 to time since epoch

#     #     Input_train                        = Input_CV.loc[train_idx]         #New dataframe from selected indices
#     #     Classes_train                      = Output_CV.loc[train_idx]
#     #     Input_train.drop(['AC Code'], axis = 1, inplace = True)              #Group identifer not needed for training
                
#     #     Input_val                          = Input_CV.loc[val_idx]
#     #     Classes_val                        = Output_CV.loc[val_idx]
#     #     Input_val.drop(['AC Code'], axis   = 1, inplace = True)

#     #     IT_list.append(Input_train.sample(frac=1, random_state=Rd))          #Shuffles lists, random state to ensure features and labels match for each fold
#     #     LT_list.append(Classes_train.sample(frac=1, random_state=Rd))
#     #     IV_list.append(Input_val.sample(frac=1, random_state=(Rd-1)))
#     #     LV_list.append(Classes_val.sample(frac=1, random_state=(Rd-1)))
        

#     return(xgb_model)


## Balancing (inner loop)

In [192]:
def find_minority_class(classData):
    """ 
    Input:        classData  Array of class labels

    Returns:      minClass   The label for the minority class
                  minSize    The number of items in the minority class
                  maxSize    The number of items in the majority class

    Find information about class size imbalance
    """
    
    Minority_count = 0
    Majority_count = 0
    for datum in classData:
        if datum == 1:
            Majority_count += 1
        elif datum == 0:
            Minority_count += 1

    minClass = 0
    minSize  = Minority_count
    maxSize  = Majority_count
    if Minority_count > Majority_count:
        minClass = 1
        minSize  = Majority_count
        maxSize  = Minority_count

    return minClass, minSize, maxSize

In [193]:
def balance(inData, classData, minClass, minSize):
    """ 
    Input:        inData          array of input data
                  classData       array of classes assigned
                  minorityClass   class label for the minority class
                  minoritySize    size of the minority class

    Returns:      usedLines       array of indexes that are of interest for a balanced dataset

    Perform the actual balancing for a fold between SNPs and PDs
    """
    usedLines = [False] * len(inData) #Array of false for length of data
    for i in range(len(inData)):
        if classData[i] == minClass:
            usedLines[i] = True            #True lines are SNP
            
    usedCount = 0
    while usedCount < minSize:
        i = rd.randrange(len(inData))
        if usedLines[i] == False:
            usedLines[i] = True
            usedCount += 1          #Set PD lines "True", until equal to number of SNP lines

    return usedLines

In [194]:
def balance_data(inData, classData, usedLines):
    """     
    Input:      inData      array of input training data
                classData   array of classes assigned to training data
                usedLines   array of line indexes to print

    Returns:    input_balance  Dataframe of balanced training features
                label_balance  Dataframe of balanced training labels
                       
    Create dataframe of the input training data and classes used. Index_list preserves the indicies between usedLines and inData, used to pull the needed lines.
    """
    Rd = np.random.randint(time.time())
    index_list = []
    
    for i in range(len(usedLines)):
        if usedLines[i] == True:
            index_list.append(i)
             
    input_balance = inData.iloc[index_list] 
    label_balance = classData.iloc[index_list]   
    
    input_balance = input_balance.sample(frac=1, random_state=Rd).reset_index(inplace = False, drop = True)
    label_balance = label_balance.sample(frac=1, random_state=Rd).reset_index(inplace = False, drop = True)
    
    return input_balance, label_balance

In [195]:
def Balance_ratio(maxSize, minSize): 
    """ 
    Input:      maxSize     The number of items in the majority class
                minSize     The number of items in the minority class

    Returns:    BF          Number of balancing folds

    Calculate the number of balancing folds needed using ratio of majority to minority class size. Double to ensure sufficient
    majority class instances are sampled, then + 1 to make odd to allow weighted vote.
    """
    Divide = maxSize/minSize
    BF = (2 * round(Divide)) + 1 #Double ratio to nearest integer
    return BF

In [196]:
def Balance_Folds(BF, inData, classData, minClass, minSize):
    """ 
    Input:      BF                Number of balancing folds
                inData            Features for training
                classData         Class labels for training
                minClass          The label for the minority class
                minSize           The number of items in the minority class
                                  
    Returns:    Input_folds       List of balanced training feature folds
                Output_folds      List of balanced training label folds

    Perform the balance_data() function n number of balancing fold times. Return lists for feature data and labels
    where each item is the output of balance_data()
    """
    Input_folds  = []
    Output_folds = []

    for i in range(BF):
        usedLines                    = balance(inData, classData, minClass, minSize)
        input_balance, label_balance = balance_data(inData, classData, usedLines)
        
        Input_folds.append(input_balance)
        Output_folds.append(label_balance)
            
    return Input_folds, Output_folds

### XGB hyperparameter tuning

In [197]:
# def Hyperparameter(BF, Input_folds, Output_folds):
#     """ Input:      BF                Number of balancing folds needed
#                     Input_folds       List of 5 balanced arrays of training data
#                     Output_folds      List of 5 balanced arrays of training data's labels

#         Returns:    BF_RFC_HP         List of optimized hyperparameters for each RFC

#         Perform RandomSearchCV on each RFC to optimize number of trees, max depth and max samples
#     """  
#     estimator = RandomForestClassifier()
#     param_grid = {
#                 'n_estimators':np.arange(50,500,50),
#                 'max_depth': np.arange(2, 10, 2),
#                 'max_samples': np.arange(0.2, 1.2, 0.2)
#                   }
#     BF_RFC_HP = []

#     for i in range(BF):
#         HPtuning = RandomizedSearchCV(
#             estimator,
#             param_grid, 
#             scoring = 'balanced_accuracy',
#             cv = 10,
#             n_jobs = 6, #how many cores to run in parallel
#             verbose = 2
#             ).fit(Input_folds[i], Output_folds[i])
#         BF_RFC_HP.append(HPtuning.best_params_)
    
#     return(BF_RFC_HP)

### Train XGB on the trainings folds

In [198]:
def BF_fitting(BF, Input_folds, Output_folds, ValData, Vallabel): 
    """ 
    Input:      BF                Number of balancing folds                      
                Input_folds       List of balanced training feature folds
                Output_folds      List of balanced training label folds

    Returns:    BF_GBC            List of gradient boosted trees trained on each balancing fold

    Create GBC model that returns probability predictions for each fold, using output of Balance_Folds() as training data
    """    
    d_test = xgb.DMatrix(ValData, Vallabel)
    
    params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic', 
    'eval_metric': ['error', 'logloss'],
    'verbosity': 2,
    }
    
    
    BF_GBC = []
    for i in range(BF):
        d_train = xgb.DMatrix(Input_folds[i], Output_folds[i]) #Create DMatrix for each training balanced fold
        BF_GBC.append(xgb.train(params, 
                                d_train, 
                                evals  = [(d_test,'eval'), (d_train,'train')],
                                verbose_eval = True, 
                                early_stopping_rounds=10,
                                )
                      ) #Generates and fits a GBC for each training balanced fold
        
    return BF_GBC

#### Validate each GBC on validation set, for each fold

In [199]:
def BF_validate(BF_GBC, ValData):
    """ 
    Input:      BF_RFC          List of RFCs trained on balancing folds
                ValData         Unseen validation features from CV fold
                
    Returns:    Prob_matrix     List of arrays. Each item is 2D matrix where the 1st dimension is each subset in balancing fold, 
                                2nd dimension is predicted probability
    
    Test the trained RFCs on the test set, then for every instance, outputs the predicted probability for each class
    """
    
    Prob_matrix = []
    
    for i in range(len(BF_GBC)):
        dtest = xgb.DMatrix(ValData)
        Prob = BF_GBC[i].predict(dtest) #Predicts the probability of an instance belonging to the major/ positive class (PD/ 1). Output has shape (n_predictions,)
        Prob_matrix.append(Prob)   
        
    return Prob_matrix

### Weighted voting

In [200]:
def Weighted_Vote(Prob_matrix):
    """ 
    Input:      Prob_matrix     List of arrays. 2D matrix where the 1st dimension is each subset in balancing fold, 
                                2nd dimension is predicted probability

    Returns:    Final_vote      Weighted vote classification

    Calculate the final weighted vote using confidence scores (Sc) from Prob_matrix. Binary classification formula:
    Sc = (S0 -T)/(1-T) if S0> T
    Sc = (T-S0)/T if S0 < T
    """
    PD_prob_matrix = Prob_matrix 
    
    SNP_prob_matrix = []
    for i in range(len(Prob_matrix)):
        sub = 1 - Prob_matrix[i]
        SNP_prob_matrix.append(sub)
                                            #**Predictor states its prediction/ confidence scores are between 0.0 and 1.0 for each class**#
    
    Sum_SNP = np.sum(SNP_prob_matrix, axis = 0)     #Sum of all SNP confidence scores. 1D Array
    Sum_PD  = np.sum(PD_prob_matrix, axis = 0)      #Sum of all PD confidence scores. 1D Array
    
                                                        #**Predictor outputs 0. . . 1 as a prediction**#
                                                        
    # T = 0.45               #Lower threshold gives more sensitivity to PDs over SNPs
    # Sc_SNP = []
    # Sc_PD = []

    # for fold in range(len(Prob_matrix)):        #Calculates SNP Sc all instances in each fold

    #     Sc_SNP_fold = []                        #List of the Sc for each fold
    #     for value in range(len(SNP_prob_matrix[fold])):
    #         S0 = Prob_matrix[fold][value]  #Each SNP's confidence in prob matrix fold
    #         if S0 < T:
    #             Sc = (T - S0)/T
    #         elif S0 >= T:
    #             Sc = (S0 - T)/(1 - T)        
    #         Sc_SNP_fold.append(Sc)              #List of Sc for each fold
    #     Sc_SNP.append(Sc_SNP_fold)              #List of folds with Sc

    # for fold in range(len(PD_prob_matrix)):        #Calculates PD Sc all instances in each fold
    #     Sc_PD_fold = []
    #     for value in range(len(Prob_matrix[fold])):
    #         S0 = Prob_matrix[fold][value]  #Each PD's confidence in prob matrix fold
    #         if S0 < T:
    #             Sc = (T - S0)/T
    #         elif S0 >= T:
    #             Sc = (S0 - T)/(1 - T)        
    #         Sc_PD_fold.append(Sc)
    #     Sc_PD.append(Sc_PD_fold)
    
    Vote_arr  = [] 

    for i in range(len(Sum_PD)):
        if Sum_PD[i] >= Sum_SNP[i]:
            Vote_arr.append([1])                #Append PD classifications to list
        elif Sum_SNP[i] > Sum_PD[i]:
            Vote_arr.append([0])                #Append SNP classifications to list

    Final_vote = np.stack(Vote_arr)             #Converts list of arrays to a 2D array
    Final_vote = Final_vote.ravel()             #Flattens 2D array to 1D array

    return(Final_vote, Sum_PD, Sum_SNP)         #Returns the final confidence scores


In [201]:
# def Final_score(Sum_PD, Sum_SNP, BF):
#     """ 
#     Input:      Sum_PD      Sum of confidence score for PD predictions
#                 Sum_SNP     Sum of confidence score for SNP predictions

#     Returns:    S_out       Final confidence score

#     Calculate the final confidence score
#     """
    
#     S_Out = np.abs((Sum_PD - Sum_SNP) /(BF*2))
        
#     return S_Out

In [202]:
def evalutation(Final_vote, Vallabel):
    """ 
    Input:      Testing_Set        Unseen 20% testing data

    Evaluate each fold with confusion matrix and MCC
    """

    Output_pred = Final_vote
    MCC = matthews_corrcoef(Vallabel, Output_pred)
    print(f"-----------------------------------------------------\n              ***Fold Evaluation***\n")
    print(f"Confusion Matrix:\n {confusion_matrix(Vallabel, Output_pred)}")
    print(f"{classification_report(Vallabel, Output_pred)}\nMCC                           {MCC}\n")
    
    return MCC

In [203]:
def plot(Score_list):
     """ 
     Input:      Score_list        List of MCC scores

     Plots the MCCs of n runs, and calculates the average MCC
     """
     fig, ax = plt.subplots(figsize=(16,10), dpi= 65)
     x_axis = range(len(Score_list))
     y_axis = Score_list

     plt.scatter(x_axis, y_axis, color = 'teal')
     plt.axhline(y=np.nanmean(Score_list), color = 'red', linestyle = 'dotted', linewidth = '1', label ='Avg')
     plt.title('MCC of 15 XG Boost runs, no CV')
     plt.xlabel('Run Number')
     plt.ylabel('MCC')
     plt.legend()
     plt.show()
     print(f"Average: {np.nanmean(Score_list)}")

### Main Program

In [204]:
file = "AC_dataset.csv"
# Score_list = []
# for i in range(0,15):
Training_Set, Testing_Set          = Train_Test_Split(file)                                 #Create training and testing sets
inData, classData, ValData, Vallabel = data(Training_Set, Testing_Set)                     #training/ testing features and labels
# xgb_model = CV(Training_Set)                                                              #Cross-validate training set   
                           
# test(inData, classData, ValData, Vallabel)                                                          #Initial evaluation

minClass, minSize, maxSize  = find_minority_class(classData)                            #Determines imbalance
BF                          = Balance_ratio(maxSize, minSize)                           #Determins number of balancing folds needed
Input_folds, Output_folds   = Balance_Folds(BF, inData, classData, minClass, minSize)   # balance() and balance_data() functions are called under this
# BF_RFC_HP = Hyperparameter(BF, Input_folds, Output_folds)
BF_GBC                      = BF_fitting(BF, Input_folds, Output_folds, ValData, Vallabel)
Prob_matrix                 = BF_validate(BF_GBC, ValData)

Final_vote, Sum_PD, Sum_SNP = Weighted_Vote(Prob_matrix)
# S_Out                       = Final_score(Sum_PD, Sum_SNP, BF)

MCC = evalutation(Final_vote, Vallabel)

# Score_list.append(MCC)    
# plot(Score_list)

[15:39:06] INFO: C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-0fc7796c793e6356f-1/xgboost/xgboost-ci-windows/src/tree/updater_prune.cc:98: tree pruning end, 102 extra nodes, 0 pruned nodes, max_depth=6
[0]	eval-error:0.33284	eval-logloss:0.63806	train-error:0.19460	train-logloss:0.58911
[15:39:06] INFO: C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-0fc7796c793e6356f-1/xgboost/xgboost-ci-windows/src/tree/updater_prune.cc:98: tree pruning end, 88 extra nodes, 0 pruned nodes, max_depth=6
[1]	eval-error:0.30758	eval-logloss:0.60548	train-error:0.16929	train-logloss:0.52338
[15:39:06] INFO: C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-0fc7796c793e6356f-1/xgboost/xgboost-ci-windows/src/tree/updater_prune.cc:98: tree pruning end, 104 extra nodes, 0 pruned nodes, max_depth=6
[2]	eval-error:0.30906	eval-logloss:0.58779	train-error:0.14342	train-logloss:0.47128
[15:39:06] INFO: C:/buildkite-agent/builds/buildkite-windows-cpu-aut