### Import libraries

In [None]:
""" Example 2 is an imbalanced data set: ~2200 PD samples and ~1100 SNP samples
    Goal is to predict if mutation is SNP or PD
    improve_MCC branch
    
    Total samples: 3368
    2254 PD samples
    1111 SNP samples
    3 NA samples
"""

In [None]:
""" Imports the required libraries and packages """

import pandas as pd                                                              # Data manipulation in dataframes
import numpy as np                                                               # Array manipulation

import random as rd                                                              # Random seed generation
import time                                                                      # Time program run time
import hyperopt

import matplotlib.pyplot as plt
from matplotlib.patches import Patch                                             # CV visualise
from matplotlib import colors 

from sklearn.metrics import(
    matthews_corrcoef,                                                           # MCC for evaluation
    confusion_matrix,                                                            # Confusion matrix for classification evalutation
    classification_report                                                        # Return the F1, precision, and recall of a prediction
    )

from sklearn.model_selection import(
    train_test_split,                                                            # Splits data frame into the training set and testing set
    GroupKFold                                                                   # K-fold CV with as groups
        )

from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier                              # SK learn API for classificastion random forests

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK                                       # Functions for minimising cost functions
from hyperopt.pyll.base import scope
from functools import partial

np.set_printoptions(precision = 3,threshold=np.inf, suppress=True)               # Full array printing

### Split dataset into training and testing sets

In [None]:
def Train_Test_Split(file):
    """      
    Input:      file             Pre-processed dataset done by PDB2AC script

    Returns:    Training_Set     80% training set split
                Testing_Set      20% testing set split
                
    80% training and 20% testing split. Each split is shuffled and THEN re-indexed, so the
    returned dataframes always carry a clean 0..n-1 index whose labels equal the row positions.
    """
    AC_dataset                  = pd.read_csv(file)
    Training_Set, Testing_Set   = train_test_split(AC_dataset, train_size = 0.8)

    # Shuffle first and reset the index last. Resetting before the shuffle (as was done
    # previously) leaves a permuted index behind, so label-based and position-based row
    # selection disagree in any later step that mixes the two.
    # sample()/reset_index() return new frames, which also avoids mutating the slices
    # handed back by train_test_split in place.
    Training_Set                = Training_Set.sample(frac = 1).reset_index(drop = True)
    Testing_Set                 = Testing_Set.sample(frac = 1).reset_index(drop = True)

    return Training_Set, Testing_Set

### Initial evaluation

In [None]:
def test(Training_Set, Testing_Set):
    """ 
    Input:  Training_Set   Training data
            Testing_Set    Testing data

    Baseline check made before any CV or balancing: fit one default random forest on the
    training split, predict the testing split, and print the confusion matrix and MCC.
    The testing split's 'dataset_pd' column supplies the true labels.
    """
    non_features = ['dataset_pd', 'AC Code']                 #Class label and protein identifier are not features

    X_train = Training_Set.drop(columns = non_features)
    y_train = Training_Set['dataset_pd'].astype('int32')

    X_test  = Testing_Set.drop(columns = non_features)
    y_test  = Testing_Set['dataset_pd'].astype('int32')

    baseline = RandomForestClassifier()
    baseline.fit(X_train, y_train)
    predicted = baseline.predict(X_test)

    print("              **Initial Evaluation**")
    print(f"Confusion Matrix:\n {confusion_matrix(y_test, predicted)}")
    print(f"MCC              {matthews_corrcoef(y_test, predicted)}\n")


## Group K-fold CV (outer loop)

In [None]:
def CV(Training_Set):
    """      
    Input:      Training_Set     80% training set split
            
    Returns:    IT_list         List of training features for each fold
                LT_list         List of training class labels for each fold
                IV_list         List of validation features for each fold
                LV_list         List of validation class labels for each fold

    Group K-fold CV with protein groups ('AC Code') separated between training and validation
    sets for each fold. Creates 5 folds. Rows inside every fold are shuffled, with features
    and labels kept in the same order.
    """
    
    Input_CV       = Training_Set.drop(['dataset_pd'], axis = 1)        #Features for training (still holds 'AC Code')
    Output_CV      = Training_Set['dataset_pd'].copy().astype('int32')  #Class labels for training
    Protein_Groups = Training_Set['AC Code'].to_list()                  #List of proteins for grouping, in row order
        
    splitter       = GroupKFold(n_splits = 5)                           #Creates 5 splits
    
    IT_list = []
    LT_list = []
    IV_list = []
    LV_list = []
    
    for train_idx, val_idx in splitter.split(Input_CV, Output_CV, Protein_Groups):
        # split() yields row POSITIONS. They must be applied with .iloc: using .loc would
        # look the numbers up as index LABELS and, whenever the index is not a pristine
        # 0..n-1 range (e.g. after a shuffle), pick different rows from the ones that were
        # grouped, leaking proteins between training and validation.

        # One permutation per split shuffles features and labels identically, so no shared
        # random seed is needed to keep them aligned.
        train_idx = np.random.permutation(train_idx)
        val_idx   = np.random.permutation(val_idx)

        IT_list.append(Input_CV.iloc[train_idx].drop(['AC Code'], axis = 1))  #Group identifier not needed for training
        LT_list.append(Output_CV.iloc[train_idx])
        IV_list.append(Input_CV.iloc[val_idx].drop(['AC Code'], axis = 1))
        LV_list.append(Output_CV.iloc[val_idx])

    return(IT_list, LT_list, IV_list, LV_list)


## Balancing (inner loop)

In [None]:
def find_minority_class(classData):
    """ 
    Input:        classData  Iterable of class labels (only labels equal to 0 or 1 are counted)

    Returns:      minClass   The label for the minority class (0 when both classes tie)
                  minSize    The number of items in the minority class
                  maxSize    The number of items in the majority class

    Count both classes in a single pass and report which one is smaller
    """
    tally = {0: 0, 1: 0}

    for label in classData:
        for cls in (1, 0):
            if label == cls:
                tally[cls] += 1
                break

    if tally[0] > tally[1]:                  #Class 1 is the smaller one
        return 1, tally[1], tally[0]

    return 0, tally[0], tally[1]             #Class 0 is smaller, or the classes tie

In [None]:
def balance(inData, classData, minClass, minSize):
    """ 
    Input:        inData          array of input data (only its length is used)
                  classData       class labels, one per row of inData, in the same order
                  minClass        class label for the minority class
                  minSize         number of non-minority rows to draw

    Returns:      usedLines       list of booleans, one per row; True marks a row that belongs
                                  to the balanced subset

    Perform the actual balancing for a fold between SNPs and PDs: keep every minority-class row
    and add a random sample, without replacement, of minSize rows from the remaining rows.
    If fewer than minSize such rows exist, all of them are used (the previous rejection-sampling
    loop would never terminate in that situation).
    """
    n_rows     = len(inData)
    usedLines  = [False] * n_rows
    candidates = []                                   #Positions of rows outside the minority class

    for i, label in zip(range(n_rows), classData):    #Iterating works for Series, arrays and lists alike
        if label == minClass:
            usedLines[i] = True
        else:
            candidates.append(i)

    n_draw = max(0, min(int(minSize), len(candidates)))
    for i in rd.sample(candidates, n_draw):           #Uniform draw without replacement
        usedLines[i] = True

    return usedLines

In [None]:
def balance_data(inData, classData, usedLines):
    """     
    Input:      inData      training features, indexable by position through .iloc
                classData   training class labels, indexable by position through .iloc
                usedLines   boolean flags, one per row, marking the rows to keep

    Returns:    input_balance  List holding the feature row of every kept position
                label_balance  List holding the class label of every kept position
                       
    Pick the flagged rows out of the features and the labels. Position i links the two inputs,
    so both returned lists are in the same order.
    """
    kept = [pos for pos in range(len(inData)) if usedLines[pos] == True]

    input_balance = [inData.iloc[pos] for pos in kept]
    label_balance = [classData.iloc[pos] for pos in kept]

    return input_balance, label_balance

### Balance for n folds

In [None]:
def Balance_ratio(maxSize, minSize): 
    """ 
    Input:      maxSize     The number of items in the majority class
                minSize     The number of items in the minority class

    Returns:    BF          Number of balancing folds (always odd)

    The majority:minority ratio is rounded to an integer with the built-in round(), doubled so
    that enough majority class instances get sampled, and increased by 1 so the count is odd
    for the weighted vote.
    """
    return 2 * round(maxSize / minSize) + 1

In [None]:
def Balance_Folds(BF, inData, classData, minClass, minSize):
    """ 
    Input:      BF                Number of balancing folds
                inData            Training features
                classData         Training class labels
                minClass          Class label for the minority class
                minSize           Size of the minority class
                    
    Returns:    Input_folds       List of balanced training feature folds
                Output_folds      List of balanced training label folds

    Build BF balanced subsets. For every fold, balance() chooses the rows and balance_data()
    extracts them; entry k of each returned list is the balance_data() output of fold k.
    """
    drawn = [
        balance_data(inData, classData, balance(inData, classData, minClass, minSize))
        for _ in range(BF)
    ]

    Input_folds  = [features for features, _ in drawn]
    Output_folds = [labels for _, labels in drawn]

    return Input_folds, Output_folds

### RFC hyperparameter tuning

In [None]:
# def Hyperparameter(BF, Input_folds, Output_folds):
#     """ Input:      BF                Number of balancing folds needed
#                     Input_folds       List of 5 balanced arrays of training data
#                     Output_folds      List of 5 balanced arrays of training data's labels

#         Returns:    BF_RFC_HP         List of optimized hyperparameters for each RFC

#         Perform RandomSearchCV on each RFC to optimize number of trees, max depth and max samples
#     """  
#     estimator = RandomForestClassifier()
#     param_grid = {
#                 'n_estimators':np.arange(50,500,50),
#                 'max_depth': np.arange(2, 10, 2),
#                 'max_samples': np.arange(0.2, 1.2, 0.2)
#                   }
#     BF_RFC_HP = []

#     for i in range(BF):
#         HPtuning = RandomizedSearchCV(
#             estimator,
#             param_grid, 
#             scoring = 'balanced_accuracy',
#             cv = 10,
#             n_jobs = 6, #how many cores to run in parallel
#             verbose = 2
#             ).fit(Input_folds[i], Output_folds[i])
#         BF_RFC_HP.append(HPtuning.best_params_)
    
#     return(BF_RFC_HP)

### Train an RFC on each training fold

In [None]:
def BF_fitting(BF, Input_folds, Output_folds): 
    """ 
    Input:      BF                Number of balancing folds                      
                Input_folds       List of balanced training feature folds
                Output_folds      List of balanced training label folds

    Returns:    BF_RFC            List of RFCs, one fitted on each balancing fold

    Fit a separate random forest on each of the first BF balanced folds produced by
    Balance_Folds(). The fitted models are later asked for class probabilities.
    """    
    BF_RFC = []

    for fold_no in range(BF):
        forest = RandomForestClassifier(verbose = 0)               #Fresh RFC for this fold
        forest.fit(Input_folds[fold_no], Output_folds[fold_no])    #Fit on this fold's balanced data
        BF_RFC.append(forest)

    return BF_RFC

#### Validate each RFC on validation set, for each fold

In [None]:
def BF_validate(BF_RFC, ValData):
    """ 
    Input:      BF_RFC          List of RFCs trained on balancing folds
                ValData         Unseen validation features from CV fold
                
    Returns:    Prob_matrix     List with one entry per RFC: the result of that RFC's
                                predict_proba() on the validation features
    
    Ask every trained RFC for the predicted class probabilities of the validation instances.
    The features are passed as ValData.values, i.e. without column names.
    """
    return [forest.predict_proba(ValData.values) for forest in BF_RFC]

### Weighted voting

In [None]:
def Weighted_Vote(Prob_matrix, T = 0.45):
    """ 
    Input:      Prob_matrix     List of 2D arrays, one per balancing-fold RFC. Rows are validation
                                instances; column 0 is the SNP probability, column 1 the PD probability
                T               Decision threshold, strictly between 0 and 1. The default of 0.45
                                (lower than 0.5) gives more sensitivity to PDs over SNPs

    Returns:    Final_vote      1D integer array of weighted vote classifications (1 = PD, 0 = SNP)
                Sum_PD          1D array, PD confidence scores summed over all RFCs
                Sum_SNP         1D array, SNP confidence scores summed over all RFCs

    Calculate the final weighted vote using confidence scores (Sc) from Prob_matrix. Binary classification formula:
    Sc = (S0 - T)/(1 - T) if S0 >= T
    Sc = (T - S0)/T       if S0 <  T
    An instance is classified as PD when its summed PD score is at least its summed SNP score.
    """
    if not 0 < T < 1:
        raise ValueError("T must lie strictly between 0 and 1")    #Both denominators must be non-zero
    if len(Prob_matrix) == 0:
        raise ValueError("Prob_matrix must contain at least one probability array")

    # Shape (RFCs, instances, classes): the formula is applied to every probability at once
    # instead of looping over each value of each class separately.
    probs  = np.stack([np.asarray(fold_probs, dtype = float) for fold_probs in Prob_matrix])
    scores = np.where(probs < T, (T - probs) / T, (probs - T) / (1 - T))

    Sum_SNP = scores[:, :, 0].sum(axis = 0)     #Sum of all SNP confidence scores. 1D Array
    Sum_PD  = scores[:, :, 1].sum(axis = 0)     #Sum of all PD confidence scores. 1D Array

    # Ties go to PD. Built from a comparison, so a fold without validation instances simply
    # gives an empty vote array rather than failing.
    Final_vote = (Sum_PD >= Sum_SNP).astype(int)

    return(Final_vote, Sum_PD, Sum_SNP)         #Returns the vote and the summed confidence scores


## Final confidence

In [None]:
# def Final_score(Sum_PD, Sum_SNP, BF):
#     """ 
#     Input:      Sum_PD      Sum of confidence score for PD predictions
#                 Sum_SNP     Sum of confidence score for SNP predictions

#     Returns:    S_out       Final confidence score

#     Calculate the final confidence score
#     """
    
#     S_Out = np.abs((Sum_PD - Sum_SNP) /(BF*2))
        
#     return S_Out

In [None]:
def evalutation(Vallabel, Final_vote, fold = None):
    """ 
    Input:      Vallabel           Unseen validation class labels from CV fold
                Final_vote         Weighted vote classification
                fold               Optional zero-based fold number shown (as fold + 1) in the header

    Evaluate each fold with confusion matrix, classification report and MCC. Prints only; returns nothing.
    """
    if fold is None:
        # Backward compatibility: older callers pass no fold number and rely on the
        # module-level loop variable `folds` being set by the main loop.
        fold = folds

    Output_pred = Final_vote
    print(f"-----------------------------------------------------\n              ***Fold {fold + 1} Evaluation***\n")
    print(f"Confusion Matrix:\n {confusion_matrix(Vallabel, Output_pred)}")
    print(f"{classification_report(Vallabel, Output_pred)}\nMCC                {matthews_corrcoef(Vallabel, Output_pred)}\n")

### Main Program

In [17]:
file = "AC_dataset.csv"

# Outer pipeline: split the data, print a baseline evaluation, then build the group-wise CV folds.
Training_Set, Testing_Set = Train_Test_Split(file)
test(Training_Set, Testing_Set)
IT_list, LT_list, IV_list, LV_list = CV(Training_Set)

# Each CV fold supplies (training features, training labels, validation features, validation labels).
# The counter keeps the name `folds` because evalutation() may read that module-level name.
for folds, (inData, classData, ValData, Vallabel) in enumerate(zip(IT_list, LT_list, IV_list, LV_list)):

    # Inner loop: measure the class imbalance and build the balanced training subsets
    minClass, minSize, maxSize = find_minority_class(classData)
    BF = Balance_ratio(maxSize, minSize)
    Input_folds, Output_folds = Balance_Folds(BF, inData, classData, minClass, minSize)  # calls balance() and balance_data()

    # BF_RFC_HP = Hyperparameter(BF, Input_folds, Output_folds)

    # One RFC per balanced subset, each predicting probabilities for the validation fold
    BF_RFC = BF_fitting(BF, Input_folds, Output_folds)
    Prob_matrix = BF_validate(BF_RFC, ValData)

    # Combine the per-RFC probabilities into one classification per validation instance
    Final_vote, Sum_PD, Sum_SNP = Weighted_Vote(Prob_matrix)
    # S_Out                       = Final_score(Sum_PD, Sum_SNP, BF)

    evalutation(Vallabel, Final_vote)

-----------------------------------------------------
              ***Fold 1 Evaluation***

Confusion Matrix:
 [[136  32]
 [ 60 311]]
              precision    recall  f1-score   support

           0       0.69      0.81      0.75       168
           1       0.91      0.84      0.87       371

    accuracy                           0.83       539
   macro avg       0.80      0.82      0.81       539
weighted avg       0.84      0.83      0.83       539

MCC                0.6237443150018233

-----------------------------------------------------
              ***Fold 2 Evaluation***

Confusion Matrix:
 [[156  29]
 [ 59 295]]
              precision    recall  f1-score   support

           0       0.73      0.84      0.78       185
           1       0.91      0.83      0.87       354

    accuracy                           0.84       539
   macro avg       0.82      0.84      0.83       539
weighted avg       0.85      0.84      0.84       539

MCC                0.6560134118221573