### Import library

In [1]:
""" Example 2 is an imbalanced data set; ~2200 in PD and ~1100 in SNP
    Goal is to predict if mutation is SNP or PD
    CV branch
    
    Total samples: 3368
    2254 PD samples
    1111 SNP samples
"""

' Example 2 is an imbalanced data set; ~2200 in PD and ~1100 in SNP\n    Goal is to predict if mutation is SNP or PD\n    CV branch\n    \n    Total samples: 3368\n    2254 PD samples\n    1111 SNP samples\n'

In [102]:
""" Imports the required libraries and packages """

import pandas as pd  #Import for data manipulation in dataframes
import numpy as np  #Array manipulation and calculates mean

import random as rd
import time

from sklearn.metrics import(
    matthews_corrcoef,  # MCC for evaluation
    balanced_accuracy_score, #hyperparameter evaluation
    f1_score,  #hyperparameter evaluation
    confusion_matrix,  # confusion matrix for classification evalutation
    classification_report #Return the F1, precision, and recall of a prediction
    )
from sklearn.model_selection import(
    train_test_split,  # Splits data frame into the training set and testing set
    GridSearchCV,  # Cross validation to improve hyperparameters
    RandomizedSearchCV,
    KFold,
    StratifiedKFold, # K-fold CV
    GroupKFold,
    StratifiedGroupKFold
        )

from sklearn.utils import shuffle

from sklearn.ensemble import RandomForestClassifier #SK learn API for classificastion random forests
from sklearn.tree import DecisionTreeClassifier #Single tree decisions 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier #allows for confidence scores to be predicted for each
np.set_printoptions(threshold=np.inf, precision=3) #full array printing

### Clean dataset with pandas

In [103]:
def Clean_data(file):
    """ Input:      file        Path of the CSV dataset to read

        Returns:    Cleaned     Cleaned dataframe whose class is held in the 'dataset_pd' column

        Read the CSV dataset into a pandas dataframe and clean it: rows containing NaNs are
        dropped, spaces are replaced with underscores and the index is reset to run from 0.
        The 'dataset' column is then one-hot encoded and the 'dataset_snp' dummy column is
        dropped, so 'dataset_pd' is the single remaining class column.
    """
    df = pd.read_csv(file) #Read the file that was passed in rather than a hard-coded name

    #Remove unrequired NaNs, blank spaces, reset index to run from 0
    df.dropna(inplace = True)
    df.replace(' ', '_', regex=True, inplace=True)
    df.reset_index(drop=True, inplace = True)

    Cleaned_encoded = pd.get_dummies(df, columns=['dataset']) #Encode 'dataset' into one dummy column per class
    Cleaned = Cleaned_encoded.drop(['dataset_snp'], axis = 1) #One dummy column is enough for two classes

    return Cleaned

### Split dataset into training and validation sets

In [104]:
def Train_Test_Split(Cleaned):
    """
    Input:      Cleaned          Cleaned dataframe

    Returns:    Training_Set     80% training set split
                Testing_Set      20% testing set split
                labels           Class labels ('dataset_pd' as int32) for the training set

    Split the cleaned dataframe 80/20 with train_test_split. A copy of each split with the
    feature columns removed is written to 'Training set.txt' / 'Testing set.txt'; the
    returned splits keep every column.
    """
    # Feature columns left out of the text files (everything else is kept)
    feature_columns = ['Binding', 'SProtFT0', 'SProtFT1', 'SProtFT2', 'SProtFT3', 'SProtFT4', 'SProtFT5', 'SProtFT6', 'SProtFT7', 'SProtFT8', 'SProtFT9', 'SProtFT10', 'SProtFT11', 'SProtFT12', 'Interface', 'Relaccess', 'Impact', 'HBonds', 'SPhobic', 'CPhilic', 'BCharge', 'SSGeom', 'Voids', 'MLargest1', 'MLargest2', 'MLargest3', 'MLargest4', 'MLargest5', 'MLargest6', 'MLargest7', 'MLargest8', 'MLargest9', 'MLargest10', 'NLargest1', 'NLargest2', 'NLargest3', 'NLargest4', 'NLargest5', 'NLargest6', 'NLargest7', 'NLargest8', 'NLargest9', 'NLargest10', 'Clash', 'Glycine', 'Proline', 'CisPro']

    Training_Set, Testing_Set = train_test_split(Cleaned, train_size = 0.8)
    labels = Training_Set['dataset_pd'].astype('int32')

    # Strip the features from both splits first, then write each summary to its own file
    summaries = (
        ('Training set.txt', Training_Set.drop(feature_columns, axis=1)),
        ('Testing set.txt', Testing_Set.drop(feature_columns, axis=1)),
    )
    for path, summary in summaries:
        with open(path, 'w') as file:
            file.write(summary.to_string())

    return Training_Set, Testing_Set, labels

### Initial evaluation

In [None]:
# def test(Initial_RFC, Input_test, Classes_test):
#     """ Input:  Input_test      Features test data
#                 Classes_test    Class label test data

#         Evaluates the training data before balancing. Random forest classifier makes prediction using the test features. True values 
#         are the class labels testing data
#     """

#     Output_pred = Initial_RFC.predict(Input_test) #Always predict on the unseen test data, as the training data has already been used by the estimator
#     print(f"              **Initial Evaluation**\n")
#     print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Output_pred)}")
#     print(f"{classification_report(Classes_test, Output_pred)}\nMCC                {matthews_corrcoef(Classes_test, Output_pred)}")


### Balancing

In [105]:
def find_minority_class(classData):
    """ Input:    classData  Iterable of class labels (0 or 1)

        Returns:  minClass   The label for the minority class
                  minSize    The number of items in the minority class
                  maxSize    The number of items in the majority class

        Count the 0 and 1 labels and report which class is smaller. Labels other than
        0 and 1 are ignored. When both classes are the same size, class 0 is reported
        as the minority.
    """
    tally = {0: 0, 1: 0}
    for label in classData:
        if label == 1:
            tally[1] += 1
        elif label == 0:
            tally[0] += 1

    zeros, ones = tally[0], tally[1]

    # Class 1 is only the minority when it is strictly smaller than class 0
    if zeros > ones:
        return 1, ones, zeros
    return 0, zeros, ones

In [106]:
def balance(inData, classData, minClass, minSize):
    """ Input:    inData      array of input data
                  classData   array of classes assigned, aligned with inData by index
                  minClass    class label for the minority class
                  minSize     size of the minority class

        Returns:  usedLines   list of booleans, one per row of inData; True marks a row
                              that belongs to the balanced dataset

        Perform the actual balancing for a fold between SNPs and PDs: every minority-class
        row is kept, and minSize of the remaining rows are drawn at random without
        replacement. If fewer than minSize other rows exist, all of them are kept
        (the previous rejection-sampling loop never terminated in that case).
    """
    rowCount = len(inData)

    # Keep every minority-class row
    usedLines = [bool(classData[i] == minClass) for i in range(rowCount)]

    # Candidate rows are all rows not already selected
    candidates = [i for i in range(rowCount) if not usedLines[i]]

    # Never ask for more rows than exist; int() keeps random.sample happy with numpy integers
    sampleSize = max(0, min(int(minSize), len(candidates)))
    for i in rd.sample(candidates, sampleSize):
        usedLines[i] = True

    return usedLines

In [107]:
def balance_data(inData, classData, usedLines):
    """     Input:     inData      array of input training data (one row per sample)
                       classData   array of classes assigned to training data (not read here)
                       usedLines   per-row flags; truthy means the row is kept

            Returns:   train_balance  Dataframe of the kept rows with the fixed 49 column names

        Collect the rows of inData whose flag in usedLines is set and wrap them in a
        dataframe. The index [i] is the identifier between inData and usedLines.
    """
    # Fixed layout: identifier, features, then the class column
    column_names = (
        ['pdbcode', 'Binding']
        + [f'SProtFT{n}' for n in range(13)]
        + ['Interface', 'Relaccess', 'Impact', 'HBonds', 'SPhobic', 'CPhilic', 'BCharge', 'SSGeom', 'Voids']
        + [f'MLargest{n}' for n in range(1, 11)]
        + [f'NLargest{n}' for n in range(1, 11)]
        + ['Clash', 'Glycine', 'Proline', 'CisPro', 'Class']
    )

    selected_rows = [row for position, row in enumerate(inData) if usedLines[position]]

    return pd.DataFrame(selected_rows, columns = column_names)

In [108]:
# Single-sample balancing demo: clean the dataset, split it, then draw ONE balanced
# sample of the training rows.
file                         = 'E2.csv'
Cleaned                       = Clean_data(file)

Training_Set, Testing_Set, labels = Train_Test_Split(Cleaned)
# The balancing helpers index rows by position, so hand them plain numpy arrays
inData                       = Training_Set.to_numpy()
classData                    = labels.to_numpy()

# Class-size statistics, a row mask for one balanced sample, then the sample itself
minClass, minSize, maxSize   = find_minority_class(classData)
usedLines = balance(inData, classData, minClass, minSize)
Input_balance = balance_data(inData, classData, usedLines)

# NOTE(review): Balance_ratio is defined further down this file (cell In [85]); this
# cell only works when that cell has already been executed.
BF                           = Balance_ratio(maxSize, minSize)

In [109]:
Input_balance

Unnamed: 0,pdbcode,Binding,SProtFT0,SProtFT1,SProtFT2,SProtFT3,SProtFT4,SProtFT5,SProtFT6,SProtFT7,...,NLargest6,NLargest7,NLargest8,NLargest9,NLargest10,Clash,Glycine,Proline,CisPro,Class
0,2q4r:A:139:K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,105.077,93.135,89.569,73.005,64.813,-4.94,-100.0000,-100.0,0.0,1
1,1xmj:A:544:V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,116.283,91.465,86.835,75.264,74.450,-1.65,0.9408,-100.0,0.0,1
2,1exz:A:29:A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,40.262,38.088,35.160,31.736,30.562,-1.95,-100.0000,-100.0,0.0,0
3,2pqt:A:214:A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,78.970,75.953,74.407,73.148,72.188,-2.04,-100.0000,-100.0,0.0,0
4,3s41:A:414:E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,132.261,112.289,111.390,110.232,99.504,-10.53,-100.0000,-100.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,1ya4:A:203:E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,230.954,213.469,183.640,180.267,167.845,-5.96,-100.0000,-100.0,0.0,0
1796,3f1r:A:206:N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,73.083,69.168,66.571,58.856,47.585,-4.39,-100.0000,-100.0,0.0,0
1797,2obd:A:438:M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,227.961,215.687,205.329,175.940,131.136,9.18,-100.0000,-100.0,0.0,0
1798,1qki:A:387:H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,160.417,158.277,150.109,148.360,134.120,-2.17,-100.0000,-100.0,0.0,1


### Balance for n folds

In [85]:
def Balance_ratio(maxSize, minSize):
    """ Input:      maxSize     The number of items in the majority class
                    minSize     The number of items in the minority class

        Returns:    BF          Number of balancing folds

        The majority/minority size ratio is rounded with the built-in round(), doubled so
        that enough majority class instances are sampled, and incremented by one so the
        fold count is odd for the weighted vote.
    """
    rounded_ratio = round(maxSize / minSize)
    return 2 * rounded_ratio + 1

In [18]:
def Balance_Folds(BF, inData, classData, minClass, minSize):
    """ Input:      BF                Number of balancing folds needed
                    inData            Array of input training data
                    classData         Array of classes assigned to training data
                    minClass          Class label for the minority class
                    minSize           Size of the minority class

        Returns:    Input_folds       List of BF balanced dataframes of training data

        Draw a fresh balanced sample BF times by calling balance() followed by
        balance_data(). Every fold is also written to 'Balanced training data.txt'.
    """
    Input_folds = [
        balance_data(inData, classData, balance(inData, classData, minClass, minSize))
        for _ in range(BF)
    ]

    with open('Balanced training data.txt', 'w') as f:
        for number, fold in enumerate(Input_folds):
            f.write(f"Fold: {number}\n\n{fold}\n\n\n")

    return Input_folds

#### RFC hyperparameter tuning

In [7]:
#  def Hyperparameter(BF, Input_folds, Output_folds):
#     """ Input:      BF                Number of balancing folds needed
#                     Input_folds       List of 5 balanced arrays of training data
#                     Output_folds      List of 5 balanced arrays of training data's labels

#         Returns:    BF_RFC_HP         List of optimized hyperparameters for each RFC

#         Perform RandomSearchCV on each RFC to optimize number of trees, max depth and max samples
#     """  
#     estimator = RandomForestClassifier()
#     param_grid = {
#                 'n_estimators':np.arange(50,500,50),
#                 'max_depth': np.arange(2, 10, 2),
#                 'max_samples': np.arange(0.2, 1.2, 0.2)
#                   }
#     BF_RFC_HP = []

#     for i in range(BF):
#         HPtuning = RandomizedSearchCV(
#             estimator,
#             param_grid, 
#             scoring = 'balanced_accuracy',
#             cv = 10,
#             n_jobs = 6, #how many cores to run in parallel
#             verbose = 2
#             ).fit(Input_folds[i], Output_folds[i].ravel())
#         BF_RFC_HP.append(HPtuning.best_params_)
    
#     return(BF_RFC_HP)

In [19]:
def Group_data(Input_folds):
    """
    Input:      Input_folds      List of balanced training dataframes, each with a 'pdbcode'
                                 identifier column and a 'Class' column

    Returns:    Input_CV         List of feature dataframes (with a leading 'PDB code' column), one per fold
                Output_CV        List of int32 class-label series, one per fold
                Protein_Groups   List of lists of PDB codes for grouping, one per fold

    For every fold: order the rows alphabetically by the identifier column, keep only the
    part of the identifier before the first ':' and make that the 'PDB code' column.
    Use this formatted dataset to create the training features, class labels, and group
    identifiers. The dataframes in Input_folds are not modified.
    """
    Input_CV = []
    Output_CV = []
    Protein_Groups = []
    for fold_df in Input_folds:
        Group_df = fold_df.sort_values(by=['pdbcode']) #Sorted copy of the fold

        #Read the identifier by column name; positional Series[0] lookup is deprecated in pandas
        PDB_codes = [identifier.partition(':')[0] for identifier in Group_df['pdbcode']]

        Group_df = Group_df.drop(columns=['pdbcode']) #Remove 'pdbcode:chain:resnum:mutation' column
        Group_df.insert(0, 'PDB code', PDB_codes)
        Group_df.reset_index(inplace = True, drop = True)

        Input_CV.append(Group_df.drop(['Class'], axis = 1))
        Output_CV.append(Group_df['Class'].copy().astype('int32'))
        Protein_Groups.append(Group_df['PDB code'].to_list())

    return Input_CV, Output_CV, Protein_Groups

### Group K-fold CV

In [20]:
def CV(Input_CV, Output_CV, Protein_Groups):
    """
    Input:      Input_CV             List of datasets with input features for training, one per balancing fold
                Output_CV            List of datasets with class labels for training, one per balancing fold
                Protein_Groups       List of lists of proteins for grouping, one per balancing fold

    Returns:    Input_train_list     List of training features, for each CV split
                Classes_train_list   List of training classes, for each CV split
                Input_val_list       List of validating features, for each CV split
                Classes_val_list     List of validating classes, for each CV split

    StratifiedGroupKFold cross-validation with 5 splits, grouped by protein. The 'PDB code'
    column is removed from the returned features, and each split is shuffled with a seed
    shared between features and classes so their row order stays aligned.

    Fixes over the previous version: the loop length comes from Input_CV instead of the
    global Input_folds, and the validation classes are taken from the validation rows
    (they were previously sampled from the training classes).
    """
    Input_train_list = []
    Classes_train_list = []
    Input_val_list = []
    Classes_val_list = []

    for i in range(len(Input_CV)):

        splitter = StratifiedGroupKFold(n_splits = 5, shuffle = True)

        # NOTE(review): as before, the lists are reset for every balancing fold, so only the
        # splits of the LAST balancing fold are returned. Confirm whether the splits of all
        # balancing folds should be kept instead.
        Input_train_list = []
        Classes_train_list = []
        Input_val_list = []
        Classes_val_list = []

        for train_idx, val_idx in splitter.split(Input_CV[i], Output_CV[i], Protein_Groups[i]):
            Rd = np.random.randint(int(time.time())) #Random seed in [0, seconds since epoch)

            # split() yields positional indices, so select with iloc; drop() returns a new
            # frame, which avoids modifying a slice of Input_CV[i] in place
            Input_train = Input_CV[i].iloc[train_idx].drop(columns=['PDB code'])
            Classes_train = Output_CV[i].iloc[train_idx]

            #Shuffle each split; the same seed keeps feature and class rows in the same order
            Input_train_list.append(Input_train.sample(frac=1, random_state=Rd))
            Classes_train_list.append(Classes_train.sample(frac=1, random_state=Rd))

            Input_val = Input_CV[i].iloc[val_idx].drop(columns=['PDB code'])
            Classes_val = Output_CV[i].iloc[val_idx]

            Input_val_list.append(Input_val.sample(frac=1, random_state=Rd))
            Classes_val_list.append(Classes_val.sample(frac=1, random_state=Rd))

    return(Input_train_list,Classes_train_list,Input_val_list,Classes_val_list)


In [21]:
# CV-branch pipeline: clean -> split -> balance into BF folds -> group by protein -> CV splits
file                         = 'E2.csv'
Output                       = Clean_data(file)

Training_Set, Testing_Set, labels = Train_Test_Split(Output)
# The balancing helpers index rows by position, so hand them plain numpy arrays
inData                       = Training_Set.to_numpy()
classData                    = labels.to_numpy()

minClass, minSize, maxSize   = find_minority_class(classData)
BF                           = Balance_ratio(maxSize, minSize)
# NOTE(review): this usedLines is not passed to Balance_Folds, which calls balance()
# itself for every fold; the result below is not used again in this cell.
usedLines = balance(inData, classData, minClass, minSize)
Input_folds = Balance_Folds(BF, inData, classData, minClass, minSize)

# Replace the full identifier with the PDB code and separate features, labels and groups
Input_CV, Output_CV, Protein_Groups = Group_data(Input_folds)

Input_train_list,Classes_train_list,Input_val_list,Classes_val_list = CV(Input_CV, Output_CV, Protein_Groups)

In [22]:
Input_train_list

[      Binding  SProtFT0  SProtFT1  SProtFT2  SProtFT3  SProtFT4  SProtFT5  \
 787       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
 283       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
 1805      1.0       0.0       0.0       0.0       0.0       0.0       0.0   
 84        0.0       0.0       0.0       0.0       0.0       0.0       0.0   
 894       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
 ...       ...       ...       ...       ...       ...       ...       ...   
 1095      0.0       0.0       0.0       0.0       0.0       0.0       0.0   
 739       1.0       0.0       0.0       0.0       0.0       0.0       0.0   
 866       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
 1501      0.0       0.0       0.0       0.0       0.0       0.0       0.0   
 1508      1.0       0.0       0.0       0.0       0.0       0.0       0.0   
 
       SProtFT6  SProtFT7  SProtFT8  ...  NLargest5  NLargest6

### Train RFC on balanced dataset

In [None]:
def BF_training(BF, Input_train_list, Classes_train_list):
    """ Input:      BF                  Number of classifiers to train
                    Input_train_list    List of training feature sets, indexed 0..BF-1
                    Classes_train_list  List of training class labels, indexed 0..BF-1

        Returns:    BF_RFC              List of BF fitted RandomForestClassifier objects

        Create one random forest per index and fit it on the training features and
        classes stored at that index.
    """
    BF_RFC = []

    for fold in range(BF):
        forest = RandomForestClassifier(verbose = 1) #One new RF per fold
        BF_RFC.append(forest)
        forest.fit(Input_train_list[fold], Classes_train_list[fold]) #Fit the RFC to this fold's training data

    return BF_RFC

In [None]:
# Fit BF random forests, one per entry of the CV training lists
BF_RFC = BF_training(BF,Input_train_list, Classes_train_list)

#### Test RFC on test set

In [None]:
def BFC_test(BF_RFC, Input_val_list):
    """ Input:  BF_RFC          List of fitted classifiers, one per fold
                Input_val_list  List of validation feature dataframes; entry i is scored by classifier i

        Returns:Prob_matrix     List of arrays. Item i is the predict_proba() output of classifier i:
                                1st dimension is the sample, 2nd dimension is the predicted
                                probability per class

        Score each classifier on the validation data at the same index and collect the
        predicted class probabilities. The probabilities are also written to
        'Test probabilities.txt', one section per classifier. The number of sections now
        follows the number of classifiers instead of the global BF.
    """
    Prob_matrix = [
        classifier.predict_proba(Input_val_list[i].values)
        for i, classifier in enumerate(BF_RFC)
    ]

    with open('Test probabilities.txt', 'w') as f:
        for number, line in enumerate(Prob_matrix):
            f.write(f"Fold: {number}\n\n   SNP    PD\n{line}\n\n\n")

    return Prob_matrix

In [None]:
# Predicted class probabilities of every trained classifier on its validation split
Prob_matrix = BFC_test(BF_RFC, Input_val_list)
# Echo the first classifier's probabilities as the cell output
Prob_matrix[0]

### Weighted voting

In [None]:
def Weighted_Vote(Prob_matrix, BF):
    """ Input:      Prob_matrix     List of arrays. Each item is a 2D matrix where the 1st dimension is
                                    the sample and the 2nd dimension is the predicted probability
                                    (column 0 = SNP, column 1 = PD)
                    BF              Number of balancing folds; the first BF items of Prob_matrix are used

        Returns:    Final_vote      1D array of weighted vote classifications (1 = PD, 0 = SNP)
                    Sum_PD          1D array, sum over folds of the PD confidence scores
                    Sum_SNP         1D array, sum over folds of the SNP confidence scores

        Calculate the final weighted vote using signed confidence scores Sc = 2 * (p - 0.5),
        summed over the folds for each class. A sample is classified PD when its summed PD
        score is at least its summed SNP score. The number of samples is taken from the
        probability arrays themselves, not from a global variable.
    """
    Sc_PD = [2 * (Prob_matrix[i][:, 1] - 0.5) for i in range(BF)] #Confidence scores for PD, for each fold
    Sc_SNP = [2 * (Prob_matrix[i][:, 0] - 0.5) for i in range(BF)] #Confidence scores for SNP, for each fold

    Sum_PD = np.sum(Sc_PD, axis = 0) #Sum of all PD confidence scores. 1D Array
    Sum_SNP = np.sum(Sc_SNP, axis = 0) #Sum of all SNP confidence scores. 1D Array

    Final_vote = np.where(Sum_PD >= Sum_SNP, 1, 0) #Ties go to PD, as in the original comparison

    return(Final_vote, Sum_PD, Sum_SNP) #Returns the vote and the summed confidence scores


### Final confidence

In [None]:
def Final_score(Sum_PD, Sum_SNP, BF):
    """ Input:      Sum_PD      Sum of confidence score for PD predictions
                    Sum_SNP     Sum of confidence score for SNP predictions
                    BF          Number of balancing folds

        Returns:    S_Out       Final confidence score, |Sum_PD - Sum_SNP| scaled by 2 * BF

        Calculate the final confidence score and save it to 'S_out.txt' with three decimals
    """
    score_gap = Sum_PD - Sum_SNP
    S_Out = np.abs(score_gap / (BF*2))

    np.savetxt('S_out.txt', S_Out, fmt = "%.3f")

    return S_Out


In [None]:
def evalutation(Classes_test, Final_vote, S_Out):
    """ Input:      Classes_test       Class label test data (true values)
                    Final_vote         Weighted vote classification (predicted values)
                    S_Out              Final confidence score for each classification

        Print the confusion matrix, classification report and MCC of the weighted vote
        against the true classes, then save the classifications together with their
        confidence scores to 'Classification.txt'.
    """
    print(f"              ***Final Evaluation***\n")

    matrix = confusion_matrix(Classes_test, Final_vote)
    print(f"Confusion Matrix:\n {matrix}")

    report = classification_report(Classes_test, Final_vote)
    mcc = matthews_corrcoef(Classes_test, Final_vote)
    print(f"{report}\nMCC                {mcc}")

    print("See file 'Classification.txt' for final classifications and confidence scores")

    # One row per sample: classification as an integer, confidence with three decimals
    votes_and_scores = np.column_stack((Final_vote, S_Out))
    np.savetxt(
        'Classification.txt',
        votes_and_scores,
        fmt = ["%.0f", "%.3f"],
        delimiter = "      ",
        header = "Final classifications and confidence scores\n\n",
    )
    

### Main Program

In [None]:
# NOTE(review): this cell has no execution count and is out of sync with the function
# definitions in this file; the notes below mark each mismatch. It cannot run as written.
file                         = 'E2.csv'
Output                       = Clean_data(file)
# NOTE(review): no function named `train` is defined in the visible part of this file.
# Train_Test_Split looks like the intended call, but it returns three values
# (Training_Set, Testing_Set, labels) — confirm.
Training_Set, Testing_Set    = train(Output)
# NOTE(review): Group_data indexes its argument as a list of dataframes (Input_folds[i]);
# here it is handed a single object named Training_Set.
Input_CV, Output_CV, Protein_Groups = Group_data(Training_Set)

Input_train_list, Classes_train_list, Input_val_list, Classes_val_list = CV(Input_CV, Output_CV, Protein_Groups)

inData                       = pd.DataFrame(Input_train_list[0]).to_numpy()
classData                    = pd.DataFrame(Classes_train_list[0]).to_numpy()

minClass, minSize, maxSize   = find_minority_class(classData)
BF                           = Balance_ratio(maxSize, minSize)
usedLines                    = balance(inData, classData, minClass, minSize)

# NOTE(review): balance_data returns a single dataframe, so this two-name unpacking does
# not match its return value.
Input_balance, Label_balance = balance_data(inData, classData, usedLines)
# NOTE(review): Balance_Folds is defined as (BF, inData, classData, minClass, minSize) and
# returns one list; this call passes four arguments and unpacks two results.
Input_folds, Output_folds    = Balance_Folds(BF, usedLines, Input_balance, Label_balance)

# BF_RFC_HP                    = Hyperparameter(BF, Input_folds, Output_folds)
BF_RFC                       = BF_training(BF, Input_folds, Output_folds)
# NOTE(review): Input_test is not defined anywhere in the visible part of this file.
Prob_matrix                  = BFC_test(BF_RFC, Input_test)

Final_vote, Sum_PD, Sum_SNP  = Weighted_Vote(Prob_matrix, BF)
S_Out                        = Final_score(Sum_PD, Sum_SNP, BF)

# NOTE(review): Classes_test is not defined anywhere in the visible part of this file.
evalutation(Classes_test, Final_vote,S_Out)
