Script that loads the optimised models and performs the final prediction on the 20% testing set, reporting the confusion matrix and the final MCC

In [None]:
""" Imports the required libraries and packages """

import pandas as pd                                                              # Data manipulation in dataframes
import numpy as np                                                               # Array manipulation
import xgboost as xgb                                                            # Gradient boosting package
import pickle                                                                    # Saving/loading GBM files

import random as rd                                                              # Random seed generation
import time                                                                      # Time program run time
import os

from sklearn.metrics import(
    matthews_corrcoef,                                                           # MCC for evaluation
    confusion_matrix,                                                            # Confusion matrix for classification evaluation
    )

np.set_printoptions(precision = 3,threshold=np.inf, suppress=True)               # Full array printing

In [None]:
def open_data(file_test):
    """
    Load the testing set from a CSV file.

    Input:         file_test        CSV file to test on (anything accepted by
                                    pandas.read_csv)

    Returns:       Testing_Set      Dataframe read from file_test, with the
                                    first CSV column used as the index.
                                    Expected to be the normalised 20% testing
                                    split.
    """
    return pd.read_csv(file_test, index_col=0)

In [None]:
def learning_data(Testing_Set):
    """
    Separates the testing set into features and labels.

    Input:
                Testing_Set      Testing set dataframe; must contain the
                                 'AC Code' and 'dataset' columns

    Returns:
                TestData         Testing features: every column except
                                 'AC Code' and 'dataset'
                TestLabels       Testing labels: the 'dataset' column

    Raises:
                KeyError         If 'AC Code' or 'dataset' is missing
    """
    TestData = Testing_Set.drop(['AC Code', 'dataset'], axis=1)
    TestLabels = Testing_Set['dataset']

    # No xgb.DMatrix is built here: the previous version constructed one and
    # then discarded it, which only cost time and memory.
    return (TestData, TestLabels)

In [None]:
def final_BF_predict(TestData, TestLabels, dir_path=None):
    """
    Input:      TestData          Testing features, passed unchanged to each
                                  model's predict_proba
                TestLabels        Testing labels (not used for prediction;
                                  kept so existing callers keep working)
                dir_path          Directory holding the pickled models. When
                                  None, the original hard-coded location is
                                  used

    Returns:    prob_matrix       List with one entry per model: the output of
                                  model.predict_proba(TestData). For
                                  scikit-learn style classifiers this is a 2D
                                  array where the 1st dimension is the
                                  datapoint and the 2nd dimension is the
                                  per-class predicted probability

    Loads every pickled model whose filename starts with 'RFC_' from dir_path
    and predicts the class probabilities for every datapoint in the testing
    set. Files are processed in sorted filename order so the output order is
    reproducible (os.listdir gives no ordering guarantee).
    """
    if dir_path is None:
        dir_path = "C:/Users/Shamin/Documents/UCL/Biochemistry/Year 4/BIOC0025/Figures/LargeDataSetAllModelRFC/rfc_model"

    prob_matrix = []

    for file in sorted(os.listdir(dir_path)):
        if not file.startswith('RFC_'):
            continue

        model_path = os.path.join(dir_path, file)
        if not os.path.isfile(model_path):          # Skip sub-directories that happen to match the prefix
            continue

        # NOTE: pickle.load executes arbitrary code from the file; only point
        # dir_path at model files you created or otherwise trust.
        with open(model_path, "rb") as f:
            model = pickle.load(f)

        # The file is closed before predicting; the model is fully in memory.
        prob_matrix.append(model.predict_proba(TestData))

    return prob_matrix

In [None]:
def final_evaluation(Prob_matrix):
    """
    Input:      Prob_matrix    List of predicted probabilities, one 2D array
                               per model. Rows are datapoints; column 0 is
                               taken as the SNP (0) score and column 1 as the
                               PD (1) score. NOTE: this assumes every model
                               orders its classes as [0, 1] - confirm against
                               the fitted models

    Returns:    Final_vote     1D integer array with one prediction per
                               datapoint: 1 for PD, 0 for SNP

    Raises:     ValueError     If Prob_matrix contains no model predictions

    Calculates the final predictions with a weighted vote: the confidence
    scores of all models are summed per class and the class with the larger
    total wins. Ties go to PD (1).
    """
    if len(Prob_matrix) == 0:
        raise ValueError("Prob_matrix is empty: no model predictions to vote on")

    Sum_SNP = np.sum([probs[:, 0] for probs in Prob_matrix], axis=0)   #Sum of all SNP confidence scores. 1D Array
    Sum_PD = np.sum([probs[:, 1] for probs in Prob_matrix], axis=0)    #Sum of all PD confidence scores. 1D Array

    # Vectorised vote. Every datapoint always receives exactly one vote, so
    # the result stays aligned with the labels: a NaN score compares False
    # and therefore falls through to SNP (0) instead of being dropped.
    Final_vote = (Sum_PD >= Sum_SNP).astype(int)

    return Final_vote


In [None]:
def final_report(Final_vote, TestLabels):
    """
    Score the final votes against the true labels.

    Input:      Final_vote         Predicted class for every datapoint
                TestLabels         True labels for the same datapoints

    Returns:    CM                 Confusion matrix of TestLabels (true)
                                   against Final_vote (predicted)
                MCC                Matthews correlation coefficient of the
                                   same pair
    """
    report = (
        confusion_matrix(TestLabels, Final_vote),
        matthews_corrcoef(TestLabels, Final_vote),
    )
    return report


In [None]:
"""Main program"""

# Location of the testing-set CSV. The path is hard-coded to one machine, so
# it must be edited before running anywhere else.
file_test                   = "C:/Users/Shamin/Documents/UCL/Biochemistry/Year 4/BIOC0025/Figures/LargeDatasetDefaultRFC/STesting_Set.csv"
Testing_Set                 = open_data(file_test)

# Pipeline: split features/labels -> per-model probabilities -> one combined
# vote per datapoint -> confusion matrix and MCC against the true labels.
TestData, TestLabels = learning_data(Testing_Set)
Prob_matrix = final_BF_predict(TestData, TestLabels)
Final_vote = final_evaluation(Prob_matrix) 
CM, MCC= final_report(Final_vote, TestLabels)

# Report the final results.
print(CM)
print(MCC)