### Import library

In [220]:
""" Example 2 is inbalanced data set; ~2200 in PD and ~1100 in SNP
    Goal is to predict if mutation is SNP or PD
    CV branch
    
    Total samples: 3368
    2254 PD samples
    1111 SNP samples
"""

' Example 2 is inbalanced data set; ~2200 in PD and ~1100 in SNP\n    Goal is to predict if mutation is SNP or PD\n    CV branch\n    \n    Total samples: 3368\n    2254 PD samples\n    1111 SNP samples\n'

In [221]:
""" Imports the required libraries and packages
"""

import pandas as pd  #Import for data manipulation in dataframes
import numpy as np  #Array manipulation and calculates mean

import random as rd
import time

from sklearn.metrics import(
    matthews_corrcoef,  # MCC for evaluation
    balanced_accuracy_score, #hyperparameter evaluation
    f1_score,  #hyperparameter evaluation
    confusion_matrix,  # confusion matrix for classification evalutation
    classification_report #Return the F1, precision, and recall of a prediction
    )
from sklearn.model_selection import(
    train_test_split,  # Splits data frame into the training set and testing set
    GridSearchCV,  # Cross validation to improve hyperparameters
    RandomizedSearchCV,
    StratifiedKFold, # K-fold CV
    GroupKFold
        )
from sklearn.ensemble import RandomForestClassifier #SK learn API for classificastion random forests
from sklearn.tree import DecisionTreeClassifier #Single tree decisions 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier #allows for confidence scores to be predicted for each
np.set_printoptions(threshold=np.inf, precision=3) #full array printing

### Clean dataset in pandas

In [222]:
def Clean_data(file):
    """ Input:      file        The dataset to read

        Returns:    Input       Dataframe with of input features for training
                    Output      Dataframe of class labels for each instance in Input

        Create, clean and convert dataset E2.csv to PD dataframe. Removes blank spaces
        and applies "One Hot Encoding" to convert classes (PD/SNP) to 1/0
    """

    df = pd.read_csv('E2.csv')
    #df.drop(['pdbcode:chain:resnum:mutation'], axis=1, inplace=True) #Remove 'pdbcode:chain:resnum:mutation' column

    #Remove unrequired column, replace blank spaces, reset index to run from 0
    df.dropna(inplace = True) #drop rows with missing values
    df.replace(' ', '_', regex=True, inplace=True)
    df.reset_index(drop=True, inplace = True)

    Input = df.drop('dataset', axis =1)
    Output_encoded = pd.get_dummies(df, columns=['dataset']) #Encode the PD and SNP columns
    Output = Output_encoded['dataset_pd'].copy().astype('int32') #PD = 1, SNP = 0

    return Input, Output #Datset in alphabetical order, NaNs removed and encoded PD label

In [223]:
Input, Output = Clean_data(file = 'E2.csv')

### Split dataset into training and testing

In [239]:
def train(Input, Output):
    """ Input:      Input           Dataframe with of input features for training
                    Output          Dataframe of class labels for each instance in Input

        Returns:    Input_train     Features training data
                    Input_test      Features test data
                    Classes_train   Class label training data
                    Classes_test    Class label test data

        80% training and 20% testing split. Strartify ensures fixed poportion of labels are in both sets. 
        Outputs the data to files.
        """

    Input_train, Input_test, Classes_train, Classes_test = train_test_split(Input, Output, train_size = 0.8, stratify=Output) 
    
    with open('Training Data.txt', 'w') as file: #Writes training features to text file
        file.write(Input_train.to_string())
    with open('Class labels.txt', 'w') as file: #Writes training class labels to text file
        file.write(Classes_train.to_string())
    with open('Test Data.txt', 'w') as file: #Writes testing features to text file
        file.write(Input_test.to_string())
    with open('Test labels.txt', 'w') as file: #Writes testing class labels to text file
        file.write(Classes_test.to_string())

    return Input_test, Classes_test, Input_train, Classes_train

In [268]:
Input_test, Classes_test, Input_train, Classes_train = train(Input, Output)
Input_train

Unnamed: 0,pdbcode:chain:resnum:mutation,Binding,SProtFT0,SProtFT1,SProtFT2,SProtFT3,SProtFT4,SProtFT5,SProtFT6,SProtFT7,...,NLargest5,NLargest6,NLargest7,NLargest8,NLargest9,NLargest10,Clash,Glycine,Proline,CisPro
567,2pqt:A:214:A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,87.334,78.970,75.953,74.407,73.148,72.188,-2.04,-100.0000,-100.000,0.0
1502,2ckj:A:1091:L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,531.621,494.177,466.073,450.487,399.528,387.221,-7.94,-100.0000,-100.000,0.0
483,3ecr:A:259:Y,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,90.109,87.237,85.686,78.023,75.493,73.455,-8.98,-100.0000,-100.000,0.0
766,1r46:A:261:D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,181.270,165.206,139.757,118.393,108.311,106.986,206.41,3.2278,-100.000,0.0
1220,2oay:A:458:M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,137.841,118.683,101.242,99.983,89.226,82.743,-9.89,-100.0000,-100.000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,2zw3:A:54:K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,48.736,46.279,40.626,32.063,31.670,30.148,-6.77,-100.0000,-100.000,0.0
974,1wsr:A:292:H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,142.788,136.118,123.033,108.006,77.482,74.867,-17.16,-100.0000,-100.000,0.0
295,3mr2:A:122:P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,139.080,113.181,95.782,94.900,84.491,81.914,1986.61,-100.0000,3.105,0.0
2565,3pm0:A:206:N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,205.211,192.820,175.625,174.998,174.523,153.242,-9.36,-100.0000,-100.000,0.0


### Initial evaluation

In [174]:
# def test(Initial_RFC, Input_test, Classes_test):
#     """ Input:  Input_test      Features test data
#                 Classes_test    Class label test data

#         Evaluates the training data before balancing. Random forest classifier makes prediction using the test features. True values 
#         are the class labels testing data
#     """

#     Output_pred = Initial_RFC.predict(Input_test) #Always perdict on the unseen test data, as train has been used by the estimastor
#     print(f"              **Initial Evaluation**\n")
#     print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Output_pred)}")
#     print(f"{classification_report(Classes_test, Output_pred)}\nMCC                {matthews_corrcoef(Classes_test, Output_pred)}")


### Manual K-fold splits

In [272]:
df = Input_train
df.sort_values(by=['pdbcode:chain:resnum:mutation'], inplace = True) #Order column by PDB codes alphabetically
df.dropna(inplace = True) #drop rows with missing values

boundry = [] #Empty list
for i in range(len(df)):
    boundry.append(df.iloc[i][0].partition(':')[0]) #Obtain just the PDB code from the 'pdbcode:chain:resnum:mutation' entry

df.drop(['pdbcode:chain:resnum:mutation'], axis=1, inplace=True) #Remove 'pdbcode:chain:resnum:mutation' column
df.insert(0, 'PDB code' ,boundry) #Insert PDB codes as first column

# print(np.array(df['PDB code'].values))

boundry_count = 0
count = 0
df_list = [] #list of unique protein groups
for i in range(len(df)-1):
    First = df.iloc[i][0] #PDB code of row
    Second = df.iloc[i + 1][0] #PDB code of row n+1 

    if First == Second:
        boundry_count += 1 #increase count
    else: #if PDB code in row n is not same as row n+1
        df_list.append(np.array(df.iloc[(i - boundry_count): i + 1, 0:].values)) #List of samples as arrays, ordered
        boundry_count = 0
      

print(f"{df['PDB code'].nunique() -1} unique proteins")
# df.to_csv('order.csv', index = False)
        

839 unique proteins


In [298]:
# flatten_list = []
# for i in range(len(df_list)):
#     a = df_list[i].flatten()
#     flatten_list.append(a)
# np.split(df_list[0], 1)
type(boundry[0])

str

### Balancing

In [None]:
def find_minority_class(classData):
    """ Input:    classData  Array of class labels
        Returns:  minClass   The label for the minority class
                  minSize    The number of items in the minority class
                  maxSize    The number of items in the majority class

        Finds information about the inbalance in class sizes
    """
    
    Minority_count = 0
    Majority_count = 0
    for datum in classData:
        if datum == 1:
            Majority_count += 1
        elif datum == 0:
            Minority_count += 1

    minClass = 0
    minSize = Minority_count
    maxSize = Majority_count
    if Minority_count > Majority_count:
        minClass = 1
        minSize = Majority_count
        maxSize = Minority_count

    return minClass, minSize, maxSize

In [None]:
def Balance_ratio(maxSize, minSize): 
    """ Input:      maxSize     The number of items in the majority class
                    minSize     The number of items in the minority class

        Returns:    BF          Number of balancing folds

        Calculate the number of balancing folds needed using ratio of majority to minority class size. Double to ensure sufficient
        majority class instances are sampled, then + 1 to make odd to allow weighted vote.
    """
    Divide = maxSize/minSize
    BF = (2 * round(Divide)) + 1 #Double ratio to nearest integer
    return BF

In [None]:
def balance(inData, classData, minClass, minSize):
    """ Input:    inData          array of input data
                  classData       array of classes assigned
                  minorityClass   class label for the minority class
                  minoritySize    size of the minority class

        Returns: array of indexes that are of interest for a balanced dataset

        Perform the actual balancing between SNPs and PDs
    """
    usedLines = [False] * len(inData) #Array of false for length of data
    for i in range(len(inData)):
        if classData[i] == minClass:
            usedLines[i] = True
            
    usedCount = 0
    while usedCount < minSize:
        i = rd.randrange(len(inData))
        if usedLines[i] == False:
            usedCount += 1
            usedLines[i] = True       

    return usedLines

In [None]:
def balance_data(inData, classData, usedLines):
    """     Input:     inData      array of input training data
                       classData   array of classes assigned to training data
                       usedLines   array of line indexes to print

            Returns:   Input_balance  Array of balanced input training data
                       Label_balance  Array of balanced classes assigned to training data
        Create arrays for the input training data and classes, as needed for predicting the probability.
        The index [i] is the identifier between the two arrays
    """
    Input_balance = []
    Label_balance = []
    for i in range(len(inData)):
        if usedLines[i]:
            Input_balance.append(inData[i])
            Label_balance.append(classData[i])
            
    Input_balance = np.stack(Input_balance, axis =0)
    Label_balance = np.stack(Label_balance, axis =0)
    
    return Input_balance, Label_balance

### Balance for n folds

In [None]:
def Balance_Folds(BF, usedLines, Input_balance, Label_balance):
    """ Input:      BF                Number of balancing folds needed
                    usedLines         Array of line indexes to print
                    Input_balance     Input_balance  Array of balanced input training data
                    Label_balance     Array of balanced classes assigned to training data

        Returns:    Input_folds       List of 5 balanced arrays of training data
                    Output_folds      List of 5 balanced arrays of training data's labels

        Perform the balance_data() function n number of balancing fold times. Return lists for training data and labels
        where each item is the output of balance_data()
    """
    Input_folds = []
    Output_folds = []
    for fold in range(BF):
        Input_folds.append(Input_balance)
        Output_folds.append(Label_balance)


    with open('Balanced training data.txt', 'w') as f:
        for number, fold in zip(range(BF), Input_folds):
            f.write(f"Fold: {number}\n\n{fold}\n\n\n")
        
    return Input_folds, Output_folds

### RFC hyperparameter tuning

In [None]:
 def Hyperparameter(BF, Input_folds, Output_folds):
    """ Input:      BF                Number of balancing folds needed
                    Input_folds       List of 5 balanced arrays of training data
                    Output_folds      List of 5 balanced arrays of training data's labels

        Returns:    BF_RFC_HP         List of optimized hyperparameters for each RFC

        Perform RandomSearchCV on each RFC to optimize number of trees, max depth and max samples
    """  
    estimator = RandomForestClassifier()
    param_grid = {
                'n_estimators':np.arange(50,500,50),
                'max_depth': np.arange(2, 10, 2),
                'max_samples': np.arange(0.2, 1.2, 0.2)
                  }
    BF_RFC_HP = []

    for i in range(BF):
        HPtuning = RandomizedSearchCV(
            estimator,
            param_grid, 
            scoring = 'balanced_accuracy',
            cv = 10,
            n_jobs = 6, #how many cores to run in parallel
            verbose = 2
            ).fit(Input_folds[i], Output_folds[i].ravel())
        BF_RFC_HP.append(HPtuning.best_params_)
    
    return(BF_RFC_HP)

### Train RFC on balanced dataset

In [None]:
def BF_training(BF, Input_folds, Output_folds, BF_RFC_HP): 
    """ Input:      BF              Number of balancing folds
                    Input_folds     List of 5 balanced arrays for training data
                    Output_folds    List of 5 balanced arrays of training data's labels

        Returns:    BF_RFC          List of RFC's trained on data in each balancing fold

        Create a model that returns probability predictions for each fold, using Balance_Fold() as input
    """    
    BF_RFC = []
    
    for i in range(BF):
        BF_RFC.append(RandomForestClassifier(
                                             verbose = 1
                                            )) #Generates a RF for each fold 
        BF_RFC[i].fit(Input_folds[i], Output_folds[i].ravel()) #Fits the RFC to balanced training data    
        
    return BF_RFC

#### Test RFC on test set

In [None]:
def BFC_test(BF_RFC, Input_test):
    """ Input:  BF_RFC          List of RFC's trained on data in each balancing fold
                Input_test      20% unseen testing data split before the balancing folds
                
        Returns:Prob_matrix     List of arrays. Each item is 2D matrix where the 1st dimension is each subset in balancing fold, 
                                2nd dimension is predicted probability
    
        Test the trained RFCs on the test set, then for every instance, outputs the predicted probability for each class
    """
    Prob_matrix = [] #Empty list
    Prob_matrixlist = []
    for i in range(len(BF_RFC)):
        Prob_list = BF_RFC[i].predict_proba(Input_test.values)
        Prob_matrix.append(Prob_list)   
        
    with open('Test probabilities.txt', 'w') as f:
        for number, line in zip(range(BF), Prob_matrix ):
            f.write(f"Fold: {number}\n\n   SNP    PD\n{line}\n\n\n")

    return Prob_matrix

### Weighted voting

In [None]:
def Weighted_Vote(Prob_matrix, BF):
    """ Input:      Prob_matrix     List of arrays. 2D matrix where the 1st dimension is each subset in balancing fold, 
                                    2nd dimension is predicted probability
                    BF              Number of balancing folds

        Returns:    Final_vote      Weighted vote classification

        Calculate the final weighted vote using confidence scores (Sc). Binary classification formula Sc = 2|S0 - 0.5|
    """
    Sc_PD = [] #Empty list
    Sc_SNP = [] #Empty list
    for i in range(BF):
        Sc_PD.append(2* (Prob_matrix[i][:,1] - 0.5)) #Confidence scores for PD, for each fold
        Sc_SNP.append(2*(Prob_matrix[i][:,0] - 0.5)) #Confidence scores for SNP, for each fold
    

    Sum_PD = np.sum(Sc_PD, axis = 0) #Sum of all PD confidence scores. 1D Array
    Sum_SNP = np.sum(Sc_SNP, axis = 0) #Sum of all SNP confidence scores. 1D Array     

    Vote_arr = [] #Empty list
    
    for i in range(len(Classes_test)):
        if Sum_PD[i] >= Sum_SNP[i]:
            Vote_arr.append([1]) #Append PD classifications to list
        elif Sum_SNP[i] > Sum_PD[i]:
            Vote_arr.append([0]) #Append SNP classifications to list
            
    Final_vote = np.stack(Vote_arr) #Converts list of arrays to a 2D array, shape (674,1)
    Final_vote = Final_vote.ravel() #Flattens 2D array to 1D array
        
    return(Final_vote, Sum_PD, Sum_SNP) #Returns the final confidence scores


### Final confidence

In [None]:
def Final_score(Sum_PD, Sum_SNP, BF):
    """ Input:      Sum_PD      Sum of confidence score for PD predictions
                    Sum_SNP     Sum of confidence score for SNP predictions

        Returns:    S_out        Final confidence score

        Calculate the final confidence score
    """
    
    S_Out = np.abs((Sum_PD - Sum_SNP) /(BF*2))
    np.savetxt('S_out.txt', S_Out, "%.3f")
    
    return S_Out


In [None]:
def evalutation(Classes_test, Final_vote, S_Out):
    """ Input:      Classes_test       Class label test data
                    Final_vote         Weighted vote classification

        Evaluation metrics from RFC on test data with
    """
    Output_pred = Final_vote
    print(f"              ***Final Evaluation***\n")
    print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Output_pred)}")
    print(f"{classification_report(Classes_test, Output_pred)}\nMCC                {matthews_corrcoef(Classes_test, Output_pred)}")
    
    print(f"See file 'Classification.txt' for final classifications and confidence scores")
    np.savetxt('Classification.txt',
           np.column_stack([Final_vote, S_Out]),
           fmt = ["%.0f","%.3f"],
           delimiter ="      ",
           header = "Final classifications and confidence scores\n\n"
          )
    

### Main Program

In [None]:

file                         = 'E2.csv'
Input, Output                = Clean_data(file)
RFC, Input_test, Classes_test, Input_train, Classes_train = train(Input, Output)
test(RFC,Input_test, Classes_test)

inData                       = pd.DataFrame(Input_train).to_numpy()
classData                    = pd.DataFrame(Classes_train).to_numpy()

minClass, minSize, maxSize   = find_minority_class(classData)
BF                           = Balance_ratio(maxSize, minSize)
usedLines                    = balance(inData, classData, minClass, minSize)

Input_balance, Label_balance = balance_data(inData, classData, usedLines)
Input_folds, Output_folds    = Balance_Folds(BF, usedLines, Input_balance, Label_balance)

BF_RFC_HP                    = Hyperparameter(BF, Input_folds, Output_folds)
BF_RFC                       = BF_training(BF, Input_folds, Output_folds, BF_RFC_HP)
Prob_matrix                  = BFC_test(BF_RFC, Input_test)

Final_vote, Sum_PD, Sum_SNP  = Weighted_Vote(Prob_matrix, BF)
S_Out                        = Final_score(Sum_PD, Sum_SNP, BF)

evalutation(Classes_test, Final_vote,S_Out)
