### Import library

In [17]:
""" Example 2 is inbalanced data set; ~2200 in PD and ~1100 in SNP
    Goal is to predict if mutation is SNP or PD
    ImprovedBalancing branch
    
    Total samples: 3368
    2254 PD samples
    1111 SNP samples
"""

' Example 2 is inbalanced data set; ~2200 in PD and ~1100 in SNP\n    Goal is to predict if mutation is SNP or PD\n    ImprovedBalancing branch\n    \n    Total samples: 3368\n    2254 PD samples\n    1111 SNP samples\n'

In [18]:

""" Imports the required libraries and packages
"""

import pandas as pd  #Import for data manipulation in dataframes
import numpy as np  #Array manipulation and calculates mean

import random as rd

from sklearn.metrics import(
    matthews_corrcoef,  # CC for evaluation
    f1_score,  #F1 score for evaluation
    confusion_matrix,  #Creates the confusion matrix - stats on how accurate the test set output is
    classification_report #Returns the F1 socre, precision, and recall of a prediction using a given model
    )
from sklearn.model_selection import(
    train_test_split,  # Splits data frame into the training set and testing set
    GridSearchCV,  # Cross validation to improve hyperparameters
    StratifiedKFold
        )
from sklearn.ensemble import RandomForestClassifier #SK learn API for classificastion random forests
from sklearn.tree import DecisionTreeClassifier #Single tree decisions 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier #allows for confidence scores to be predicted for each

np.set_printoptions(threshold=np.inf) #full array printing

### Clean dataset in pandas

In [19]:
def Clean_data():
    """ Input:      file        The dataset to read

        Returns:    Input       Dataframe with of input features for training
                    Output      Dataframe of class labels for each instance in Input

        Create, clean and convert dataset E2.csv to PD dataframe. Drops uneeded columns, removes blank spaces, 
        and applies "One Hot Encoding" to convert PD/SNP to 1/0
    """

    df = pd.read_csv('E2.csv')

    #Remove unrequired column, replace blank spaces, reset index to run from 0
    df.drop(['pdbcode:chain:resnum:mutation'], axis=1, inplace=True)
    df.replace(' ', '_', regex=True, inplace=True)
    df.reset_index(drop=True, inplace = True)

    Input = df.drop('dataset', axis =1).fillna('0') #Should remove the row 
    Output_encoded = pd.get_dummies(df, columns=['dataset']) #Encode the PD and SNP columns
    Output = Output_encoded['dataset_pd'].copy().astype('int32') #PD = 1, SNP = 0

    return Input, Output

### Split into training and testing, generate RF (whole dataset)

In [20]:
def train(Input, Output):
    """ Input:      Input           Dataframe with of input features for training
                    Output          Dataframe of class labels for each instance in Input

        Returns:    Input_train     Features training data
                    Input_test      Features test data
                    Classes_train   Class label training data
                    Classes_test    Class label test data

        80% training and 20% testing split. Strartify ensures fixed poportion of labels are in both sets. 
        Random forest defined as RFC with 1000 trees, seed = 42. Outputs the training data to files.
        """

    Input_train, Input_test, Classes_train, Classes_test = train_test_split(Input, Output, train_size = 0.8, random_state=42, stratify=Output) 
    RFC = RandomForestClassifier(random_state = 42, n_estimators = 1000, verbose = 1)
    RFC.fit(Input_train, Classes_train)

    with open('Training Data.txt', 'w') as file: #Writes class labels for all instances to text file
        file.write(Input_train.to_string())
    with open('Class labels.txt', 'w') as file: #Writes class labels for all instances to text file
        file.write(Classes_train.to_string())
    with open('Test Data.txt', 'w') as file: #Writes class labels for all instances to text file
        file.write(Input_test.to_string())
    with open('Test labels.txt', 'w') as file: #Writes class labels for all instances to text file
        file.write(Classes_test.to_string())

    return RFC, Input_test, Classes_test, Input_train, Classes_train

### Initial evaluation

In [22]:
def test(RFC, Input_test, Classes_test):
    """ Input:  Input_test      Features test data
                Classes_test    Class label test data

        Evaluates the training data. Random forest classifier makes prediction using the test features. True values 
        are the class labels testing data
    """

    Output_pred = RFC.predict(Input_test) #Always perdict on the unseen test data, as train has been used by the estimastor
    print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Output_pred)}")
    print(f"{classification_report(Classes_test, Output_pred)}\nMCC                {matthews_corrcoef(Classes_test, Output_pred)}")


In [23]:
Input, Output = Clean_data()
RFC, Input_test, Classes_test, Input_train, Classes_train = train(Input, Output)
test(RFC,Input_test, Classes_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    4.1s finished


Confusion Matrix:
 [[145  78]
 [ 27 424]]
              precision    recall  f1-score   support

           0       0.84      0.65      0.73       223
           1       0.84      0.94      0.89       451

    accuracy                           0.84       674
   macro avg       0.84      0.80      0.81       674
weighted avg       0.84      0.84      0.84       674

MCC                0.6371468255225344


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished


In [7]:
a = 0
b = 0
for i in Classes_test:
    if i == 0:
        a = a + 1
    if i == 1:
        b = b + 1
print(f"{a} SNP samples")
print(f"{b} PD samples")

223 SNP samples
451 PD samples


### Balancing

In [24]:
def find_minority_class(classData):
    """ Input:    classData  Array of class labels
        Returns:  minClass   The label for the minority class
                  minSize    The number of items in the minority class
                  maxSize    The number of items in the majority class
    Finds information about the inbalance in class sizes
    """
    
    Minority_count = 0
    Majority_count = 0
    for datum in classData:
        if datum == 1:
            Majority_count += 1
        elif datum == 0:
            Minority_count += 1

    minClass = 0
    minSize = Minority_count
    maxSize = Majority_count
    if Minority_count > Majority_count:
        minClass = 1
        minSize = Majority_count
        maxSize = Minority_count

    return minClass, minSize, maxSize

In [25]:
""" Input:      maxSize     The number of items in the majority class
                minSize     The number of items in the minority class
                
    Returns:    BF          Number of balancing folds
    
    Calculate the number of balancing folds needed using ratio of majority to minority class size. Double to ensure sufficient
    majority class instances are sampled, then + 1 to make odd to allow weighted vote.
"""
def Balance_ratio(maxSize, minSize): 
    Divide = maxSize/minSize
    BF = (2 * round(Divide)) + 1 #Double ratio to nearest integer
    return BF

In [26]:
def balance(inData, classData, minClass, minSize):
    """ Input:    inData          array of input data
                  classData       array of classes assigned
                  minorityClass   class label for the minority class
                  minoritySize    size of the minority class
                  
         Returns: array of indexes that are of interest for a 
                  balanced dataset

    Perform the actual balancing between SNPs and PDs
    """
    
    usedLines = [False] * len(inData) #Array of false for length of data
    for i in range(len(inData)):
        if classData[i] == minClass:
            usedLines[i] = True
            
    usedCount = 0
    while usedCount < minSize:
        i = rd.randrange(len(inData))
        if usedLines[i] == False:
            usedCount += 1
            usedLines[i] = True       

    return usedLines

In [27]:
def balance_data(inData, classData, usedLines):
    """ Input:     inData      array of input training data
                   classData   array of classes assigned to training data
                   usedLines   array of line indexes to print
                
        Returns:   Input_balance  Array of balanced input training data
                   Label_balance  Array of balanced classes assigned to training data

    Create arrays for the input training data and its corresponding classes, as needed for predicting the probability.
    The index [i] is the identifier between the two arrays
    """
    Input_balance = []
    Label_balance = []
    for i in range(len(inData)):
        if usedLines[i]:
            Input_balance.append(inData[i])
            Label_balance.append(classData[i])
            
    Input_balance = np.stack(Input_balance, axis =0)
    Label_balance = np.stack(Label_balance, axis =0)
    
    return Input_balance, Label_balance

### Balance for n folds

In [28]:
""" Input:      BF                Number of balancing folds needed
                usedLines         Array of line indexes to print
                Input_balance     Input_balance  Array of balanced input training data
                Label_balance     Array of balanced classes assigned to training data

    Returns:    Input_folds       List of 5 balanced arrays of training data
                Output_folds      List of 5 balanced arrays of training data's labels

    Performs the balance_data() function n number of balancing fold times. Returns lists for training data and labels
    where each item is the output of balance_data()
"""
def Balance_Folds(BF, usedLines, Input_balance, Label_balance):
    Input_folds = []
    Output_folds = []
    for fold in range(BF):
        Input_folds.append(Input_balance)
        Output_folds.append(Label_balance)
        
    return Input_folds, Output_folds

### Train RFC on balanced dataset

In [29]:
def BF_training(BF, Input_folds, Output_folds): 
    """ Input:      BF              Number of balancing folds
                    Input_folds     List of 5 balanced arrays for training data
                    Output_folds    List of 5 balanced arrays of training data's labels

        Returns:    BF_RFC          List of RFC's trained on data in each balancing fold

        Creates a model that returns probability predictions for each fold, using Balance_Fold() as input
    """    
    BF_RFC = []
    
    for i in range(BF):
        BF_RFC.append(RandomForestClassifier(random_state = 42, n_estimators = 1000, verbose = 1)) #Defines a Random Forest for each fold. 42 seeds, 1000 trees
        BF_RFC[i].fit(Input_folds[i], Output_folds[i].ravel()) #Generates a random forest for each fold's training data        
        
    return BF_RFC

In [31]:
def BFC_test(BF_RFC, Input_test):
    """ Input:  BF_RFC          List of RFC's trained on data in each balancing fold
                Input_test      20% unseen testing data split before the balancing folds
                
    Returns:    Prob_matrix      2D matrix where the 1st dimension is each subset in balancing fold, 
                                 2nd dimension is predicted probability
    
    Tests the trained RFCs on the test set, then for every instance, outputs the predicted probability for each class
    """
    Prob_matrix = [] #Empty list
    Prob_matrixlist = []
    for i in range(len(BF_RFC)): #step through item in
        Prob_list = BF_RFC[i].predict_proba(Input_test.values)
        Prob_matrix.append(Prob_list)
        
        Output_pred = BF_RFC[i].predict(Input_test)
        print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Output_pred)}")
        print(f"{classification_report(Classes_test, Output_pred)}\nMCC                {matthews_corrcoef(Classes_test, Output_pred)}")    
              
        
    with open('Balanced probabilities.txt', 'w') as f:
        for number, line in zip(range(BF), Prob_matrix ):
            f.write(f"Fold: {number}\n\n{line}\n\n\n")

    return Prob_matrix

### Weighted voting

In [None]:
""" Input:      Prob_matrix     2D matrix where the 1st dimension is each subset in balancing fold, 
                                2nd dimension is predicted probability
                BF              Number of balancing folds
                
    Returns:    S_Out           Confidence score for each predictor
    
    Calculates the final confidence score. Binary classification formula
"""
def Score(Prob_matrix, BF):
    BF_prob_PD =[]
    BF_prob_SNP =[]
    for i in range(BF):
        BF_prob_PD.append(Prob_matrix[i][:,1] - Prob_matrix[i][:,0]) #PD - SNP probabilty
        BF_prob_SNP.append(Prob_matrix[i][:,0] - Prob_matrix[i][:,1]) #SNP - PD probabilty     
    
    PD_Sum = 0
    SNP_Sum = 0
    for i in range(BF):
        PD_Sum += BF_prob_PD[i]
        SNP_Sum += BF_prob_SNP[i]
        
    S_Out = np.abs((PD_Sum - SNP_Sum)/BF)
    
    np.savetxt('S_out.txt', S_Out, "%.3f")
    
    return(S_Out) #Returns the final confidence scores


In [None]:
""" Input:      S_Out       Confidence score for each predictor
                
    Returns:    Vote        Number of PDs and SNPs predicted after weighted vote
    
    Calculates the final confidence score
"""
def Final_vote(S_Out):
    
    SNP_Count = 0
    PD_Count = 0
    FinalClass = []
    for i in range(len(S_Out)):
        if S_Out[i] >= 0.5:
            PD_Count += 1
            FinalClass.append(round(S_Out[i]))
        elif S_Out[i] < 0.5:
            SNP_Count += 1
            FinalClass.append(round(S_Out[i]))
    
    return FinalClass


In [32]:
# file = 'E2.csv'
# Input, Output = Clean_data(file)

inData    = pd.DataFrame(Input_train).to_numpy()
classData = pd.DataFrame(Classes_train).to_numpy()

minClass, minSize, maxSize   = find_minority_class(classData)
BF                           = Balance_ratio(maxSize, minSize)
usedLines                    = balance(inData, classData, minClass, minSize)

Input_balance, Label_balance = balance_data(inData, classData, usedLines)
Input_folds, Output_folds    = Balance_Folds(BF, usedLines, Input_balance, Label_balance)

BF_RFC                       = BF_training(BF, Input_folds, Output_folds)
Prob_matrix                  = BFC_test(BF_RFC, Input_test)

# S_Out                        = Score(Prob_matrix, BF)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[

Confusion Matrix:
 [[190  33]
 [ 80 371]]
              precision    recall  f1-score   support

           0       0.70      0.85      0.77       223
           1       0.92      0.82      0.87       451

    accuracy                           0.83       674
   macro avg       0.81      0.84      0.82       674
weighted avg       0.85      0.83      0.84       674

MCC                0.6477934984182191


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Confusion Matrix:
 [[190  33]
 [ 80 371]]
              precision    recall  f1-score   support

           0       0.70      0.85      0.77       223
           1       0.92      0.82      0.87       451

    accuracy                           0.83       674
   macro avg       0.81      0.84      0.82       674
weighted avg       0.85      0.83      0.84       674

MCC                0.6477934984182191


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Confusion Matrix:
 [[190  33]
 [ 80 371]]
              precision    recall  f1-score   support

           0       0.70      0.85      0.77       223
           1       0.92      0.82      0.87       451

    accuracy                           0.83       674
   macro avg       0.81      0.84      0.82       674
weighted avg       0.85      0.83      0.84       674

MCC                0.6477934984182191


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Confusion Matrix:
 [[190  33]
 [ 80 371]]
              precision    recall  f1-score   support

           0       0.70      0.85      0.77       223
           1       0.92      0.82      0.87       451

    accuracy                           0.83       674
   macro avg       0.81      0.84      0.82       674
weighted avg       0.85      0.83      0.84       674

MCC                0.6477934984182191
Confusion Matrix:
 [[190  33]
 [ 80 371]]
              precision    recall  f1-score   support

           0       0.70      0.85      0.77       223
           1       0.92      0.82      0.87       451

    accuracy                           0.83       674
   macro avg       0.81      0.84      0.82       674
weighted avg       0.85      0.83      0.84       674

MCC                0.6477934984182191


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.1s finished


In [None]:
# Final_vote(S_Out)) #Always perdict on the unseen test data, as train has been used by the estimastor
Output_pred = Final_vote(S_Out)

print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Output_pred)}")
print(f"{classification_report(Classes_test, Output_pred)}\nMCC                {matthews_corrcoef(Classes_test, Output_pred)}")