## Import library

In [1]:
""" Example 2 is inbalanced data set; ~2200 in PD and ~1100 in SNP
    Goal is to predict if mutation is SNP or PD
    ImprovedBalancing branch
    
    Total samples: 3368
    2254 PD samples
    1111 SNP samples
"""

' Example 2 is inbalanced data set; ~2200 in PD and ~1100 in SNP\n    Goal is to predict if mutation is SNP or PD\n    ImprovedBalancing branch\n    \n    Total samples: 3368\n    2254 PD samples\n    1111 SNP samples\n'

In [2]:
# Example 2 is inbalanced data set; ~2200 in PD and ~1100 in SNP
# Goal is to predict if protein is a SNP or PD
#ImprovedBalancing branch

#Imports the required libraries and packages
import pandas as pd  #Import for data manipulation in dataframes
import numpy as np  #Array manipulation and calculates mean

import random as rd
import time
import sys

from sklearn import tree

from sklearn.metrics import(
    matthews_corrcoef,  # CC for evaluation
    f1_score,  #F1 score for evaluation
    balanced_accuracy_score, roc_auc_score, make_scorer,  #Scoring metrics
    confusion_matrix,  #Creates the confusion matrix - stats on how accurate the test set output is
    classification_report #Returns the F1 socre, precision, and recall of a prediction using a given model
    )
from sklearn.model_selection import(
    train_test_split,  # Splits data frame into the training set and testing set
    GridSearchCV,  # Cross validation to improve hyperparameters
    StratifiedKFold
        )
from sklearn.ensemble import RandomForestClassifier #SK learn API for classificastion random forests
from sklearn.tree import DecisionTreeClassifier #Single tree decisions 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle #shuffles rows
from sklearn.neighbors import KNeighborsClassifier #allows for confidence scores to be predicted for each

np.set_printoptions(threshold=np.inf) #full array printing

## Random Seed

In [4]:
def Random_Seed(): #Generates a random seed
    random1 = rd.randrange(1,100) #Random integet between 1 and 100
    random2 =  time.time() #Time since UTC epoch
    Seed = int(random2//random1//1000)
    return Seed
Random_Seed()

16904

## Read the whole dataset - revis

In [129]:
#Create, clean and convert dataset E2.csv to PD dataframe**
df = pd.read_csv('E2.csv')  #Create PD data frame from .csv
df.drop(['pdbcode:chain:resnum:mutation'], axis=1, inplace=True)  #Removes unrequired columns. PDBcode may be needed for manual validation 
df.columns = df.columns.str.replace(' ', '_')  # Removes any blank attributes
df.replace(' ', '_', regex=True, inplace=True)  # Replace all blank spaces with underscore (none were present)
df.reset_index(drop=True, inplace = True) #Resets index numbering from 0 and drops column
Input = df.drop('dataset', axis =1).fillna('0') #DF of input instances for classification training. Unknown attributes assigned 0
Output_encoded = pd.get_dummies(df, columns=['dataset']) #One hot encoding dataset column so "PD" and "SNP" attributes are numerical 0 or 1
Output = Output_encoded['dataset_pd'].copy().astype('int32') #Dataframe with 1 column where 1 = PD, 0 = SNP, integer

print("Total samples:", len(df))
Majority = len(df.loc[df['dataset'] == 'pd'])
print(f"{Majority} PD samples")
Minority = len(df.loc[df['dataset'] == 'snp'])
print(f"{Minority} SNP samples")

Total samples: 3368
2254 PD samples
1111 SNP samples


## Split into training and testing, generate RF (whole dataset)

In [130]:
Input_train, Input_test, Classes_train, Classes_test = train_test_split(Input, Output, train_size = 0.8, random_state=42, stratify=Output) #80% training and 20% testing split. Strartify ensures fixed poportion of output labels is in both sets. Input attributes and class labels, training attributes and class label etc
start=time.time() #Start timer for inital training model building
RFC = RandomForestClassifier(random_state = 42, n_estimators = 1000, verbose = 1) #Defines the Random Forest. 42 seeds, 1000 trees
RFC.fit(Input_train, Classes_train) #Generates a random forest from the training data
with open('Training Data.txt', 'w') as file: #Writes class labels for all instances to text file
    file.write(Input_train.to_string())
    
with open('Class labels.txt', 'w') as file: #Writes class labels for all instances to text file
    file.write(Classes_train.to_string())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    4.6s finished


### Training (revisit params)

In [131]:
# StandardScaler().fit(X_train).transform(X_train) #Scales data 
# pipeline = make_pipeline( #Sets the random forest parameters
#     StandardScaler(),
#     LogisticRegression(solver='saga', max_iter=2000),
#     verbose=2
# )
RFC.get_params()
# Evaluation of training before weighted vote

Output_pred = RFC.predict(Input_test) #Always perdict on the unseen test data, as train has been used by the estimastor
print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Output_pred)}")
print(f"MCC:\n {matthews_corrcoef(Classes_test, Output_pred)}")
print("F1:\n", f1_score(Classes_test, Output_pred))
print(classification_report(Classes_test, Output_pred))

Confusion Matrix:
 [[145  78]
 [ 27 424]]
MCC:
 0.6371468255225344
F1:
 0.8898216159496328
              precision    recall  f1-score   support

           0       0.84      0.65      0.73       223
           1       0.84      0.94      0.89       451

    accuracy                           0.84       674
   macro avg       0.84      0.80      0.81       674
weighted avg       0.84      0.84      0.84       674



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.1s finished


In [None]:
def print_data(inData, classData, usedLines):
    """ Input:  inData    array of input data
                classData array of classes assigned
                usedLines array of line indexes to print

    Prints the selected lines
    """
    for idx in range(len(inData)):
        if usedLines[idx]:
            print(inData[idx]+","+classData[idx])

### Find majority:minority ratio and number of balancing folds

In [132]:
def Balance_ratio(Major, Minor): #Finds ratio between the 2 classes (i.e the imbalance) and the number of folds required
    Divide = Majority/Minority
    if Divide <= 1:
        Ratio = 1/Divide
    else:
        Ratio = Divide
        
    if round(Ratio) % 2 == 0:
        BF = 2 * round(Ratio) + 1
    else:
        BF = round(Ratio)
        
    print(f"Imbalance ratio:\n{Ratio}:1\n{BF} balancing folds needed.")
    return BF

BF = Balance_ratio(Major = Majority, Minor = Minority)

Imbalance ratio:
2.028802880288029:1
5 balancing folds needed.


### Balancing via array index

In [144]:
def Minority_length(labels): #Finds the minority class size
    SNP = 0
    PD = 0
    for i in labels:
        if i == 0:
            SNP +=1
        if i == 1:
            PD +=1
    return SNP #returns the minority class length. [891]

labels = Classes_train

In [174]:
inData = pd.DataFrame(Input_train).to_numpy()
classData = pd.DataFrame(Classes_train).to_numpy()
minorityClass = 0
minoritySize = Minority_length(labels)

(2694, 1)

In [148]:
def Balance(Array,length, labels): #Balances the dataset 
    
    count_0 = 0
    count_1 = 0
    Indices = []
    Array_loop = shuffle(Array)
    
    for i in Array_loop:
        index = int(i[0:1]) #element index
        values = i[1:] #element values

        if values == 0 and count_0 < length:
            count_0 += 1
            Indices.append(index)
        elif values == 1 and count_1 < length:
            count_1 += 1
            Indices.append(index)
            
#     print(f"PD:{count_1}\nSNP:{count_0}\nTotal:{count_1+count_0}")

    return Indices #Returns a list of indicies with random, equal SNPs and PDs. [891]

Array = np.column_stack((labels.index, labels)) #Array of class labels. 1 = PD = majority, 0 = SNP = minority
length = Minority_length(labels)

### Train balanced data on random forest model

In [146]:
def n_balance_folds(BF, nfold):
    Indices_list = []
    for i in range(BF):
        nfold = Balance(length, labels) 
        Indices_list.append(nfold)
    return Indices_list


In [9]:
# def Balancing_Fold(Fold, BF): #Output n number of balance folds instances as a list
#     list_index_BF = []
#     for i in range(BF):
#         Fold #Call balanced data, different for each loop  
#         list_index_BF.append(Fold)
#     return list_index_BF

In [126]:
n_balance_folds(BF, nfold)

[[2417,
  339,
  2071,
  2888,
  1462,
  1253,
  1608,
  442,
  1588,
  1817,
  2874,
  838,
  496,
  2222,
  423,
  2662,
  699,
  1269,
  299,
  1017,
  2084,
  3221,
  1314,
  2209,
  2961,
  1146,
  190,
  2768,
  1808,
  142,
  3282,
  322,
  1098,
  668,
  1846,
  1792,
  1180,
  2024,
  560,
  1205,
  8,
  766,
  3155,
  518,
  1514,
  1783,
  409,
  1273,
  2535,
  2331,
  1711,
  3103,
  3183,
  3318,
  963,
  1538,
  539,
  794,
  3291,
  341,
  372,
  410,
  376,
  2454,
  266,
  155,
  2810,
  960,
  915,
  2140,
  1398,
  2320,
  638,
  1058,
  1873,
  2033,
  3251,
  2547,
  1145,
  3223,
  2952,
  2497,
  1343,
  3060,
  1397,
  358,
  2459,
  1510,
  1133,
  1840,
  337,
  970,
  2641,
  1954,
  2143,
  723,
  3093,
  1364,
  27,
  2986,
  1436,
  645,
  1421,
  2228,
  235,
  176,
  1720,
  1801,
  724,
  1523,
  2857,
  961,
  2352,
  3020,
  382,
  1088,
  2013,
  1010,
  2382,
  1021,
  464,
  1325,
  3100,
  293,
  1125,
  957,
  509,
  544,
  946,
  1950,
  1753,


In [149]:
def BF_training(BF): #Creates a model that returns probability predictions for each fold, using Indices_list as input
    BF_RFC = RandomForestClassifier(random_state = 42, n_estimators = 1000, verbose = 1) #Defines the Random Forest. 42 seeds, 1000 trees
    Prob_list = []
    Prob_liststr = [] #for file
    BF_data = []
    
    for i in range(BF):
        Folds = Balance(Array, length, labels)
        Input = Input_train.filter(Folds, axis = 0)
        Output = Classes_train.filter(Folds, axis = 0)
        
        combined = pd.concat([Input, Output], axis =1) #Combines input and output data so can be displayed easily
        BF_data.append(combined)
        
        BF_RFC.fit(Input, Output) #Generates a random forest for each fold's training data
        Prob = BF_RFC.predict_proba(Input) #Predicted class label from input training data
        
        combined[['SNP', 'PD']] = Prob
        Prob = combined.drop(labels=['Binding','SProtFT0','SProtFT1','SProtFT2','SProtFT3','SProtFT4','SProtFT5','SProtFT6','SProtFT7','SProtFT8','SProtFT9','SProtFT10','SProtFT11','SProtFT12','Interface','Relaccess','Impact','HBonds','SPhobic','CPhilic','BCharge','SSGeom','Voids','MLargest1','MLargest2','MLargest3','MLargest4','MLargest5','MLargest6','MLargest7','MLargest8','MLargest9','MLargest10','NLargest1','NLargest2','NLargest3','NLargest4','NLargest5','NLargest6','NLargest7','NLargest8','NLargest9','NLargest10','Clash','Glycine','Proline','CisPro','dataset_pd'], axis=1, inplace=False)
        #Becomes a list
        Prob_list.append(Prob) #List with probabilites for all instances, with indicies
        Prob_liststr.append(Prob.to_string()) #List with probabilites for all instances as string

    with open('Balanced probabilities.txt', 'w') as f:
        for number, line in zip(range(BF), Prob_liststr):
            f.write(f"Fold: {number}\n\n{line}\n\n\n")
        
    with open('Balanced training data.txt', 'w') as f:
        for number, fold in zip(range(BF), BF_data):
            f.write(f"Fold: {number}\n\n{fold}\n\n\n")
                
    return Prob_list #Returns n number of randomly balanced dataframes

### Weighted voting

In [150]:
#Apply weighted vote scheme for a predictor that outputs confidence value between 0 and 1 for each class. 
def Weighted_Proba(BF_Prob, BF):

    # Identify which instance appear in all folds, using the nth fold (random number)(change for all values)
    intersect_index = set(BF_Prob[0].index.values) #index of all 1782 instances in first fold
    
    for i in range(BF): #for all 5 folds
        intersect_index = intersect_index.intersection(set(BF_Prob[i].index.values)) #intersection checks if instances are in all folds
    intersect_index_list = list(intersect_index) #converts set to list, 951 items
    
    BF_common = []
    for i in range(BF): #for all 5 folds
        BF_Prob_instance = BF_Prob[i].loc[intersect_index_list,:] #Returns each df with the common instances
        BF_common.append(BF_Prob_instance)
        
  
    return BF_common #Returns the common instances

In [151]:
def Score(Instance, BF):
    BF_prob_PD =[]
    BF_prob_SNP =[]
    for i in range(BF):
        BF_prob_PD.append(Instance.iloc[:,1] - Instance.iloc[:,0]) #PD - SNP prob
        BF_prob_SNP.append(Instance.iloc[:,0] - Instance.iloc[:,1]) #SNP - PD prob     
    
    PD_Sum = 0
    for i in range(BF):
        PD_Sum += BF_prob_PD[i]
        
    SNP_Sum = 0
    for i in range(BF):
        SNP_Sum += BF_prob_SNP[i]
        
    S_Output = abs((PD_Sum - SNP_Sum)/(len(range(BF) * 2)))
    
    S_out = numpy.format_float_scientific(S_Output)
    
    string = S_out.to_string
    with open('S_out.txt', 'w') as f:
        f.write(string)
    
    return(S_out) #Returns the final confidence scores

# BF = Balance_ratio(Major = Majority, Minor = Minority)
# Score = (Instance = Weighted_Proba(BF_Prob = BF_training(BF), BF = BF), BF = BF)

In [None]:
def Score(Prob_matrix, BF):
    BF_prob_PD =[]
    BF_prob_SNP =[]
    for i in range(BF):
        BF_prob_PD.append(Prob_matrix[i][:,1] - Prob_matrix[i][:,0]) #PD - SNP probabilty
        BF_prob_SNP.append(Prob_matrix[i][:,0] - Prob_matrix[i][:,1]) #SNP - PD probabilty     
    
    PD_Sum = 0
    SNP_Sum = 0
    for i in range(BF):
        PD_Sum += BF_prob_PD[i]
        SNP_Sum += BF_prob_SNP[i]
        
    S_Out = np.abs((PD_Sum - SNP_Sum)/BF)
    
    np.savetxt('S_out.txt', S_Out, "%.3f")
    
    return(S_Out) #Returns the final confidence scores

In [13]:
# BF = Balance_ratio(Major = Majority, Minor = Minority)
# Instance = Weighted_Proba(BF_Prob = BF_training(BF), BF = BF)

In [155]:
# #Final vote
def Final_vote(Instance, BF):

    SNP = 0
    PD = 0
    for i in range(BF): 
        SNP += Instance[i].iloc[:,0]
        PD += Instance[i].iloc[:,1]

    SNP_Count = 0
    PD_Count = 0
    for i in range(len(SNP)):
        if PD.to_numpy()[i] > SNP.to_numpy()[i]:
            PD_Count += 1
        elif SNP.to_numpy()[i] > PD.to_numpy()[i]:
            SNP_Count += 1

    final = print(f"{PD_Count} samples predicted to be PD\n{SNP_Count} samples predicted to be SNP")
    
    return final  
    # #Evaluation of training after weighted vote
    # Classes_pred = RFC.predict(Input_test)
    # print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Classes_pred)}")
    # print(f"MCC:\n {matthews_corrcoef(Classes_test, Classes_pred)}")
    # print("F1:\n", f1_score(Classes_test, Classes_pred))

In [15]:
# TODO: retrive the probability from each tree for a single sample  

In [153]:
BF = Balance_ratio(Major = Majority, Minor = Minority)
Final_vote(Instance = Weighted_Proba(BF_Prob = BF_training(BF), BF = BF), BF = BF)

Imbalance ratio:
2.028802880288029:1
5 balancing folds needed.


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    2.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[

42 samples predicted to be PD
891 samples predicted to be SNP


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.2s finished


### Validation

In [18]:
# # **Split data into training and test**
# with open('SNPorPD.txt', 'w+') as f:
#         data=f.read()
#         f.write(str(y_test.to_string()))

# # pipeline.fit(X, y) #applies list if transformers to give a fitted model

# plt.scatter(Classes_test, Output_pred)

In [None]:
gridsearch = GridSearchCV( #validation
    estimator = LogisticRegression(solver='saga'),
    param_grid = {}, #dictionary of parameters to search through
    cv = StratifiedKFold(),
    n_jobs = 1, #how many processors to run in parallel
    scoring = 'f1',
    verbose = 3 
    ).fit(X_train, y_train)

In [None]:
# y_pred = clf.predict(X_test)
# print("Training time:", stop-start)
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# print("MCC:\n", matthews_corrcoef(y_test, y_pred))
# print("F1:\n", f1_score(y_test, y_pred))