## Import library

In [1]:
#Example 2 is inbalanced data set; ~2200 in PD and ~1100 in SNP
#Goal is to predict if mutation is SNP or PD
#ImprovedBalancing branch

#Imports the required libraries and packages
import pandas as pd  #Import for data manipulation in dataframes
import numpy as np  #Array manipulation and calculates mean

import random as rd
import time

from sklearn.metrics import(
    matthews_corrcoef,  # CC for evaluation
    f1_score,  #F1 score for evaluation
    confusion_matrix,  #Creates the confusion matrix - stats on how accurate the test set output is
    classification_report #Returns the F1 socre, precision, and recall of a prediction using a given model
    )
from sklearn.model_selection import(
    train_test_split,  # Splits data frame into the training set and testing set
    GridSearchCV,  # Cross validation to improve hyperparameters
    StratifiedKFold
        )
from sklearn.ensemble import RandomForestClassifier #SK learn API for classificastion random forests
from sklearn.tree import DecisionTreeClassifier #Single tree decisions 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier #allows for confidence scores to be predicted for each

np.set_printoptions(threshold=np.inf) #full array printing

## Random Seed function

In [2]:
def Random_Seed(): #Generates a random seed
    random1 = rd.randrange(1,100) #Random integet between 1 and 100
    random2 =  time.time() #Time since UTC epoch
    Seed = int(random2//random1//1000)
    return Seed
Random_Seed()

53907

## Clean dataset in pandas

In [3]:
#Create, clean and convert dataset E2.csv to PD dataframe**
df = pd.read_csv('E2.csv')  #Create PD data frame from .csv
df.drop(['pdbcode:chain:resnum:mutation'], axis=1, inplace=True)  #Removes unrequired columns. PDBcode may be needed for manual validation 
df.columns = df.columns.str.replace(' ', '_')  # Removes any blank attributes
df.replace(' ', '_', regex=True, inplace=True)  # Replace all blank spaces with underscore (none were present)
df.reset_index(drop=True, inplace = True) #Resets index numbering from 0 and drops column
Input = df.drop('dataset', axis =1).fillna('0') #DF of input instances for classification training. Unknown attributes assigned 0
Output_encoded = pd.get_dummies(df, columns=['dataset']) #One hot encoding dataset column so "PD" and "SNP" attributes are numerical 0 or 1
Output = Output_encoded['dataset_pd'].copy().astype('int32') #Dataframe with 1 column where 1 = PD, 0 = SNP, integer

print("Total samples:", len(df))
Majority = len(df.loc[df['dataset'] == 'pd'])
print(f"{Majority} PD samples")
Minority = len(df.loc[df['dataset'] == 'snp'])
print(f"{Minority} SNP samples")

Total samples: 3368
2254 PD samples
1111 SNP samples


## Split into training and testing, generate RF (whole dataset)

In [4]:
Input_train, Input_test, Classes_train, Classes_test = train_test_split(Input, Output, train_size = 0.8, random_state=42, stratify=Output) #80% training and 20% testing split. Strartify ensures fixed poportion of output labels is in both sets. Input attributes and class labels, training attributes and class label etc
start=time.time() #Start timer for inital training model building
RFC = RandomForestClassifier(random_state = 42, n_estimators = 1000, verbose = 1) #Defines the Random Forest. 42 seeds, 1000 trees
RFC.fit(Input_train, Classes_train) #Generates a random forest from the training data
with open('Training Data.txt', 'w') as file: #Writes class labels for all instances to text file
    file.write(Input_train.to_string())
    
with open('Class labels.txt', 'w') as file: #Writes class labels for all instances to text file
    file.write(Classes_train.to_string())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    4.0s finished


### Training (revisit params)

In [37]:
# StandardScaler().fit(X_train).transform(X_train) #Scales data 
# pipeline = make_pipeline( #Sets the random forest parameters
#     StandardScaler(),
#     LogisticRegression(solver='saga', max_iter=2000),
#     verbose=2
# )
RFC.get_params()
# Evaluation of training before weighted vote

Output_pred = RFC.predict(Input_test) #Always perdict on the unseen test data, as train has been used by the estimastor
print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Output_pred)}")
print(f"{classification_report(Classes_test, Output_pred)}\nMCC                {matthews_corrcoef(Classes_test, Output_pred)}")


Confusion Matrix:
 [[145  78]
 [ 27 424]]
              precision    recall  f1-score   support

           0       0.84      0.65      0.73       223
           1       0.84      0.94      0.89       451

    accuracy                           0.84       674
   macro avg       0.84      0.80      0.81       674
weighted avg       0.84      0.84      0.84       674

MCC                0.6371468255225344


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished


# Balancing

In [None]:
# def Minority_length(labels): #Finds the minority class size
#     SNP = 0
#     PD = 0
#     for i in labels:
#         if i == 0:
#             SNP +=1
#         if i == 1:
#             PD +=1
#     return SNP #returns the minority class length. [891]

In [7]:
def find_minority_class(classData):

    Minority_count = 0
    Majority_count = 0
    for datum in classData:
        if datum == 1:
            Majority_count += 1
        elif datum == 0:
            Minority_count += 1

    minClass = 0
    minSize = Minority_count
    maxSize = Majority_count
    if Minority_count > Majority_count:
        minClass = 1
        minSize = Majority_count
        maxSize = Minority_count

    return minClass, minSize, maxSize

In [8]:
def balance(inData, classData, minClass, minSize):
    
    usedLines = [False] * len(inData) #sets used lines as 0
    for i in range(len(inData)):
        if classData[i] == minClass:
            usedLines[i] = True
        else:
            usedLines[i] = False
    usedCount = 0
    while usedCount < minSize:
        i = rd.randrange(len(inData))
        if usedLines[i] == False:
            usedCount += 1
            usedLines[i] = True       

    return usedLines

In [9]:
def print_data(inData, classData, usedLines):
    """ Input:  inData    array of input training data
                classData array of classes assigned to training data
                usedLines array of line indexes to print

    Prints the selected lines
    """
    Input_fold = []
    Label_fold = []
    for i in range(len(inData)):
        if usedLines[i]:
            Input_fold.append(inData[i])
            Label_fold.append(classData[i])
            
    Input_fold = np.stack(Input_fold, axis =0)
    Label_fold = np.stack(Label_fold, axis =0)
    
    return Input_fold, Label_fold

### Find majority:minority ratio and number of balancing folds

In [13]:
def Balance_ratio(): #Finds ratio between the 2 classes (i.e the imbalance) and the number of folds required
    Divide = maxSize/minSize
    if round(Divide) % 2 == 0:
        BF = 2 * round(Divide) + 1
    else:
        BF = round(Divide)
    return BF

### Define function arguments (find better method)

In [14]:
def Balance_Folds():
    fold_input = []
    fold_output = []
    for fold in range(BF):
        usedLines = balance(inData, classData, minClass, minSize)
        Input, Output = print_data(inData, classData, usedLines)
        fold_input.append(Input)
        fold_output.append(Output)
    return fold_input, fold_output
        

In [19]:
inData = pd.DataFrame(Input_train).to_numpy()
classData = pd.DataFrame(Classes_train).to_numpy()
minClass, minSize, maxSize = find_minority_class(classData)
BF = Balance_ratio()

fold_input, fold_output = Balance_Folds()

### Train balanced data on random forest model

In [20]:
def BF_training(BF): #Creates a model that returns probability predictions for each fold, using Indices_list as input
    
    BF_RFC = RandomForestClassifier(random_state = 42, n_estimators = 1000, verbose = 1) #Defines the Random Forest. 42 seeds, 1000 trees
    Prob_list = []
    
    for i in range(BF):
        BF_RFC.fit(fold_input[i], fold_output[i].ravel()) #Generates a random forest for each fold's training data
        Prob = BF_RFC.predict_proba(fold_input[i]) #Predicted class label from input training data
        Prob_list.append(Prob)
        
        with open('Balanced probabilities.txt', 'w') as f:
            for number, line in zip(range(BF), Prob_list):
                f.write(f"Fold: {number}\n\n{line}\n\n\n")
        
    return Prob_list

In [21]:
Prob_list = BF_training(BF)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[

### Weighted voting

In [23]:
def Score(Instance, BF):
    BF_prob_PD =[]
    BF_prob_SNP =[]
    for i in range(BF):
        BF_prob_PD.append(Instance[i][:,1] - Instance[i][:,0]) #PD - SNP prob
        BF_prob_SNP.append(Instance[i][:,0] - Instance[i][:,1]) #SNP - PD prob     
    
    PD_Sum = 0
    SNP_Sum = 0
    for i in range(BF):
        PD_Sum += BF_prob_PD[i]
        SNP_Sum += BF_prob_SNP[i]
        
    S_Out = np.abs((PD_Sum - SNP_Sum)/(BF * 2))
    
    np.savetxt('S_out.txt', S_Out, "%.3f")
    
    return(S_Out) #Returns the final confidence scores


In [24]:
#Final vote
def Final_vote(Value):
    
    SNP_Count = 0
    PD_Count = 0
    BalancedValues = []
    for i in range(len(Value)):
        if Value[i] > 0.5:
            PD_Count += 1
            BalancedValues.append(round(Value[i]))
        elif Value[i] < 0.5:
            SNP_Count += 1
            BalancedValues.append(round(Value[i]))


    final = print(f"{PD_Count} samples predicted to be PD\n{SNP_Count} samples predicted to be SNP")
    
    return BalancedValues  
    # #Evaluation of training after weighted vote
    # Classes_pred = RFC.predict(Input_test)
    # print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Classes_pred)}")
    # print(f"MCC:\n {matthews_corrcoef(Classes_test, Classes_pred)}")
    # print("F1:\n", f1_score(Classes_test, Classes_pred))

In [None]:
# TODO: retrive the probability from each tree for a single sample  

In [51]:
Value = Score(Instance = Prob_list, BF = BF)

vote = Final_vote(Value)

Array = np.stack(vote, axis =0)

324 samples predicted to be PD
1455 samples predicted to be SNP


In [27]:
Output_pred = Array #Always perdict on the unseen test data, as train has been used by the estimastor
print(f"Confusion Matrix:\n {confusion_matrix(Classes_test, Output_pred)}")
print(f"{classification_report(Classes_test, Output_pred)}\nMCC                {matthews_corrcoef(Classes_test, Output_pred)}")

ValueError: Found input variables with inconsistent numbers of samples: [674, 1779]

### Validation

In [None]:
# # **Split data into training and test**
# with open('SNPorPD.txt', 'w+') as f:
#         data=f.read()
#         f.write(str(y_test.to_string()))

# # pipeline.fit(X, y) #applies list if transformers to give a fitted model

# plt.scatter(Classes_test, Output_pred)

In [None]:
gridsearch = GridSearchCV( #validation
    estimator = LogisticRegression(solver='saga'),
    param_grid = {}, #dictionary of parameters to search through
    cv = StratifiedKFold(),
    n_jobs = 1, #how many processors to run in parallel
    scoring = 'f1',
    verbose = 3 
    ).fit(X_train, y_train)

In [None]:
# y_pred = clf.predict(X_test)
# print("Training time:", stop-start)
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# print("MCC:\n", matthews_corrcoef(y_test, y_pred))
# print("F1:\n", f1_score(y_test, y_pred))