### Import library

In [None]:
""" Example 2 is an imbalanced data set; ~2200 in PD and ~1100 in SNP
    Goal is to predict if mutation is SNP or PD
    ImprovedBalancing branch
    
    Total samples: 3368
    2254 PD samples
    1111 SNP samples
"""

In [1]:


""" Imports the required libraries and packages
"""

import pandas as pd  #Import for data manipulation in dataframes
import numpy as np  #Array manipulation and calculates mean

import random as rd

from sklearn.metrics import(
    matthews_corrcoef,  # CC for evaluation
    f1_score,  #F1 score for evaluation
    confusion_matrix,  #Creates the confusion matrix - stats on how accurate the test set output is
    classification_report #Returns the F1 socre, precision, and recall of a prediction using a given model
    )
from sklearn.model_selection import(
    train_test_split,  # Splits data frame into the training set and testing set
    GridSearchCV,  # Cross validation to improve hyperparameters
    StratifiedKFold
        )
from sklearn.ensemble import RandomForestClassifier #SK learn API for classificastion random forests
from sklearn.tree import DecisionTreeClassifier #Single tree decisions 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier #allows for confidence scores to be predicted for each

np.set_printoptions(threshold=np.inf) #full array printing

### Clean dataset in pandas

In [2]:
""" Reads E2.csv and builds the feature table and the class labels.

    Produces:   df              The cleaned dataset
                Input           Dataframe of input features for training
                Output_encoded  df with the 'dataset' column one-hot encoded
                Output          Class label for each instance in Input (1 where dataset is 'pd', otherwise 0)

    Drops the identifier column, replaces blank spaces with underscores in both the column names and the
    cell values, and applies "One Hot Encoding" to convert PD/SNP to 1/0.
"""

df = pd.read_csv('E2.csv')
df.drop(['pdbcode:chain:resnum:mutation'], axis=1, inplace=True)
df.columns = df.columns.str.replace(' ', '_')
df.replace(' ', '_', regex=True, inplace=True)
df.reset_index(drop=True, inplace=True)

# Missing feature values are filled with the number 0. Filling with the string '0' would turn
# numeric columns into object columns that mix floats and text.
Input = df.drop('dataset', axis=1).fillna(0)
Output_encoded = pd.get_dummies(df, columns=['dataset'])
Output = Output_encoded['dataset_pd'].copy().astype('int32')


#file = E2.csv
#Input, Output = Clean_data(file)

### Split into training and testing, generate RF (whole dataset)

In [3]:
""" Input:      Input           Dataframe of input features for training
                Output          Class labels for each instance in Input

    Produces:   Input_train     Features training data
                Input_test      Features test data
                Classes_train   Class label training data
                Classes_test    Class label test data
                RFC             Random forest fitted on the training split

    80% training and 20% testing split. Stratify keeps the proportion of labels fixed in both sets.
    Random forest defined as RFC with 1000 trees, seed = 42. Outputs the training data to files.
"""

Input_train, Input_test, Classes_train, Classes_test = train_test_split(
    Input, Output, train_size=0.8, random_state=42, stratify=Output
)

# random.seed() returns None, so passing rd.seed() left the forest unseeded; use the documented seed.
RFC = RandomForestClassifier(random_state=42, n_estimators=1000, verbose=1)
RFC.fit(Input_train, Classes_train)

with open('Training Data.txt', 'w') as out_file:  # Writes the training features to a text file
    out_file.write(Input_train.to_string())
with open('Class labels.txt', 'w') as out_file:  # Writes the training class labels to a text file
    out_file.write(Classes_train.to_string())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    4.2s finished


### Initial evaluation

In [4]:
""" Input:      Input_test      Features test data
                Classes_test    Class label test data

    Evaluates the fitted forest RFC on the held-out split: the classifier predicts from the test features
    and the predictions are compared with the test class labels (confusion matrix, classification report, MCC).
"""

# Always predict on the unseen test data, as the training split has been used by the estimator
Output_pred = RFC.predict(Input_test)

test_confusion = confusion_matrix(Classes_test, Output_pred)
print(f"Confusion Matrix:\n {test_confusion}")

test_report = classification_report(Classes_test, Output_pred)
test_mcc = matthews_corrcoef(Classes_test, Output_pred)
print(f"{test_report}\nMCC                {test_mcc}")


Confusion Matrix:
 [[143  80]
 [ 27 424]]
              precision    recall  f1-score   support

           0       0.84      0.64      0.73       223
           1       0.84      0.94      0.89       451

    accuracy                           0.84       674
   macro avg       0.84      0.79      0.81       674
weighted avg       0.84      0.84      0.83       674

MCC                0.629894066667426


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished


### Balancing

In [5]:
def find_minority_class(classData):
    """ Input:    classData  Iterable of class labels (only labels equal to 0 or 1 are counted)
        Returns:  minClass   The label for the minority class (0 when the two counts are equal)
                  minSize    The number of items in the minority class
                  maxSize    The number of items in the majority class
    Finds information about the imbalance in class sizes
    """

    # Single pass over the labels; anything that is neither 1 nor 0 is ignored.
    tally = {0: 0, 1: 0}
    for label in classData:
        for candidate in (1, 0):
            if label == candidate:
                tally[candidate] += 1
                break

    zeros, ones = tally[0], tally[1]
    if zeros > ones:
        return 1, ones, zeros
    return 0, zeros, ones

In [6]:
def Balance_ratio(maxSize, minSize):
    """ Input:      maxSize     The number of items in the majority class
                    minSize     The number of items in the minority class

        Returns:    BF          Number of balancing folds

    Calculates the number of balancing folds from the rounded majority/minority ratio. An odd ratio is
    used as it is; an even ratio r is turned into the odd number 2 * r + 1 to allow a weighted vote.
    """
    ratio = round(maxSize / minSize)
    return ratio if ratio % 2 else 2 * ratio + 1

In [7]:
def balance(inData, classData, minClass, minSize):
    """ Input:    inData      array of input data (only its length is used)
                  classData   array of classes assigned, indexed like inData
                  minClass    class label for the minority class
                  minSize     number of non-minority rows to draw

         Returns: usedLines   list of booleans, one per row of inData; True marks a row
                              that belongs in the balanced dataset

    Performs the actual balancing between SNPs and PDs: every minority-class row is kept and
    minSize rows are drawn at random, without replacement, from the remaining rows.
    """
    n_rows = len(inData)

    # Start from "keep every minority row".
    usedLines = [bool(classData[i] == minClass) for i in range(n_rows)]

    # Draw from the rows not yet selected. Capping the draw at the number of available rows
    # means an over-large minSize (or empty input) selects everything available
    # rather than looping forever or raising.
    available = [i for i, used in enumerate(usedLines) if not used]
    n_draw = max(0, min(minSize, len(available)))
    for i in rd.sample(available, n_draw):
        usedLines[i] = True

    return usedLines

In [8]:
def balance_data(inData, classData, usedLines):
    """ Input:     inData      array of input training data
                   classData   array of classes assigned to training data
                   usedLines   per-row flags; a truthy entry means the row is kept

        Returns:   Input_balance  Array of the kept rows of inData
                   Label_balance  Array of the kept entries of classData

    Builds the balanced training inputs and their matching labels. Row order is preserved, so
    position [i] refers to the same instance in both returned arrays.
    """
    kept_rows = [row for row in range(len(inData)) if usedLines[row]]

    Input_balance = np.stack([inData[row] for row in kept_rows], axis=0)
    Label_balance = np.stack([classData[row] for row in kept_rows], axis=0)

    return Input_balance, Label_balance

### Balance for n folds

In [9]:
def Balance_Folds(BF, usedLines, Input_balance, Label_balance):
    """ Input:      BF                Number of balancing folds needed
                    usedLines         Array of line flags (accepted but not used here)
                    Input_balance     Array of balanced input training data
                    Label_balance     Array of balanced classes assigned to training data

        Returns:    Input_folds       List of BF entries, each one Input_balance
                    Output_folds      List of BF entries, each one Label_balance

    Builds the per-fold training lists. Every fold refers to the very same Input_balance /
    Label_balance objects that were passed in; no new balanced sample is drawn per fold.
    NOTE(review): if each fold is meant to hold a different random sample of the majority class,
    the re-sampling has to happen before this function is called - confirm the intent.
    """
    Input_folds = [Input_balance for _ in range(BF)]
    Output_folds = [Label_balance for _ in range(BF)]

    return Input_folds, Output_folds

In [12]:
# Work on plain NumPy arrays so rows can be addressed by position.
# classData keeps a column shape (n, 1), matching the later .ravel() call when fitting.
inData = Input_train.to_numpy()
classData = Classes_train.to_frame().to_numpy()

# Size of the imbalance and the number of balancing folds it calls for.
minClass, minSize, maxSize = find_minority_class(classData)
BF = Balance_ratio(maxSize=maxSize, minSize=minSize)

# Draw one balanced sample: all minority rows plus minSize randomly chosen majority rows.
usedLines = balance(inData, classData, minClass, minSize)
Input_balance, Label_balance = balance_data(inData, classData, usedLines)

# NOTE(review): only one balanced sample is drawn above, and it is what every fold receives.
Input_folds, Output_folds = Balance_Folds(BF, usedLines, Input_balance, Label_balance)

### Train balanced data on random forest model

In [13]:
def BF_training(BF):
    """ Input:      BF              Number of balancing folds

        Reads:      Input_folds     Module-level list of balanced training arrays, one per fold
                    Output_folds    Module-level list of the matching label arrays

        Returns:    Prob_list       Predicted probability for each class, one array per fold

    Fits a random forest on each fold and collects the class probabilities it predicts for that
    fold's own input rows. The probabilities of all folds are also written to
    'Balanced probabilities.txt'.
    NOTE(review): predict_proba is called on the rows the forest was just fitted on, not on
    held-out data - confirm this is intended.
    """
    # Defines the Random Forest: seed 42, 1000 trees. The same estimator is refitted for every fold.
    BF_RFC = RandomForestClassifier(random_state=42, n_estimators=1000, verbose=1)
    Prob_list = []

    for i in range(BF):
        BF_RFC.fit(Input_folds[i], Output_folds[i].ravel())  # .ravel() flattens the (n, 1) label column
        Prob_list.append(BF_RFC.predict_proba(Input_folds[i]))

    # Written once after training; the file used to be reopened and fully rewritten on every fold.
    if Prob_list:
        with open('Balanced probabilities.txt', 'w') as f:
            for number, line in enumerate(Prob_list):
                f.write(f"Fold: {number}\n\n{line}\n\n\n")

    return Prob_list

In [None]:
# Train on every balancing fold and keep the per-fold class probabilities for the vote.
Prob_list = BF_training(BF)

### Weighted voting

In [None]:
def Score(Instance, BF):
    """ Input:      Instance    List of per-fold probability arrays; column 0 is read as the SNP
                                probability and column 1 as the PD probability
                    BF          Number of balancing folds (the first BF entries of Instance are used)

        Returns:    S_Out       Array with one confidence score per row

    Sums the (PD - SNP) and (SNP - PD) probability margins over the folds and returns
    |PD_Sum - SNP_Sum| / (2 * BF). The scores are also saved to 'S_out.txt' with three decimals.
    NOTE(review): the two sums are negatives of each other, so the result is the absolute mean
    margin; it says how confident the folds are, not which class they favour.
    """
    folds = [Instance[k] for k in range(BF)]

    pd_margins = [fold[:, 1] - fold[:, 0] for fold in folds]    # PD - SNP prob
    snp_margins = [fold[:, 0] - fold[:, 1] for fold in folds]   # SNP - PD prob

    # Built-in sum starts at 0 and adds fold by fold, left to right.
    pd_total = sum(pd_margins)
    snp_total = sum(snp_margins)

    S_Out = np.abs((pd_total - snp_total) / (BF * 2))

    np.savetxt('S_out.txt', S_Out, "%.3f")

    return S_Out  # Returns the final confidence scores


In [None]:
#Final vote
def Final_vote(Value):
    """ Input:      Value            Sequence of scores, one per sample

        Returns:    BalancedValues   List with one label per sample, in the same order as Value:
                                     1 (PD) when the score is above 0.5, otherwise 0 (SNP)

    Turns each score into a class label and prints how many samples fall in each class.
    A score of exactly 0.5 is labelled 0, which is what round(0.5) gives. Such a sample used to be
    left out entirely, making the result shorter than Value and shifting later labels.
    """
    BalancedValues = [1 if score > 0.5 else 0 for score in Value]

    PD_Count = sum(BalancedValues)
    SNP_Count = len(BalancedValues) - PD_Count
    print(f"{PD_Count} samples predicted to be PD\n{SNP_Count} samples predicted to be SNP")

    return BalancedValues
    # TODO: evaluate the voted labels (confusion matrix, MCC, F1) once the vote is made on test data

In [None]:
# TODO: retrieve the probability from each tree for a single sample

In [None]:
# Confidence score per row, combined over all balancing folds.
Value = Score(Prob_list, BF)

# Threshold the scores into class labels.
vote = Final_vote(Value)

# Pack the labels into one array (axis 0 is np.stack's default) and report how many there are.
Array = np.stack(vote)
print(len(Array))

In [None]:
# Always predict on the unseen test data, as the training split has been used by the estimator.
# NOTE(review): this scores RFC, the forest fitted on the whole training split; the labels from
# the balanced vote are not used here.
Output_pred = RFC.predict(Input_test)

test_confusion = confusion_matrix(Classes_test, Output_pred)
print(f"Confusion Matrix:\n {test_confusion}")

test_report = classification_report(Classes_test, Output_pred)
test_mcc = matthews_corrcoef(Classes_test, Output_pred)
print(f"{test_report}\nMCC                {test_mcc}")


### Validation

In [None]:
# # **Split data into training and test**
# with open('SNPorPD.txt', 'w+') as f:
#         data=f.read()
#         f.write(str(y_test.to_string()))

# # pipeline.fit(X, y) #applies list if transformers to give a fitted model

# plt.scatter(Classes_test, Output_pred)

In [None]:
# Validation: stratified cross-validation of a logistic regression on the training split, scored with F1.
# The fit used X_train / y_train, which this notebook never defines; its training split is
# Input_train / Classes_train.
gridsearch = GridSearchCV(
    estimator=LogisticRegression(solver='saga'),
    param_grid={},  # dictionary of parameters to search through (currently empty)
    cv=StratifiedKFold(),
    n_jobs=1,  # how many processors to run in parallel
    scoring='f1',
    verbose=3,
).fit(Input_train, Classes_train)

In [None]:
# y_pred = clf.predict(X_test)
# print("Training time:", stop-start)
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# print("MCC:\n", matthews_corrcoef(y_test, y_pred))
# print("F1:\n", f1_score(y_test, y_pred))