In [None]:
import pandas as pd
import numpy as np

import random
from sklearn import metrics # for the evaluation
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# One array per plate sheet is collected in each list.
listPatient = []
listControl = []

# Both workbooks are read sheet by sheet for sheet numbers 1 through 8.
for i in range(1, 9):
    # Patient sheet: skip the first row, promote the following row to column
    # labels, then skip that label row too.
    df = pd.read_excel(
        'Data/ALL_PMS_Patient_Biolog_data_NORMALIZED.xlsx',
        sheet_name=f'PM-M{i}',
        engine='openpyxl',
    )
    df = df.iloc[1:]
    df.columns = df.iloc[0]
    df = df.iloc[1:]

    # The 'CMS#' column is kept aside in dfNames; it and 'well' are then
    # removed so that only the remaining columns reach the array.
    dfNames = df['CMS#']
    df = df.drop(columns=['well', 'CMS#'])

    # Control sheet: skip the first four rows and keep only the columns at
    # positions 59..108 (everything before 59 and from 109 on is removed).
    df_control = pd.read_excel(
        'Data/ABS_Normalized Control_PMS data.xlsx',
        sheet_name=f'PM-M{i}_Control',
        engine='openpyxl',
    )
    df_control = df_control.iloc[4:]
    cols = df_control.columns.tolist()
    to_remove = cols[:59] + cols[109:]
    keepMask = ~df_control.columns.isin(to_remove)
    df_control = df_control.loc[:, keepMask]

    # Store this sheet's values as NumPy arrays.
    arrayPatient = df.to_numpy()
    arrayControl = df_control.to_numpy()
    listPatient.append(arrayPatient)
    listControl.append(arrayControl)

In [None]:
# Stack the per-sheet arrays into a single array per group.
arrayControl_3D = np.array(listControl)
arrayPatient_3D = np.array(listPatient)

# Collapse the sheet axis so each group becomes a 2-D matrix:
# 50 columns for control, 48 columns for patient.
arrayControl = np.reshape(arrayControl_3D, (-1, 50))
arrayPatient = np.reshape(arrayPatient_3D, (-1, 48))

print(arrayControl.shape)
print(arrayPatient.shape)

In [None]:
medianDifference = []
meanDifference = []

# Rows removed from both matrices before the comparison.
# NOTE(review): the reason these four rows are excluded is not recorded in the notebook - confirm.
rowsToExclude = [412, 413, 414, 415]

# Rebuild both matrices from the 3-D stacks instead of deleting from the
# previous result, so re-running this cell does not strip four more rows.
arrayControl = np.delete(arrayControl_3D.reshape(-1, 50), rowsToExclude, 0)
arrayPatient = np.delete(arrayPatient_3D.reshape(-1, 48), rowsToExclude, 0)

# The row-wise comparison below only makes sense when both matrices have the same rows.
assert arrayControl.shape[0] == arrayPatient.shape[0], "control and patient row counts differ"

# For every row (well), record control-minus-patient for both the median and the mean.
# Iterating over the rows themselves avoids hard-coding the row count.
for controlRow, patientRow in zip(arrayControl, arrayPatient):
    medianDifference.append(np.median(controlRow) - np.median(patientRow))
    meanDifference.append(np.mean(controlRow) - np.mean(patientRow))

# Converts the lists to NumPy arrays
Mean = np.array(meanDifference)
Median = np.array(medianDifference)

# Typical magnitude of the per-row differences.
print(np.median(abs(Mean)))
print(np.median(abs(Median)))

In [None]:
# Put the two groups side by side: the first 50 columns come from the
# control matrix and the last 48 columns from the patient matrix.
x_array = np.concatenate((arrayControl, arrayPatient), axis=1)

# Remove every row whose control-minus-patient median difference has an
# absolute value below 0.5.
# NOTE(review): Median is computed over all columns, including samples that
# are later held out for testing - confirm this is intended.
toDrop = np.nonzero(np.abs(Median) < 0.5)
x_array = np.delete(x_array, toDrop, 0)

# Transpose so that each row is one sample and each column one retained row of x_array.
x_array_transpose = x_array.T
print(x_array_transpose.shape)

# Matching labels in the same order: 0 for the 50 control samples, 1 for the 48 patient samples.
y_array = np.concatenate((np.zeros(50), np.ones(48)))
print(y_array.shape)

In [None]:
listAccuracyLiblinearScore = []
listLogLossLiblinearScore = []
listRecallLiblinearScore = []
listPrecisionLiblinearScore = []
listROCAUCLiblinearScore = []
listConfusionMatrices = []

# Fits a liblinear logistic regression on 100 random train/test splits and
# records the test-set metrics of every run.
for i in range(0, 100):
    # A single shuffled index list reorders x and y together, so every
    # sample stays paired with its label.
    order = list(range(len(y_array)))
    random.shuffle(order)
    x_shuffled = x_array_transpose[order]
    y_shuffled = y_array[order]

    # First 80 shuffled samples train the model; all remaining samples are the test set.
    x_train, x_test = x_shuffled[:80], x_shuffled[80:]
    y_train, y_test = y_shuffled[:80], y_shuffled[80:]

    # Fitting Logistic Regression to the Training set
    lr = LogisticRegression(solver="liblinear", max_iter=10000)
    lr.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = lr.predict(x_test)
    y_pred_proba = lr.predict_proba(x_test)

    listAccuracyLiblinearScore.append(metrics.accuracy_score(y_test, y_pred))
    listLogLossLiblinearScore.append(metrics.log_loss(y_test, y_pred_proba))
    listRecallLiblinearScore.append(metrics.recall_score(y_test, y_pred))
    listPrecisionLiblinearScore.append(metrics.precision_score(y_test, y_pred))
    # ROC AUC ranks samples by score, so it is computed from the predicted
    # probability of the positive class (label 1) rather than from the
    # thresholded 0/1 predictions.
    listROCAUCLiblinearScore.append(metrics.roc_auc_score(y_test, y_pred_proba[:, 1]))
    listConfusionMatrices.append(metrics.confusion_matrix(y_test, y_pred))

print("Accuracy: " + str(np.mean(listAccuracyLiblinearScore)))
print("Log Loss: Mean: " + str(np.mean(listLogLossLiblinearScore)) + "  Median: " + str(np.median(listLogLossLiblinearScore)))
print("Recall: " + str(np.mean(listRecallLiblinearScore)))
print("Precision: " + str(np.mean(listPrecisionLiblinearScore)))
print("ROC-AUC: " + str(np.mean(listROCAUCLiblinearScore)))

In [None]:
# Per-run accuracy of the liblinear logistic regression with its mean drawn as a horizontal line.
plt.title("Accuracy across all logistic regressions")
plt.xlabel("Regression #")
plt.ylabel("Accuracy Score")
plt.plot(listAccuracyLiblinearScore, 'ro', scalex=True)

# The mean line spans exactly as many runs as were recorded.
l = [np.mean(listAccuracyLiblinearScore)] * len(listAccuracyLiblinearScore)
plt.plot(np.arange(len(l)), l, color="blue", label="mean")

# Without a legend the "mean" label on the line is never displayed.
plt.legend(loc="lower right")
plt.ylim(0, 1)
plt.show()

In [None]:
# Per-run log loss of the liblinear logistic regression, one point per run.
plt.plot(listLogLossLiblinearScore, 'ro')

# The y-axis runs from 0 up to the largest log loss that was observed.
worstLogLoss = np.max(listLogLossLiblinearScore)
plt.ylim(0, worstLogLoss)

plt.ylabel("Log Loss Score")
plt.xlabel("Regression #")
plt.title("Log Loss across all logistic regressions")
plt.show()

In [None]:
# Per-run recall of the liblinear logistic regression, one point per run.
plt.plot(listRecallLiblinearScore, 'ro')

# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.ylabel("Recall Score")
plt.xlabel("Regression #")
plt.title("Recall Score across all logistic regressions")
plt.show()

In [None]:
# Per-run precision of the liblinear logistic regression, one point per run.
plt.plot(listPrecisionLiblinearScore, 'ro')

# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.ylabel("Precision Score")
plt.xlabel("Regression #")
plt.title("Precision across all logistic regressions")
plt.show()

In [None]:
# Per-run ROC AUC of the liblinear logistic regression, one point per run.
plt.title("ROCAUC across all logistic regressions")
plt.xlabel("Regression #")
plt.ylabel("ROCAUC Score")
# The series plotted must be the ROC AUC list so that it matches the title and axis label.
plt.plot(listROCAUCLiblinearScore, 'ro')
plt.ylim(0, 1)
plt.show()

In [None]:
listAccuracySagaScore = []
listLogLossSagaScore = []
listRecallSagaScore = []
listPrecisionSagaScore = []
listROCAUCSagaScore = []

# Fits a saga logistic regression on 100 random train/test splits and
# records the test-set metrics of every run.
for i in range(0, 100):
    # A single shuffled index list reorders x and y together, so every
    # sample stays paired with its label.
    order = list(range(len(y_array)))
    random.shuffle(order)
    x_shuffled = x_array_transpose[order]
    y_shuffled = y_array[order]

    # First 80 shuffled samples train the model; all remaining samples are the test set.
    x_train, x_test = x_shuffled[:80], x_shuffled[80:]
    y_train, y_test = y_shuffled[:80], y_shuffled[80:]

    # Fitting Logistic Regression to the Training set
    lr = LogisticRegression(solver="saga", max_iter=10000)
    lr.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = lr.predict(x_test)
    y_pred_proba = lr.predict_proba(x_test)

    listAccuracySagaScore.append(metrics.accuracy_score(y_test, y_pred))
    listLogLossSagaScore.append(metrics.log_loss(y_test, y_pred_proba))
    listRecallSagaScore.append(metrics.recall_score(y_test, y_pred))
    listPrecisionSagaScore.append(metrics.precision_score(y_test, y_pred))
    # ROC AUC ranks samples by score, so it is computed from the predicted
    # probability of the positive class (label 1) rather than from the
    # thresholded 0/1 predictions.
    listROCAUCSagaScore.append(metrics.roc_auc_score(y_test, y_pred_proba[:, 1]))

print("Accuracy: " + str(np.mean(listAccuracySagaScore)))
print("Log Loss: Mean: " + str(np.mean(listLogLossSagaScore)) + "  Median: " + str(np.median(listLogLossSagaScore)))
print("Recall: " + str(np.mean(listRecallSagaScore)))
print("Precision: " + str(np.mean(listPrecisionSagaScore)))
print("ROC-AUC: " + str(np.mean(listROCAUCSagaScore)))

In [None]:
# Per-run accuracy of the two logistic regression solvers on one figure.
for runScores, markerStyle, seriesLabel in (
    (listAccuracySagaScore, 'ko', "Saga"),
    (listAccuracyLiblinearScore, 'go', "Liblinear"),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

plt.title("Accuracy across all logistic regressions")
plt.xlabel("Regression #")
plt.ylabel("Accuracy Score")
plt.legend(loc="lower right")
# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.show()

In [None]:
# Per-run log loss of the saga and liblinear logistic regressions, with a
# horizontal line at each solver's mean log loss.
plt.title("Log Loss across all logistic regressions")
plt.xlabel("Regression #")
plt.ylabel("Log Loss Score")
plt.plot(listLogLossSagaScore, 'ko', label="Saga")
# Both point series are log loss values, matching the title, axis label and mean lines.
plt.plot(listLogLossLiblinearScore, 'go', label="Liblinear")
libMean = [np.mean(listLogLossLiblinearScore)] * 100
plt.plot(np.arange(0, 100), libMean, color="green", label="Liblinear Mean")
sagaMean = [np.mean(listLogLossSagaScore)] * 100
plt.plot(np.arange(0, 100), sagaMean, color="black", label="Saga Mean")
plt.legend(loc="lower right")
# Log loss is not capped at 1: keep the 0..1 view by default but widen it
# whenever a run exceeds 1, so that no point is cut off.
plt.ylim(0, max(1, np.max(listLogLossSagaScore), np.max(listLogLossLiblinearScore)))
plt.show()

In [None]:
# Per-run recall of the two logistic regression solvers on one figure.
for runScores, markerStyle, seriesLabel in (
    (listRecallSagaScore, 'ko', "Saga"),
    (listRecallLiblinearScore, 'go', "Liblinear"),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

plt.title("Recall across all logistic regressions")
plt.xlabel("Regression #")
plt.ylabel("Recall Score")
# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Per-run precision of the two logistic regression solvers on one figure.
for runScores, markerStyle, seriesLabel in (
    (listPrecisionSagaScore, 'ko', "Saga"),
    (listPrecisionLiblinearScore, 'go', "Liblinear"),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

plt.title("Precision across all logistic regressions")
plt.xlabel("Regression #")
plt.ylabel("Precision Score")
# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Per-run ROC AUC of the two logistic regression solvers on one figure.
for runScores, markerStyle, seriesLabel in (
    (listROCAUCSagaScore, 'ko', "Saga"),
    (listROCAUCLiblinearScore, 'go', "Liblinear"),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

plt.title("ROCAUC across all logistic regressions")
plt.xlabel("Regression #")
plt.ylabel("ROCAUC Score")
# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
listAccuracyDecTreeScore = []
listRecallDecTreeScore = []
listPrecisionDecTreeScore = []
listROCAUCDecTreeScore = []

# Fits a decision tree classifier on 100 random train/test splits and
# records the test-set metrics of every run.
for i in range(0, 100):
    # A single shuffled index list reorders x and y together, so every
    # sample stays paired with its label.
    order = list(range(len(y_array)))
    random.shuffle(order)
    x_shuffled = x_array_transpose[order]
    y_shuffled = y_array[order]

    # First 80 shuffled samples train the model; all remaining samples are the test set.
    x_train, x_test = x_shuffled[:80], x_shuffled[80:]
    y_train, y_test = y_shuffled[:80], y_shuffled[80:]

    # Fitting the decision tree to the Training set
    clf = DecisionTreeClassifier()
    clf.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = clf.predict(x_test)
    y_pred_proba = clf.predict_proba(x_test)

    listAccuracyDecTreeScore.append(metrics.accuracy_score(y_test, y_pred))
    listRecallDecTreeScore.append(metrics.recall_score(y_test, y_pred))
    listPrecisionDecTreeScore.append(metrics.precision_score(y_test, y_pred))
    # ROC AUC ranks samples by score, so it is computed from the predicted
    # probability of the positive class (label 1) rather than from the
    # thresholded 0/1 predictions.
    listROCAUCDecTreeScore.append(metrics.roc_auc_score(y_test, y_pred_proba[:, 1]))

print("Accuracy: " + str(np.mean(listAccuracyDecTreeScore)))
print("Recall: " + str(np.mean(listRecallDecTreeScore)))
print("Precision: " + str(np.mean(listPrecisionDecTreeScore)))
print("ROC-AUC: " + str(np.mean(listROCAUCDecTreeScore)))

# Draws only the tree fitted in the final iteration of the loop above.
tree.plot_tree(clf)

In [None]:
# Per-run accuracy of the decision tree against the saga logistic regression.
for runScores, markerStyle, seriesLabel in (
    (listAccuracyDecTreeScore, 'ro', 'Decision Tree'),
    (listAccuracySagaScore, 'ko', 'saga'),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

plt.title("Accuracy across all Saga/Decision Tree regressions")
plt.xlabel("Regression #")
plt.ylabel("Accuracy Score")
# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Per-run recall of the decision tree against the saga logistic regression.
for runScores, markerStyle, seriesLabel in (
    (listRecallDecTreeScore, 'ro', 'Decision Tree'),
    (listRecallSagaScore, 'ko', 'saga'),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

plt.title("Recall Score across all Saga/Decision Tree regressions")
plt.xlabel("Regression #")
plt.ylabel("Recall Score")
# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Per-run precision of the decision tree against the saga logistic regression.
for runScores, markerStyle, seriesLabel in (
    (listPrecisionDecTreeScore, 'ro', 'Decision Tree'),
    (listPrecisionSagaScore, 'ko', 'saga'),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

plt.title("Precision across all Saga/Decision Tree regressions")
plt.xlabel("Regression #")
plt.ylabel("Precision Score")
# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Per-run ROC AUC of the decision tree against the saga logistic regression.
for runScores, markerStyle, seriesLabel in (
    (listROCAUCDecTreeScore, 'ro', 'Decision Tree'),
    (listROCAUCSagaScore, 'ko', 'saga'),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

plt.title("ROCAUC across all Saga/Decision Tree regressions")
plt.xlabel("Regression #")
plt.ylabel("ROCAUC Score")
# Fixed 0..1 axis so the figure is directly comparable with the other score plots.
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Accuracy of all three models: one point per run plus a horizontal line at each model's mean.

# Mean of each model's scores, repeated 100 times to draw a flat line.
libMean = [np.mean(listAccuracyLiblinearScore)] * 100
sagaMean = [np.mean(listAccuracySagaScore)] * 100
decTreeMean = [np.mean(listAccuracyDecTreeScore)] * 100

# Per-run points.
for runScores, markerStyle, seriesLabel in (
    (listAccuracyDecTreeScore, 'ro', 'Decision Tree'),
    (listAccuracySagaScore, 'ko', 'Saga'),
    (listAccuracyLiblinearScore, 'go', "Liblinear"),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

# Mean lines, coloured to match their model's points.
runIndex = np.arange(100)
for meanLine, lineColor, lineLabel in (
    (libMean, "green", "Liblinear Mean"),
    (sagaMean, "black", "Saga Mean"),
    (decTreeMean, "red", "Decision Tree Mean"),
):
    plt.plot(runIndex, meanLine, color=lineColor, label=lineLabel)

plt.title("Accuracy across all regressions")
plt.xlabel("Regression #")
plt.ylabel("Accuracy Score")
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Recall of all three models: one point per run plus a horizontal line at each model's mean.

# Mean of each model's scores, repeated 100 times to draw a flat line.
libMean = [np.mean(listRecallLiblinearScore)] * 100
sagaMean = [np.mean(listRecallSagaScore)] * 100
decTreeMean = [np.mean(listRecallDecTreeScore)] * 100

# Per-run points.
for runScores, markerStyle, seriesLabel in (
    (listRecallDecTreeScore, 'ro', 'Decision Tree'),
    (listRecallSagaScore, 'ko', 'Saga'),
    (listRecallLiblinearScore, 'go', "Liblinear"),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

# Mean lines, coloured to match their model's points.
runIndex = np.arange(100)
for meanLine, lineColor, lineLabel in (
    (libMean, "green", "Liblinear Mean"),
    (sagaMean, "black", "Saga Mean"),
    (decTreeMean, "red", "Decision Tree Mean"),
):
    plt.plot(runIndex, meanLine, color=lineColor, label=lineLabel)

plt.title("Recall Score across all regressions")
plt.xlabel("Regression #")
plt.ylabel("Recall Score")
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Precision of all three models: one point per run plus a horizontal line at each model's mean.

# Mean of each model's scores, repeated 100 times to draw a flat line.
libMean = [np.mean(listPrecisionLiblinearScore)] * 100
sagaMean = [np.mean(listPrecisionSagaScore)] * 100
decTreeMean = [np.mean(listPrecisionDecTreeScore)] * 100

# Per-run points.
for runScores, markerStyle, seriesLabel in (
    (listPrecisionDecTreeScore, 'ro', 'Decision Tree'),
    (listPrecisionSagaScore, 'ko', 'Saga'),
    (listPrecisionLiblinearScore, 'go', "Liblinear"),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

# Mean lines, coloured to match their model's points.
runIndex = np.arange(100)
for meanLine, lineColor, lineLabel in (
    (libMean, "green", "Liblinear Mean"),
    (sagaMean, "black", "Saga Mean"),
    (decTreeMean, "red", "Decision Tree Mean"),
):
    plt.plot(runIndex, meanLine, color=lineColor, label=lineLabel)

plt.title("Precision across all regressions")
plt.xlabel("Regression #")
plt.ylabel("Precision Score")
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC AUC of all three models: one point per run plus a horizontal line at each model's mean.

# Mean of each model's scores, repeated 100 times to draw a flat line.
libMean = [np.mean(listROCAUCLiblinearScore)] * 100
sagaMean = [np.mean(listROCAUCSagaScore)] * 100
decTreeMean = [np.mean(listROCAUCDecTreeScore)] * 100

# Per-run points.
for runScores, markerStyle, seriesLabel in (
    (listROCAUCDecTreeScore, 'ro', 'Decision Tree'),
    (listROCAUCSagaScore, 'ko', 'Saga'),
    (listROCAUCLiblinearScore, 'go', "Liblinear"),
):
    plt.plot(runScores, markerStyle, label=seriesLabel)

# Mean lines, coloured to match their model's points.
runIndex = np.arange(100)
for meanLine, lineColor, lineLabel in (
    (libMean, "green", "Liblinear Mean"),
    (sagaMean, "black", "Saga Mean"),
    (decTreeMean, "red", "Decision Tree Mean"),
):
    plt.plot(runIndex, meanLine, color=lineColor, label=lineLabel)

plt.title("ROCAUC across all regressions")
plt.xlabel("Regression #")
plt.ylabel("ROCAUC Score")
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()