# Analysis of diagnostic tests

 Dummy data is used to represent cancer detection dogs' data. It in no way reflects the true data collected.
 
 The input to this notebook is a table of the dogs' operant responses to various scent samples.

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix

%matplotlib inline

In [None]:
# User inputs: path of the CSV file holding the dogs' responses (read by pd.read_csv below)
file = "dog_behaviour_database_dummy.csv"

In [None]:
# Load the raw table, keep only the rows not flagged as info rows,
# and discard the columns that are not needed in this notebook
data_input = pd.read_csv(file)
is_not_info_row = data_input['Is Info Row?'] == False
data = data_input[is_not_info_row]
cols = list(range(4, 19))  # positional indices (4..18) of the columns to discard
data = data.drop(data.columns[cols], axis=1)
data['Run'] = data['Run'].astype(int)
data['Pass'] = data['Pass'].astype(int)
dogs = data['Dog name'].unique()

# Quick overview of what was loaded
print("The dogs' names are", dogs, '\n')
print('Example data rows:', '\n')
print(data.head())
print('\nDescription of the data:')
print(data.describe())

In [None]:
# Data slice: preview the rows recorded for the first dog on pass number 1
dog = data['Dog name'] == dogs[0]
pass_no = data['Pass'] == 1
first_dog_first_pass = data[dog & pass_no]
print(first_dog_first_pass.head())


In [None]:
# Reshape the table so that there is one row per presented sample.
# Each input row carries three samples (positions 1-3) in numbered columns;
# for every position, drop the other positions' columns and strip the number.
sample_fields = ['Concentration', 'DogCorrect', 'Result']
positions = [1, 2, 3]
frames_by_position = []
for position in positions:
    other_columns = [field + str(other)
                     for field in sample_fields
                     for other in positions if other != position]
    generic_names = {field + str(position): field for field in sample_fields}
    frame = data.drop(other_columns, axis=1)
    # index=str turns the row labels into strings, exactly as before
    frame = frame.rename(index=str, columns=generic_names)
    frame['Position'] = position
    frames_by_position.append(frame)
df_pos1, df_pos2, df_pos3 = frames_by_position

# Stack the three positions on top of each other
df_samples = pd.concat(frames_by_position)

# Add true class, y_true, and predicted class, y_pred.
# Class 0 is a negative scent sample, class 1 is a positive scent sample.
df_samples['y_true'] = [int(x > 0) for x in df_samples['Concentration']]
df_samples['y_pred'] = [int(x in ('TP', 'FP')) for x in df_samples['Result']]
print(df_samples.head())
df_samples.tail()

In [None]:
# Show each dog's results: one bar chart of TP/TN/FP/FN counts per dog

# squeeze=False always returns a 2-D array of axes, so indexing below also works
# when there is only one dog (plt.subplots(1, 1) would otherwise return a bare Axes).
fig, ax = plt.subplots(1, len(dogs), squeeze=False)
ax = ax[0]  # the single row of axes: one entry per dog

# Fixed y-axis limit shared by all panels, chosen from the number of sample rows
if df_samples.shape[0] < 100:
    upper = 20
else:
    upper = 1200

order = ['TP', 'TN', 'FP', 'FN']
colors = ['lime', 'palegreen', 'lightsalmon', 'red']
for i, d in enumerate(dogs):
    dog = df_samples['Dog name'] == d
    # reindex(order) fixes the bar order; outcomes a dog never produced become NaN (no bar)
    counts = df_samples[dog]['Result'].value_counts().reindex(order)
    # `kind` has to be a keyword: Series.plot() rejects positional arguments in pandas >= 1.0
    axes = counts.plot(kind='bar', ax=ax[i], color=colors)
    axes.set_title(d)
    axes.set_ylim(0, upper)


In [None]:
# Create bar chart to compare dog performance

# Per dog and result type: number of samples plus the highest and lowest concentration.
# 'max' and 'min' are given by name; passing the Python builtins is deprecated in
# pandas >= 2.1 (FutureWarning). The resulting column labels are the same.
pivot = pd.pivot_table(df_samples, index=['Dog name', 'Result'], values=['Concentration'], aggfunc=[len, 'max', 'min'])
print(pivot)

# Stacked bar per dog with one segment per result type.
# NOTE(review): the colour list presumably lines up with the unstacked columns
# in alphabetical order (FN, FP, TN, TP) - confirm against the rendered legend.
colors = ['red', 'lightsalmon', 'palegreen', 'lime' ]
df_samples.groupby('Dog name')['Result'] \
    .value_counts() \
    .sort_index(ascending=False) \
    .unstack(level=1) \
    .plot.bar(stacked=True, color=colors)

In [None]:
# Accuracy: per-dog view of the 'DogCorrect' column
# (summary statistics first, then the first rows of every dog's group)
columns_of_interest = ['Dog name', 'DogCorrect']
temp = df_samples.groupby('Dog name')[columns_of_interest]
summary = temp.describe()
print(summary)
first_rows = temp.head()
print(first_rows)

In [None]:
# Results per dog and results as they relate to concentration.
# The dedicated groupby methods (size/min/max) replace aggregate(len/min/max):
# passing the builtins min/max is deprecated in pandas >= 2.1, and size() counts
# the rows of each group exactly like aggregate(len) did.
print('\nCorrectness count:')
print(df_samples.groupby(['Dog name', 'DogCorrect'])['DogCorrect'].size().unstack())

# The same (dog, result) grouping is used for the remaining three tables
by_dog_and_result = df_samples.groupby(['Dog name', 'Result'])
print('\nResults count:')
print(by_dog_and_result['Result'].size().unstack())
print('\nMinimum concentration:')
print(by_dog_and_result['Concentration'].min().unstack())
print('\nMaximum concentration:')
print(by_dog_and_result['Concentration'].max().unstack())




In [None]:
# High level description of the data, with every column converted to a categorical
df_cat = df_samples.astype('category')

# A notebook cell only displays its last expression, so the dtypes have to be
# printed explicitly; as a bare expression they were silently discarded.
print(df_cat.dtypes)
df_cat.describe()

In [None]:
# Calculate and display per-dog diagnostic metrics

# Count the outcomes (TP/TN/FP/FN) per dog. 'Run' only serves as a column to count;
# margins=True appends an 'All' row and an 'All' column holding the totals.
print('\nResults count:')
results = df_samples.pivot_table('Run', index='Dog name', columns='Result', aggfunc=len, fill_value=0, margins=True)
# Make sure every outcome has a column even if it never occurs in the data,
# otherwise the formulas below fail on the missing column.
for outcome in ['TP', 'TN', 'FP', 'FN']:
    if outcome not in results.columns:
        results[outcome] = 0
print(results)

# Share of every outcome per dog, plus the accuracy itself:
# accuracy = (TP + TN) / All, i.e. the share of correctly classified samples.
accuracy = results.div(results['All'], axis=0)
accuracy['Accuracy'] = accuracy['TP'] + accuracy['TN']
print('\nAccuracy:')
print(accuracy)

# Calculate ratios. FNR, false negative ratio, etc.
# Rates are taken relative to the TRUE class of the sample:
#   positive samples = TP + FN, negative samples = TN + FP.
# (Dividing by TP + FP or TN + FN gives the predictive values, not these rates.)
results['TPR'] = results['TP'] / (results['TP'] + results['FN'])
results['FPR'] = results['FP'] / (results['FP'] + results['TN'])
results['TNR'] = results['TN'] / (results['TN'] + results['FP'])
results['FNR'] = results['FN'] / (results['FN'] + results['TP'])
print('\nRatios:')
print('Sensitivity (aka recall) = TPR')
print('Specificity = TNR')
print(results[['TPR', 'FPR', 'TNR', 'FNR']])

# Calculate likelihood ratios and diagnostic odds ratio
# LR+ = TPR / FPR = sensitivity / (1 - specificity)
# LR- = (1 - TPR) / TNR = (1 - sensitivity) / specificity
results['LR+'] = results['TPR'] / results['FPR']
results['LR-'] = (1 - results['TPR']) / results['TNR']
results['DOR'] = results['LR+'] / results['LR-']
print('\nLikelihood ratios:')
print('Tharwat, A. (2018). Classification assessment methods. Applied Computing and Informatics.')
print('LR+ measures how much the odds of the disease increases when the diagnositic test is positive')
print('LR- measures how much the odds of the disease decreases when the diagnositic test is negative')
print('DOR, diagnostic odds ratio, an estimate of the discriminative ability of the diagnostic test')
print('DOR can be used to compare two diagnostic tests')
print('DOR = LR+/LR-')
print(results[['LR+', 'LR-', 'DOR']])

# Calculate Youden's index: sensitivity + specificity - 1
results['YI'] = results['TPR'] + results['TNR'] - 1
print('\nYouden\'s Index, YI:')
print('Tharwat, A. (2018). Classification assessment methods. Applied Computing and Informatics.')
print('aka Bookmaker Informedness, BM')
print('YI range is 0-1, with 1 representing a perfect diagnostic test')
print(results[['YI']])

In [None]:
# Confusion matrix over all samples: raw counts first, then every row
# (true class) scaled so that it sums to 1
cm = confusion_matrix(df_samples['y_true'], df_samples['y_pred'])
print('Confusion matrix:')
print(cm)

print('\nNormalised confusion matrix:')
row_totals = cm.sum(axis=1)[:, np.newaxis]  # samples per true class, as a column vector
cm = cm.astype('float') / row_totals
print(cm)

def plot_confusion_matrix(cm, title='Normalised confusion matrix', classes=('Negative', 'Positive'), cmap=None):
    """Plot a (normalised) confusion matrix as an annotated heat map.

    Draws on the current matplotlib figure via the pyplot interface.

    Adapted from the scikit-learn 'Confusion Matrix' example:
    Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py

    Parameters
    ----------
    cm : 2-D numpy array
        The matrix to draw; rows are true labels, columns are predicted labels.
        Every cell is annotated with its value formatted to two decimals.
    title : str
        Title shown above the plot.
    classes : sequence of str
        Tick labels for both axes, in matrix order. Defaults to the two classes
        used in this notebook.
    cmap : matplotlib colormap, optional
        Colormap of the heat map; plt.cm.Blues when omitted.
    """
    if cmap is None:
        cmap = plt.cm.Blues
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f'
    # Cells darker than half of the maximum get white text so the number stays readable
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
# Draw the normalised matrix computed above (the title is spelled out; it equals the default)
plot_confusion_matrix(cm, title='Normalised confusion matrix')

In [None]:
# Confusion matrix for one dog
dog_name = dogs[0]
dog = df_samples['Dog name'] == dog_name
print('Data for', dog_name)
# labels=[0, 1] forces a 2x2 matrix (negative, positive) even if this dog's samples
# lack one of the classes, so it always matches the two tick labels of the plot.
cm = confusion_matrix(df_samples[dog].y_true, df_samples[dog].y_pred, labels=[0, 1])
print('\nNormalised confusion matrix for', dog_name, ' :')
# Normalise every row (true class) to sum to 1; a class without any true samples
# keeps an all-zero row instead of producing 0/0 = NaN.
row_totals = cm.sum(axis=1)[:, np.newaxis]
row_totals[row_totals == 0] = 1
cm = cm.astype('float') / row_totals
print(cm)
title = 'Normalised confusion matrix for '+ dog_name
plot_confusion_matrix(cm, title=title)

In [None]:
# Confusion matrix excluding positive samples below a certain concentration level.
# All negative samples are kept; positive samples are kept only at or above the threshold.
threshold = 1/1e5
txt = '1/100,000'
above = df_samples['Concentration'] >= threshold
negative = df_samples.y_true == 0
cond = above | negative
# labels=[0, 1] forces a 2x2 matrix (negative, positive) even if no positive sample
# passes the threshold, so it always matches the two tick labels of the plot.
cm = confusion_matrix(df_samples[cond].y_true, df_samples[cond].y_pred, labels=[0, 1])
# Normalise every row (true class) to sum to 1; a class without any true samples
# keeps an all-zero row instead of producing 0/0 = NaN.
row_totals = cm.sum(axis=1)[:, np.newaxis]
row_totals[row_totals == 0] = 1
cm = cm.astype('float') / row_totals
title = 'Normalised confusion matrix for concentrations above or equal to '+ str(threshold)+' (i.e. '+txt+')'
plot_confusion_matrix(cm, title=title)

In [None]:
# Confusion matrix excluding positive samples below a certain concentration level.
# All negative samples are kept; positive samples are kept only at or above the threshold.
threshold = 1/25e6
txt = '1/25M'
above = df_samples['Concentration'] >= threshold
negative = df_samples.y_true == 0
cond = above | negative
# labels=[0, 1] forces a 2x2 matrix (negative, positive) even if no positive sample
# passes the threshold, so it always matches the two tick labels of the plot.
cm = confusion_matrix(df_samples[cond].y_true, df_samples[cond].y_pred, labels=[0, 1])
# Normalise every row (true class) to sum to 1; a class without any true samples
# keeps an all-zero row instead of producing 0/0 = NaN.
row_totals = cm.sum(axis=1)[:, np.newaxis]
row_totals[row_totals == 0] = 1
cm = cm.astype('float') / row_totals
title = 'Normalised confusion matrix for concentrations above or equal to '+ str(threshold)+' (i.e. '+txt+')'
plot_confusion_matrix(cm, title=title)

In [None]:
# Confusion matrix for only the last pass in any run
cond = df_samples['IsLastPass'] == True
print(df_samples[cond].describe())
# labels=[0, 1] forces a 2x2 matrix (negative, positive) even if the last passes
# lack one of the classes, so it always matches the two tick labels of the plot.
cm = confusion_matrix(df_samples[cond].y_true, df_samples[cond].y_pred, labels=[0, 1])
# Normalise every row (true class) to sum to 1; a class without any true samples
# keeps an all-zero row instead of producing 0/0 = NaN.
row_totals = cm.sum(axis=1)[:, np.newaxis]
row_totals[row_totals == 0] = 1
cm = cm.astype('float') / row_totals
title = 'Normalised confusion matrix for only the last pass in any run'
plot_confusion_matrix(cm, title=title)