# Understanding predictions for QUMIA project

In [None]:
## import libraries
import os       # using operating system dependent functionality (folders)
import glob
import shutil 
import itertools
import tempfile 
import subprocess

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets

from sklearn import metrics
from sklearn.metrics import confusion_matrix

import datetime
from shutil import copytree

from IPython.display import Image

import imageio
import skimage

from IPython.display import Image

In [None]:
## import data
# Read the exported prediction table of each split; all three CSV files are
# expected one directory above the notebook's working directory.
data_test, data_train, data_validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../df_test_predictions.csv',
        '../df_train_predictions.csv',
        '../df_validation_predictions.csv',
    )
)

In [None]:
## optional: peek at the first three rows of the validation predictions
data_validation.head(3)

In [None]:
## concatenate data
# Stack the three splits (test, train, validation - in that order) into one
# table. Each CSV was read with its own default 0-based index, so without
# ignore_index=True the combined frame would carry the same index label up to
# three times; a fresh 0..n-1 index keeps every row uniquely addressable.
data = pd.concat(
    [data_test, data_train, data_validation],
    ignore_index=True,
)
data.tail(3)

In [None]:
# List the column names of the combined table
data.columns

In [None]:
# Drop the columns listed below and keep one row per remaining unique
# combination. 'diagnosis' is deliberately NOT dropped here, which is the only
# difference from data_per_patient_for_age.
# ('bmi' used to appear twice in this list; a single entry is sufficient.)
data_per_patient_for_age2 = data.drop(
    columns=[
        'exam_id', 'Weight', 'Length',
        'muscle', 'side', 'z_score', 'h_score', 'image_file', 'bmi',
        'has_markers', 'li_x', 'li_y', 're_x', 're_y', 'id_x', 'id_y',
        'prediction', 'rounded_prediction', 'label',
    ],
).drop_duplicates()

In [None]:
# Remove the columns listed below (including 'diagnosis'), then collapse the
# remaining rows to unique combinations.
data_per_patient_for_age = data.drop(
    columns=[
        'exam_id', 'Weight', 'Length',
        'diagnosis', 'muscle', 'side', 'z_score', 'h_score', 'image_file',
        'has_markers', 'li_x', 'li_y', 're_x', 're_y', 'id_x', 'id_y', 'bmi',
        'prediction', 'rounded_prediction', 'label',
    ],
)
data_per_patient_for_age = data_per_patient_for_age.drop_duplicates()

In [None]:
#print(type(data_per_patient_for_age))

In [None]:
# Show the de-duplicated table that still contains the 'diagnosis' column
data_per_patient_for_age2

In [None]:
# Show the de-duplicated table without 'diagnosis'; the trailing commented-out
# chains are earlier plotting experiments that are currently disabled.
data_per_patient_for_age#.groupby(['Sex'],['Age_exam']).size().unstack().plot(kind='bar', stacked=True, color=color_menu )
#data.groupby([bars.value, colors.value]).size().unstack().plot(kind='bar', stacked=True, color=color_menu )

# First a bit about the data
## Underlying reality examination


In [None]:
# Custom two-colour palette for the seaborn plots.
# It is stored as `custom_palette` rather than `colors` because this notebook
# later binds `colors` to a Dropdown widget; sharing the name meant that
# re-running this cell replaced the widget with a list and broke every cell
# that reads `colors.value`.
custom_palette = ["#d99c9a", "#4374B3"]
sns.set_palette(sns.color_palette(custom_palette))

# Bar plot of Age_exam per label, one panel (and one hue) per Sex.
sns.catplot(
    data=data, x="label", y="Age_exam", col="Sex", hue="Sex",
    kind="bar", height=4, aspect=.6,
)
# Write the current figure to disk next to the notebook.
plt.savefig("data_by_sex_and_label.png")

We can also look at how the data split across different categories, e.g. per muscle, etc.

In [None]:
# stack my data through a widget
from ipywidgets import interactive

# Columns that may be selected for either dropdown.
possibilities = [
    'Age_exam', 'Sex', 'diagnosis', 'muscle', 'side', 'h_score', 'bmi',
    'prediction', 'rounded_prediction', 'label',
]

# `bars` and `colors` are read by the plotting cells further down
# (via bars.value / colors.value).
bars = widgets.Dropdown(
    options=possibilities, value='muscle', description='Bars', disabled=False,
)
colors = widgets.Dropdown(
    options=possibilities, value='diagnosis', description='Colors', disabled=False,
)


def update_dropdown(fru, col):
    """Display a one-line summary of the current selection.

    `fru` receives the value of the `colors` dropdown and `col` the value of
    the `bars` dropdown (see the `interactive` call below).
    """
    display(f"I will graph {fru.lower()} on {col.lower()} grouping!")


# Re-run update_dropdown whenever either dropdown changes, and show the controls.
w = interactive(update_dropdown, col=bars, fru=colors)
display(w)

In [None]:
#data.bmi.max()
color_menu = ['pink', 'steelblue', 'green', 'purple', 'red', 'orange', 'brown', 'magenta']
# Count rows for every (bars, colors) combination currently selected in the
# dropdowns, pivot the second key into columns, and draw stacked bars.
counts_by_group = data.groupby([bars.value, colors.value]).size().unstack()
counts_by_group.plot(kind='bar', stacked=True, color=color_menu)
#plt.xticks([-1,100, 200,300])

In [None]:
color_menu = ['pink', 'steelblue', 'green', 'purple', 'red', 'orange', 'brown', 'magenta']
# Same stacked-bar view as the previous cell, rebuilt from the current
# dropdown selections.
group_keys = [bars.value, colors.value]
stacked_counts = data.groupby(group_keys).size().unstack()
stacked_counts.plot(kind='bar', stacked=True, color=color_menu)
#plt.savefig("muscles_versus_label.png")
#plt.xticks([0,250])#,[0,10,20,30,40,50,60,70,80,90,100]) # for doing age per patient

## A first rough graph
about predictions overall

## Here we can choose to work with the test, validation, or training data
Watch out for the next cell: it selects the test data. To use the validation data (or any other split), the next cell must be changed.

In [None]:
# From here on `data` is a single split instead of the concatenated table.
# Take an independent copy: later cells add columns to `data` (e.g. 'reround'),
# and a plain alias would silently write those columns into data_test as well.
# Replace data_test with data_validation / data_train to analyse another split.
data = data_test.copy()

In [None]:
# Scatter of the raw model output against the true label, with the y = x
# diagonal drawn for reference.
prediction = data.prediction
label = data.label
plt.figure(figsize=(10, 10))
plt.scatter(label, prediction, c='teal', alpha=0.15)
# End points of the diagonal: the overall maximum and minimum over both
# series. Series.max()/min() run vectorised and skip NaN, whereas the builtin
# max()/min() iterate in Python and give order-dependent results with NaN.
p1 = max(prediction.max(), label.max())
p2 = min(prediction.min(), label.min())
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values/Labels', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()

In [None]:
## Let's jitter that a bit so we can see it better

In [None]:
# Prediction per label with jitter enabled so overlapping points spread out
sns.catplot(data=data,  x="label",y="prediction", color='teal', jitter=True, alpha = 0.15)

## That had too much overlap to be useful, but let's try splitting on a categorical variable anyways

In [None]:
## the below plot takes a zillion years to run, so never mind
# Create the 14x14 figure and its axes explicitly, then draw the swarm plot
# (prediction per label, coloured by bmi) onto those axes.
_swarm_fig, _swarm_ax = plt.subplots(figsize=(14, 14))
sns.swarmplot(data=data, x="label", y="prediction", hue="bmi", s=0.80, ax=_swarm_ax)

# Now let's look at specific muscles

Pick a muscle for visualization

In [None]:
# Dropdown listing every muscle present in the current data.
muscle_options = data.muscle.unique().tolist()
features = widgets.Dropdown(
    options=muscle_options,
    # Keep 'Biceps' as the preferred default, but fall back to the first
    # available muscle (or None when there are no options): a Dropdown
    # rejects an initial value that is not among its options.
    value='Biceps' if 'Biceps' in muscle_options else next(iter(muscle_options), None),
    description='Muscle',
    disabled=False,
)
features

In [None]:
# Plot prediction per label for the muscle currently selected in the dropdown.
muscle = features.value
data_filtered = data[data["muscle"] == muscle]
y_axis = "prediction"
sns.set_style("whitegrid")
# The title follows the selected muscle; it used to read 'Biceps Samples Only'
# no matter which muscle was picked.
sns.catplot(
    data=data_filtered, x='label', y=y_axis, hue="Sex", alpha=0.35,
).set(title=f'{muscle} Samples Only')


# We can also look at specific diagnoses

In [None]:
# Dropdown listing every diagnosis code present in the current data.
diagnosis_options = data.diagnosis.unique().tolist()
features_d = widgets.Dropdown(
    options=diagnosis_options,
    # Prefer diagnosis 3 as before, but fall back to the first available code
    # (or None when there are no options): a Dropdown rejects an initial value
    # that is not among its options.
    value=3 if 3 in diagnosis_options else next(iter(diagnosis_options), None),
    description='Diagnosis Number',
    disabled=False,
)
features_d

In [None]:
# Plot prediction per label for the diagnosis currently selected in the dropdown.
diagnosis = features_d.value
has_selected_diagnosis = data["diagnosis"] == diagnosis
data_filtered = data.loc[has_selected_diagnosis]
y_axis = "prediction"
sns.set_style("whitegrid")
sns.catplot(
    data=data_filtered,
    x='label',
    y=y_axis,
    hue="Sex",
    alpha=0.35,
)


## Of course, if we round our predictions we can then do a confusion matrix.
Confusion matrices
In this case we need to re-round our predictions, because we have 4 labels and 16 possible rounded predictions. Let's first add a re-rounded prediction column.

In [None]:
# Distinct values found in the h_score column
data.h_score.unique()

In [None]:
# Distinct true labels
data.label.unique()

In [None]:
# Summary statistics of the raw (unrounded) predictions
data.prediction.describe()

Now, below, we can decide how to remap values explicitly in the map function.
## Alert - this is very important
I did not create `reround` the way the program did.

In [None]:
# Start 'reround' as a straight copy of the rounded predictions.
data['reround'] = data['rounded_prediction']
print(data['reround'].unique())
# Optional explicit remapping of the values 1-16 into the bins 2/4/8/16.
# It is currently disabled, so the print below shows the same values as the
# print above.
#data['reround'] =  data['reround'].map({ 1: 2, 2:2, 3:2, 4:4, 5:4, 6:4,7:8, 8:8,9:8, 10:8, 11:8, 12:16, 13:16, 14:16, 15:16, 16:16})
print(data['reround'].unique())

In [None]:
# Distinct true labels, for comparison with the 'reround' values
data.label.unique()

In [None]:
# Distinct values of the 'reround' column
data.reround.unique()

In [None]:
#data["rounded_prediction"]

In [None]:
#len(data.rounded_prediction.unique())

In [None]:

def plot_confusion_matrix(cm, classes,
                          title='Simple confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix counts; rows are labelled 'True label' and columns
        'Predicted label'.
    classes : sequence
        Tick labels, applied in order to both axes.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    """
    cm = np.asarray(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest count get white text so the number stays
    # readable on a dark cell. An empty matrix has no maximum, so guard it.
    thresh = cm.max() / 2. if cm.size else 0.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels set above are included in it.
    plt.tight_layout()

In [None]:
# Derive the class list from the values actually present instead of assuming
# exactly the four classes 1-4: 'reround' may contain values that never occur
# as a label, in which case the matrix is larger than 4x4 and fixed tick
# names would be wrong. Passing the same sorted list as `labels` ties the
# matrix row/column order to the tick labels.
class_values = np.union1d(data.label, data.reround)
cm = metrics.confusion_matrix(data.label, data.reround, labels=class_values)
plot_confusion_matrix(cm, classes=[f"{value:g}" for value in class_values])

In [None]:
# True labels versus re-rounded predictions, and their confusion matrix
# (raw counts).
y, y_pred = data["label"], data["reround"]
cf_matrix = metrics.confusion_matrix(y, y_pred)

In [None]:
# Heat map of the confusion matrix as a share of all samples.
# Tick labels come from the values present in the label / reround columns
# (sorted, matching confusion_matrix's default ordering) rather than a fixed
# [1, 2, 3, 4], which would mislabel the axes whenever other values occur.
tick_labels = [f"{value:g}" for value in np.union1d(data["label"], data["reround"])]
sns.heatmap(
    cf_matrix / np.sum(cf_matrix),
    annot=True,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    fmt='.2%',
    cmap='Blues',
)

# Now we want to take a closer look at things that were miscategorized:

Let's pick everything that was off into lists and examine:

In [None]:

# Dropdown for the true label to inspect.
label_options = data.label.unique().tolist()
reality = widgets.Dropdown(
    options=label_options,
    # Prefer label 2 as before, but fall back to the first available label
    # (or None when there are no options): a Dropdown rejects an initial value
    # that is not among its options.
    value=2 if 2 in label_options else next(iter(label_options), None),
    description='Label/Reality',
    disabled=False,
)

# Range slider used by the next cell to filter on 'rounded_prediction'.
# NOTE: this rebinds the notebook-level name `prediction`, which an earlier
# cell uses for the prediction column.
prediction = widgets.FloatRangeSlider(
    value=[1, 2.5],
    min=0,
    max=16.0,
    step=0.1,
    description='Test:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='.1f',
)
widgets.VBox([reality, prediction])

In [None]:
# Rows whose true label equals the dropdown selection ...
new_data = data.loc[data['label'] == reality.value]

# ... narrowed to those whose rounded prediction lies inside the slider range
# (both ends inclusive).
range_low, range_high = prediction.value[0], prediction.value[1]
in_range = new_data['rounded_prediction'].between(range_low, range_high)
filtered_data = new_data[in_range]
wanted_pictures = list(filtered_data['exam_id'])
print("There are ", len(wanted_pictures) , "images in this category, you can choose them to look at")

In [None]:
# Dropdown over the exam ids selected in the previous cell.
maybe_wrong = widgets.Dropdown(
    options=wanted_pictures,
    # An empty selection has no first element; start with no value instead of
    # failing with IndexError.
    value=wanted_pictures[0] if wanted_pictures else None,
    description='Picture',
    disabled=False,
)
maybe_wrong

In [None]:
# Show the exam id currently selected in the dropdown
maybe_wrong.value

In the cell below you need to put in a source folder e.g. C:/Projects/merged, then press the enter key

In [None]:
# Ask for the root folder; a later cell joins it with the selected exam id
source_root = input("add directory (without qoutes):")

In [None]:
# Echo the folder that was entered
source_root

In [None]:
from IPython.display import display

# Folder of the selected exam. The id is converted with str() because
# os.path.join only accepts str/path components and the exam id may have been
# parsed from the CSV as a number.
source = os.path.join(source_root, str(maybe_wrong.value))
#source = os.path.join(source_root,"baby" )

# os.listdir returns entries in arbitrary order; sorting makes each slider
# position map to the same file on every run.
skimage_list = sorted(os.listdir(source))


def f(a):
    """Display the index `a` and the a-th file of the exam folder as an image.

    Returns `a` unchanged.
    """
    display(a)
    image_l = os.path.join(source, skimage_list[a])
    display(Image(filename=image_l))
    return a


# Slider over all file positions in the folder; moving it re-runs f.
w = interactive(f, a=widgets.IntSlider(min=0, max=(len(skimage_list) - 1), step=1))
w

## We can also look at mean absolute error and explained variance, 
Although I'm not sure explained variance really makes sense here...

### In general
We will look overall at explained variance and MAE.

In [None]:
# Explained variance of the raw predictions against the true labels, over the
# whole of `data`. y_test / y_pred are reused by the MAE cell that follows.
y_test, y_pred = data["label"], data["prediction"]
metrics.explained_variance_score(y_test, y_pred)

In [None]:
# Mean absolute error for the y_test / y_pred pair currently in scope
metrics.mean_absolute_error(y_test, y_pred)

### Looking at specific muscles

In [None]:
# One line per muscle: MAE and explained variance of prediction vs. label.
for muscle in data["muscle"].unique().tolist():
    data_m = data.loc[data["muscle"] == muscle]
    y_test, y_pred = data_m.label, data_m.prediction
    muscle_mae = metrics.mean_absolute_error(y_test, y_pred)
    muscle_explained = metrics.explained_variance_score(y_test, y_pred)
    print(muscle, "MAE:", muscle_mae, "Explained variance:", muscle_explained)

### Looking at specific diagnoses

In [None]:
# One line per diagnosis code: MAE and explained variance of prediction vs. label.
for diagnosis in data["diagnosis"].unique().tolist():
    data_d = data.loc[data["diagnosis"] == diagnosis]
    y_test, y_pred = data_d.label, data_d.prediction
    diagnosis_mae = metrics.mean_absolute_error(y_test, y_pred)
    diagnosis_explained = metrics.explained_variance_score(y_test, y_pred)
    print(diagnosis, "MAE:", diagnosis_mae, "Explained variance:", diagnosis_explained)