In [None]:
# If you don't have matplotlib, numpy or pandas installed on your machine, uncomment these lines to pip install them:
# %pip install matplotlib
# %pip install numpy
# %pip install pandas

# and of course install quibbler!
%pip install pyquibbler

In [None]:
from pyquibbler import iquib, initialize_quibbler, q, quiby
initialize_quibbler()
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider
import matplotlib.patches as patches
import numpy as np
import pandas as pd
%matplotlib tk

In [None]:
# Load the two inputs used by the rest of the notebook from local pickle files.
# NOTE(review): unpickling can execute arbitrary code, so only load these
# files if they come from a trusted source.
df = pd.read_pickle('data/ICD_df.pkl')
# Used further down via len() and .values() (the values become the bar labels).
chapters_abb = pd.read_pickle('data/chapters_abb.pkl')

In [None]:
@quiby
def filter_by_frequency(frequencies, frequency_threshold):
    """Locate the entries whose frequency is strictly above a threshold.

    Args:
        frequencies: array of frequency values to test.
        frequency_threshold: cut-off value; entries equal to it are excluded.

    Returns:
        The tuple of index arrays produced by single-argument ``np.where``,
        which can be used directly to index an array.
    """
    is_frequent = frequencies > frequency_threshold
    return np.where(is_frequent)

@quiby
def filter_text_by_frequency(bins_height, cluster_threshold, data):
    """Join the text of the rows that belong to small, non-empty bins.

    A bin is selected when its height is strictly greater than 1e-6 and
    strictly less than ``cluster_threshold``. Rows are then kept when the
    value in their column 5 is the index of a selected bin.

    Args:
        bins_height: array of bin heights; a bin is identified by its position.
        cluster_threshold: exclusive upper bound on the height of a selected bin.
        data: 2-D array; column 5 is matched against the selected bin indices
            and column 1 supplies the text that is returned.

    Returns:
        The column-1 values of the kept rows joined by blank lines, or ``''``
        when no row is kept.
    """
    is_small_nonempty = (bins_height > 1e-6) & (bins_height < cluster_threshold)
    selected_bins = np.where(is_small_nonempty)
    # Boolean row mask: True where the row's bin index is one of the selected bins.
    row_is_selected = np.isin(data[:, 5], selected_bins)
    texts = data[row_is_selected, 1]
    if len(texts) == 0:
        return ''
    return '\n\n'.join(texts.tolist())

@quiby
def get_indices_from_array(array, indices):
    """Index ``array`` with ``indices`` and return the result.

    Args:
        array: any object supporting subscripting.
        indices: the subscript to apply; passed to ``array[...]`` unchanged.

    Returns:
        ``array[indices]``.
    """
    selection = array[indices]
    return selection

@quiby
def points_in_ellipse_indices(center, width, height, points):
    """Return the indices of the points lying inside or on an axis-aligned ellipse.

    A point is inside when ``(dx / width)**2 + (dy / height)**2 <= 1``, so
    ``width`` and ``height`` act as the semi-axes along x and y.

    Args:
        center: pair holding the x and y coordinates of the ellipse centre.
        width: semi-axis length along x.
        height: semi-axis length along y.
        points: 2-D array with x in column 0 and y in column 1; any further
            columns are ignored.

    Returns:
        1-D array with the row indices of the points that satisfy the
        inequality above.
    """
    x_term = (center[0] - points[:, 0]) / width
    y_term = (center[1] - points[:, 1]) / height
    normalized_sq_distance = x_term**2 + y_term**2
    is_inside = normalized_sq_distance <= 1
    return np.where(is_inside)[0]

In [None]:
# define variables to control scatter plot
frequency_threshold = iquib(100)
df_as_array = df.to_numpy()
indexes_to_plot = filter_by_frequency(df_as_array[:,2].astype(int) , frequency_threshold)
above_threshold = get_indices_from_array(df_as_array, indexes_to_plot)
# columns 6 and 7 are the x/y scatter coordinates (the 2D TSNE of the codes);
# column 5 is the chapter index, used below to pick the colour of each point
rows_to_plot = above_threshold[:, (6,7,5)]

# one 'tab20' colour per chapter, shared by the scatter plot and the bar chart
num_chapters = len(chapters_abb)
chapter_colors = plt.get_cmap('tab20')(np.linspace(0,1,num_chapters))

# create scatter plot of TSNE
fig = plt.figure(figsize=(20,10))
ax_scatter = fig.add_axes([0, 0, 0.5, 1])
ax_scatter.scatter(rows_to_plot[:,0], rows_to_plot[:,1], s=4, c=get_indices_from_array(chapter_colors, q(np.asarray,rows_to_plot[:,2], dtype=int)))

# define ellipse quibbed variables
center = iquib(np.array([35., -28.]))
width = iquib(15.)
height = iquib(10.)

# plot ellipse outline and mark its centre
t = np.linspace(0, 2*np.pi, 100)
ax_scatter.plot(center[0] + width*np.cos(t), center[1] + height*np.sin(t), c='r', linewidth=2)
ax_scatter.plot(center[0], center[1], '+', c='r', markersize=10, markeredgewidth=2)

# add dragging points to ellipse (one on the outline in each quadrant)
for angle in (0.25, 0.75, 1.25, 1.75):
    ax_scatter.plot(width*np.cos(angle * np.pi) + center[0], height*np.sin(angle * np.pi) + center[1], '*', c='black', markersize=5, markeredgewidth=2)

# add slider for ellipse variables
ax_width = fig.add_axes([0.58, 0.0, 0.12, 0.05])
ax_width.set_xlim(0, 10)
slider_width = Slider(ax_width, 'Width', 0, 20, valinit=width, valstep=0.1)
ax_height = fig.add_axes([0.58, 0.04, 0.12, 0.05])
ax_height.set_xlim(0, 10)
slider_height = Slider(ax_height, 'Height', 0, 20, valinit=height, valstep=0.1)

# add slider for minimum frequency of ICD9 code in the dataset
ax_freq = fig.add_axes([0.58, 0.08, 0.12, 0.05])
ax_freq.set_xlim(1,5000)
# keep a reference to the slider (like the two above): matplotlib widgets
# must stay referenced to remain responsive
slider_freq = Slider(ax_freq, valmin=1, valmax=5000, valinit=frequency_threshold, label='Min Frequency')

# calculate indices of points inside ellipse
points_inside_ellipse = points_in_ellipse_indices(center, width, height, rows_to_plot)

# plot bars that represent the number of diseases inside the ellipse and add threshold to define cluster
chapter_to_plot = q(np.asarray, above_threshold[points_inside_ellipse, 5], dtype=np.int64)
# fraction of the selected points per chapter; the 1e-6 avoids dividing by zero
# when no point is inside the ellipse
normalized_bins_height = q(np.bincount, chapter_to_plot, minlength=num_chapters)/(q(len, chapter_to_plot)+ 1e-6)
chapter_names = list(chapters_abb.values())

cluster_threshold = iquib(0.1)
ax_bars = fig.add_axes([0.5, 0.35, 0.25, 0.65])
ax_bars.bar(chapter_names, normalized_bins_height, color=chapter_colors)
ax_bars.set_xticks(chapter_names)
ax_bars.set_xticklabels(chapter_names, rotation=90)
ax_bars.axhline(cluster_threshold, linestyle='--', color='r')

# plot the descriptions of the codes inside the ellipse whose chapter bar is
# non-empty but below the cluster threshold
text_to_plot = filter_text_by_frequency(normalized_bins_height, cluster_threshold, above_threshold[points_inside_ellipse])
ax_text = fig.add_axes([0.75, 0, 0.25, 1])
ax_text.axis(False)
ax_text.text(x=0.01 , y=0.05, s=text_to_plot, fontsize=9, fontstretch='ultra-condensed', wrap=True)
plt.show()