# Unsupervised machine learning

Previously, we took MR images and attempted to assign them labels of normal/diseased, either at the whole-image level (classification) or at the pixel level (segmentation). But what if we don't have labels?

We are going to spend this tutorial exploring how to find patterns in data using unsupervised machine learning. We'll introduce three new tools that require no labels (though we'll keep track of the labels to evaluate our performance):

- Autoencoders: deep neural networks designed to minimize the "reconstruction error" between the input and the output (the target output is the input itself)
- K-means clustering: a way of automatically finding groups of unlabelled data points in space based on the distances between them
- K-nearest neighbours: assigning a new data point a label based on its proximity to other labelled data points

In [None]:
# Load IPython's autoreload extension (switched on later with `%autoreload 2`).
%load_ext autoreload
# Glob pattern for the input NIfTI volumes; this is a machine-specific path.
train_path = 'C:/Users/jxb29/Dropbox (Partners HealthCare)/Teaching/BRATS_10_Updated/*/*.nii.gz'
# Sequences to keep. A file's label is the position of its sequence in this list.
sequences = ['t1', 't2', 't1ce', 'flair']

In [None]:
from skimage.measure import label, regionprops

def normalize_images(channel_copy):
    """Z-score an image, ignoring its largest connected region of zeros.

    The zero-valued voxels are split into connected components and the
    component with the greatest area is masked out. The mean and standard
    deviation are then computed on the masked array, so the masked region
    does not contribute to either statistic.

    Parameters
    ----------
    channel_copy : array-like
        Image or volume to normalize.

    Returns
    -------
    numpy.ndarray
        The data array underlying the normalized masked array.
    """
    zero_components = label(channel_copy == 0)

    # Biggest zero-valued component (first one wins on ties); None if absent.
    biggest = max(regionprops(zero_components), key=lambda region: region.area, default=None)
    background_label = None if biggest is None else biggest.label

    foreground = np.ma.masked_where(zero_components == background_label, channel_copy)

    # Centre first, then scale by the spread of the centred values.
    centred = foreground - np.mean(foreground)
    scaled = centred / np.std(centred)
    return np.ma.getdata(scaled)

In [None]:
from glob import glob
from os import makedirs
from os.path import basename, join

import nibabel as nib
import numpy as np
import pandas as pd
from PIL import Image

# Collect a fixed number of central slices from every matching NIfTI volume,
# write a 100x100 thumbnail per slice, and record one sequence label per slice.
all_images = glob(train_path)

slices = []
labels = []

thumb_data = []

no_slices = 40

# Image.save() will not create a missing folder, so make sure the thumbnail
# directory exists before the loop starts writing into it.
makedirs('thumbnails', exist_ok=True)

for nifti_file in all_images:

    # File name up to the first '.'; its last '_'-separated token names the sequence.
    subject_name = basename(nifti_file).split('.')[0]
    seq = subject_name.split('_')[-1]

    if seq not in sequences:
        continue

    # Load Nifti file, normalize it.
    # get_fdata() is used because nibabel deprecated and later removed get_data().
    vol = nib.load(nifti_file).get_fdata()
    vol = normalize_images(vol)

    # Take a middle-ish section of the volume
    halfway_point = vol.shape[2] // 2
    sample = [vol[:, :, i] for i in range(halfway_point - (no_slices // 2), halfway_point + (no_slices // 2))]
    slices.extend(sample)

    # Generate thumbnails
    for i, np_arr in enumerate(sample):

        pil_img = Image.fromarray(np_arr).resize((100, 100))
        file_name = join('thumbnails', f'{subject_name}_{i}.tif')
        pil_img.save(file_name)

        thumb_dict = dict(file_name=file_name, subject_name=subject_name, sequence=seq)
        thumb_data.append(thumb_dict)

    # Keep track of the labels. The sequence ID is the index into `sequences`
    # (e.g. 0 == t1, 1 == t2). One label is added per slice actually sampled,
    # so `labels` always stays aligned with `slices`.
    index = sequences.index(seq)
    index_list = [index] * len(sample)
    labels.extend(index_list)


# One row per thumbnail: file path, subject name and sequence.
df = pd.DataFrame(data=thumb_data)
print(df)
df.to_csv('thumbs.csv')

In [None]:
# Stack the 2-D slices into one array and append a trailing channel axis:
# (samples, rows, columns) -> (samples, rows, columns, 1).
# With the tutorial data this is (40 * N, 240, 240, 1).
X = np.asarray(slices)[..., np.newaxis]
y = np.asarray(labels)
print(X.shape, y.shape)

In [None]:
import h5py
from keras.utils.io_utils import HDF5Matrix
from keras.preprocessing.image import ImageDataGenerator


def save_hdf5_file(train_data, output_filename):
    """Write ``train_data`` to ``output_filename`` as the HDF5 dataset 'train'.

    The file is opened in 'w' mode and the dataset is stored with the same
    dtype as ``train_data``.
    """
    with h5py.File(output_filename, mode='w') as h5_out:
        h5_out.create_dataset(name='train', data=train_data, dtype=train_data.dtype)

class ReconGenerator:
    """Serve (input, target) batches for autoencoder training from an HDF5 file.

    Parameters
    ----------
    save_file : str
        Path of the HDF5 file holding the 'train' dataset.
    training_data : array-like, optional
        When given, it is written to ``save_file`` first. When omitted, an
        existing ``save_file`` is read as-is.
    batch_size : int
        Number of samples per batch; also used to derive ``steps``.
    augmentation : dict, optional
        Keyword arguments forwarded to ``ImageDataGenerator``.

    Attributes set here: ``X_train`` (the HDF5-backed data), ``image_shape``
    (shape of one sample) and ``steps`` (full batches per pass over the data).
    """

    def __init__(self, save_file, training_data=None, batch_size=20, augmentation=None):

        self.training_data = training_data
        self.save_file = save_file
        self.batch_size = batch_size
        self.augmentation = augmentation
        self.seed = 1989

        # Only (re)write the HDF5 file when fresh data is supplied.
        if training_data is not None:
            save_hdf5_file(training_data, self.save_file)

        self.X_train = HDF5Matrix(self.save_file, 'train')
        self.image_shape = self.X_train.shape[1:]
        self.steps = self.X_train.shape[0] // self.batch_size

    def generate(self):
        """Return an iterator of ``(batch, batch)`` pairs for reconstruction.

        Each batch is used as both input and target. Zipping the flow
        iterator with itself would not do this: both sides would advance the
        same iterator, pairing every batch with the next one.
        """
        aug_dict = self.augmentation if self.augmentation is not None else {}

        X_datagen = ImageDataGenerator(**aug_dict)
        X_generator = X_datagen.flow(self.X_train, seed=self.seed, batch_size=self.batch_size, shuffle=True)

        return ((batch, batch) for batch in X_generator)

In [None]:
batch_size = 20
# No `training_data` is passed, so this reads an existing 'ae_data.h5'.
# Pass `training_data=X` on a first run to create the file.
recon_gen = ReconGenerator('ae_data.h5', batch_size=batch_size)

In [None]:
# Switch autoreload to mode 2 (presumably so edits to `models` are picked up on re-run).
%autoreload 2
import models
# Build the autoencoder for the shape of one training sample.
ae = models.autoencoder(image_shape=recon_gen.image_shape)
# Mean squared error loss, SGD optimizer; mean absolute error is reported as a metric.
ae.compile(loss='mse', optimizer='sgd', metrics=['mae'])
# Train for 10 epochs of `recon_gen.steps` batches each.
ae.fit_generator(recon_gen.generate(), epochs=10, steps_per_epoch=recon_gen.steps)
# Save the weights to disk; the encoder-only model loads them from this file.
ae.save_weights('ae_weights.h5')

In [None]:
import models
# Build the same model with `encoder_only=True` for the same input shape.
encoder = models.autoencoder(image_shape=recon_gen.image_shape, encoder_only=True)
# Load the saved autoencoder weights, matching layers by name.
encoder.load_weights('ae_weights.h5', by_name=True)

In [None]:
import numpy as np

In [None]:
# Run the encoder over the stored training data and save the result to disk.
features = encoder.predict(recon_gen.X_train)
np.savez('features.npz', features)

In [None]:
# The array was passed to np.savez positionally, so it is stored under the
# default key 'arr_0'.
features = np.load('features.npz')['arr_0']  # bit weird, but necessary
print(features.shape)

In [None]:
from sklearn.manifold import TSNE

# Project the encoded features down to two dimensions with t-SNE.
tsne = TSNE(n_components=2, verbose=1)
T = tsne.fit_transform(features)

In [None]:
# Show the shape of the t-SNE output.
print(T.shape)

In [None]:
import seaborn as sns
sns.set_style('white')
import matplotlib.pyplot as plt
%matplotlib inline

def plot_embedding(T, y):
    """Scatter-plot a 2-D embedding with one colour per label value.

    Parameters
    ----------
    T : array
        Embedding; columns 0 and 1 are used as the plot coordinates.
    y : array
        Label per row of ``T``. Values 0-3 are plotted, each with its own
        colour and a legend entry taken from ``sequences``.
    """
    plt.figure(figsize=(10, 10))

    # Label value k is drawn in the k-th colour and named after sequences[k].
    for class_id, colour in enumerate(('r', 'b', 'c', 'm')):
        members = y == class_id
        plt.scatter(
            T[members, 0],
            T[members, 1],
            c=colour,
            marker='.',
            s=10,
            alpha=.5,
            label=sequences[class_id].upper(),
        )

    plt.legend()
    plt.show()

In [None]:
!conda install -c conda-forge umap-learn
import umap
embedding = umap.UMAP().fit_transform(features)

In [None]:
import pandas as pd

# Attach the two embedding coordinates and a hex colour per sequence to the
# thumbnail table.
sequence_colors = {'t1': '#c866d1', 't2': '#6674d1', 't1ce': '#66d171', 'flair': '#f4b942'}

df = pd.read_csv('thumbs.csv', index_col=0).assign(x=embedding[:, 0], y=embedding[:, 1])
df['color'] = df['sequence'].replace(sequence_colors)

In [None]:
# Write the table back to 'thumbs.csv', overwriting the earlier file.
df.to_csv('thumbs.csv')

# Plan for today

* Review last week
* Demonstrate Bokeh plots
* Create some thumbnail versions of our data
* Create an interactive visualization in Bokeh

In [None]:
import pandas as pd
# Reload the thumbnail table, using the first CSV column as the index, and preview it.
df = pd.read_csv('thumbs.csv', index_col=0)
df.head()

In [None]:
from bokeh.plotting import figure, output_file, show, ColumnDataSource

# Render the plot into a standalone HTML file.
output_file("bokeh.html")

# Columns handed to Bokeh: embedding coordinates, subject name, thumbnail path
# and point colour, all taken from the dataframe.
# NOTE(review): `imgs` holds the '.tif' paths stored in thumbs.csv. Confirm the
# target browser can display TIFF files inside an <img> tag.
source = ColumnDataSource(data=dict(
    x=df['x'],
    y=df['y'],
    desc=df['subject_name'],
    imgs=df['file_name'],
    color=df['color']
))

# HTML template for the hover tooltip. The `@imgs` and `@desc` placeholders
# match column names in `source`.
TOOLTIPS = """
    <div>
        <div>
            <img
                src="@imgs" height="100" alt="@imgs" width="100"
                style="float: left; margin: 0px 15px 15px 0px;"
                border="2"
            ></img>
        </div>
        <div>
            <span style="font-size: 17px; font-weight: bold;">@desc</span>
            <span style="font-size: 15px; color: #966;">[$index]</span>
        </div>
        <div>
            <span style="font-size: 15px;">Location</span>
            <span style="font-size: 10px; color: #696;">($x, $y)</span>
        </div>
    </div>
"""

# 800x800 figure that uses the custom tooltip template.
p = figure(plot_width=800, plot_height=800, tooltips=TOOLTIPS,
           title="UMAP applied to autoencoded MR features")

# One semi-transparent, outline-free marker per thumbnail, coloured by the 'color' column.
p.circle('x', 'y', fill_color='color', fill_alpha=0.5, line_alpha=0., size=8, source=source)

show(p)
