In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
# Use seaborn's plain 'white' style for every matplotlib figure below.
sns.set_style('white')
# Default size for all later figures: 80 x 80 (matplotlib figsize units are inches).
plt.rcParams['figure.figsize'] = (80, 80)

import numpy as np 
import pandas as pd

import requests
from io import BytesIO
from PIL import Image

from umap import UMAP
from sklearn.cluster import AgglomerativeClustering

The whole point of this thing is to open up corners of the collection that were otherwise inaccessible because of the bad algorithm. But how do we know that we've done that? We can muck around with the demo, but it's not really telling us how much of the collection we can _actually_ access through the mapping of one manifold to another.

I want to float through all of sentence space and see how many of the works I can touch.

In [None]:
# Load the saved image identifiers.
image_ids = np.load('../data/image_ids.npy')
# Load the saved feature vectors and flatten them to one 4096-wide row each.
# presumably row i of `embeddings` corresponds to image_ids[i] — TODO confirm
embeddings = np.load('../data/embeddings.npy').reshape(-1, 4096)

In [None]:
# Draw a random subset of rows (without replacement) so the later
# projection and clustering steps work on a manageable number of points.
n_samples = 20000
sample_indicies = np.random.choice(len(embeddings), size=n_samples, replace=False)

# Apply the same row selection to the ids and the vectors so they stay aligned.
image_id_sample, embeddings_sample = (image_ids[sample_indicies],
                                      embeddings[sample_indicies])

# dimensionality reduction
### 2D projection

In [None]:
# Configure a 2-component UMAP projection (10 neighbours, cosine metric).
fitter = UMAP(n_neighbors=10, 
              n_components=2,
              metric='cosine')

# NOTE(review): this fits on the full `embeddings` array, not on the
# `embeddings_sample` subset drawn above (the 3D cell does use the sample) —
# confirm that projecting the whole collection is intended here.
embeddings_2d = fitter.fit_transform(embeddings)

In [None]:
# Assign each projected 2D point to one of 20 agglomerative clusters.
# NOTE(review): `labels_2d` is not passed to the scatter call in the next
# cell — confirm whether the plot was meant to be coloured by cluster.
labels_2d = (AgglomerativeClustering(n_clusters=20)
             .fit_predict(embeddings_2d))

In [None]:
# Static scatter of the 2D projection; small translucent markers so that
# dense regions show up as darker patches.
fig, ax = plt.subplots()
ax.scatter(*embeddings_2d[:, :2].T, s=4, alpha=0.2);

### 3D projection

In [None]:
# Configure a 3-component UMAP projection (10 neighbours, cosine metric).
fitter = UMAP(n_neighbors=10, 
              n_components=3,
              metric='cosine')

# Fit on the random subset only and keep the resulting 3D coordinates.
embeddings_3d = fitter.fit_transform(embeddings_sample)

In [None]:
# Assign each projected 3D point to one of 20 agglomerative clusters;
# the labels are used to colour the 3D scatter below.
labels_3d = (AgglomerativeClustering(n_clusters=20)
             .fit_predict(embeddings_3d))

In [None]:
# 3D scatter of the sampled projection, one colour per cluster label.
fig, ax = plt.subplots(subplot_kw={'projection': '3d'})
ax.scatter(*embeddings_3d[:, :3].T, s=1, c=labels_3d, alpha=0.2);

# plotting with datashader

In [None]:
import datashader as ds
import datashader.transfer_functions as tf
from bokeh.models import BoxZoomTool
from bokeh.plotting import figure, output_notebook, show
import datashader as ds
from datashader.bokeh_ext import InteractiveImage
from functools import partial
# Tell bokeh to render its output inline in this notebook.
output_notebook()

### plain

In [None]:
# Wrap the 2D projection in a DataFrame with named axes, which is the
# form the datashader calls below take ('x' / 'y' column names).
df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])

In [None]:
# Rasterise every point onto a 400 x 400 canvas.
cvs = ds.Canvas(plot_width=400, plot_height=400)
# Aggregate the 'x'/'y' columns of `df` onto the canvas grid.
agg = cvs.points(df, 'x', 'y')
# Shade the aggregate using the 'eq_hist' mapping.
img = tf.shade(agg, how='eq_hist')
# Last expression: display the shaded image as the cell output.
img

### with bokeh

In [None]:
def base_plot(tools='pan, wheel_zoom, reset',
              plot_width=950,
              plot_height=950,
              **plot_args):
    """Build a bare bokeh figure to host a datashader image.

    The defaults for ``plot_width`` / ``plot_height`` used to be the names
    ``plot_width`` / ``plot_height`` themselves, which are evaluated when the
    ``def`` statement runs and are not defined anywhere in this notebook, so
    the cell raised ``NameError`` on a fresh kernel. They now default to 950,
    the same size ``create_image`` uses by default.

    Args:
        tools: comma-separated bokeh tool names passed to ``figure``.
        plot_width: figure width passed to ``figure``.
        plot_height: figure height passed to ``figure``.
        **plot_args: any extra keyword arguments forwarded to ``figure``.

    Returns:
        The configured bokeh figure, with axes and grid lines hidden and an
        aspect-preserving box-zoom tool added.

    Note:
        ``x_range`` and ``y_range`` are read from module-level variables at
        call time, so they must be defined before this function is called.
    """
    p = figure(tools=tools,
               plot_width=plot_width,
               plot_height=plot_height,
               x_range=x_range,
               y_range=y_range,
               outline_line_color='grey',
               min_border=0,
               min_border_left=0,
               min_border_right=0,
               min_border_top=0,
               min_border_bottom=0,
               **plot_args)

    # Strip the chart decoration so only the rendered image is visible.
    p.axis.visible = False
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.add_tools(BoxZoomTool(match_aspect=True))
    return p


def create_image(x_range, y_range, width=950, height=950):
    """Render the points of the module-level ``df`` for one viewport.

    Args:
        x_range: x extent of the viewport, passed to the datashader canvas.
        y_range: y extent of the viewport, passed to the datashader canvas.
        width: canvas width.
        height: canvas height.

    Returns:
        The 'eq_hist'-shaded image after ``dynspread`` with circular
        spreading of up to 5 px.
    """
    canvas = ds.Canvas(x_range=x_range, y_range=y_range,
                       plot_width=width, plot_height=height)
    shaded = tf.shade(canvas.points(df, 'x', 'y'), how='eq_hist')
    return tf.dynspread(shaded, max_px=5, shape='circle')

In [None]:
# Full data extent along each axis, as (min, max) tuples; `base_plot`
# reads these module-level names when it builds the figure.
x_range, y_range = ((df[axis].min(), df[axis].max()) for axis in ('x', 'y'))

In [None]:
# Build the empty bokeh figure with a white background.
p = base_plot(background_fill_color='white')
# Hand the figure and the rendering callback to datashader's InteractiveImage;
# as the last expression it is displayed as the cell output.
InteractiveImage(p, create_image)

Datashader doesn't allow you to add annotations to points... very annoying. Back to static plots then...
# Grabbing little clusters

In [None]:
# Configure a 2-component UMAP projection (10 neighbours, cosine metric).
fitter = UMAP(n_neighbors=10, 
              n_components=2,
              metric='cosine')

# Refit on the random subset only; this overwrites the earlier
# `embeddings_2d` that was computed from the full `embeddings` array.
embeddings_2d = fitter.fit_transform(embeddings_sample)

Aim for somewhere around 50 points per cluster.

In [None]:
# Number of clusters that gives roughly 50 sampled points per cluster.
n_clusters = n_samples // 50

In [None]:
# Cluster the sampled points into `n_clusters` groups.
# NOTE(review): despite the `labels_2d` name, the clustering is fitted on
# `embeddings_sample` (the 4096-wide vectors), not on `embeddings_2d` —
# confirm that clustering in the original space is intended.
labels_2d = (AgglomerativeClustering(n_clusters=n_clusters)
             .fit_predict(embeddings_sample))

In [None]:
# Scatter of the sampled 2D projection, coloured by cluster label.
fig, ax = plt.subplots()
ax.scatter(*embeddings_2d[:, :2].T, s=4, alpha=0.8, c=labels_2d);

In [None]:
# One row per sampled image id: its 2D coordinates plus its cluster label.
# Stacking the integer labels next to the float coordinates stores the
# 'cluster' column as floats.
df = pd.DataFrame(
    np.column_stack([embeddings_2d, labels_2d]),
    index=image_id_sample,
    columns=['x', 'y', 'cluster'],
)
df.head()

In [None]:
# Pick one cluster label at random and keep only the rows that carry it.
chosen_cluster = np.random.choice(df['cluster'].unique())
cluster_df = df.loc[df['cluster'] == chosen_cluster]
print(cluster_df.shape[0])
cluster_df.head()

In [None]:
# Largest square grid that the chosen cluster can fill completely.
side_length = int(np.sqrt(len(cluster_df)))
n_images = side_length * side_length

In [None]:
# Randomly pick exactly enough distinct image ids to fill the square grid.
cluster_image_ids = np.random.choice(
    cluster_df.index.to_numpy(), size=n_images, replace=False
)

In [None]:
# URL template with a single `{}` slot for the image id.
base_url = 'https://iiif.wellcomecollection.org/image/{}.jpg/full/760,/0/default.jpg'
urls = list(map(base_url.format, cluster_image_ids))

In [None]:
def image_from_url(url, image_size=200, timeout=30):
    """Download an image and return it as a square RGB thumbnail.

    Args:
        url: address of the image to fetch.
        image_size: side length, in pixels, of the returned square image.
        timeout: seconds to wait for the server before giving up, so that a
            stalled request cannot hang the notebook indefinitely.

    Returns:
        A PIL image of size ``(image_size, image_size)`` in RGB mode.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (previously the error body was handed to PIL, which failed with
            a less informative message).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    raw_image = Image.open(BytesIO(response.content))
    # Normalise to three channels: grayscale or RGBA sources would otherwise
    # break the (..., 200, 200, 3) reshape used to build the image grid.
    rgb_image = raw_image.convert('RGB')
    return rgb_image.resize((image_size, image_size),
                            resample=Image.BILINEAR)

In [None]:
# Download every thumbnail and tile them into one big square image.
# The hard-coded 200 must match the `image_size` default of `image_from_url`.
#   1. stack the thumbnails, then view them as a (row, col, h, w, channel) grid
#   2. swap the col and h axes so the pixel rows of one grid row are adjacent
#   3. collapse to a single (side_length*200, side_length*200, 3) array
images = (np.array([np.array(image_from_url(url)) for url in urls])
          .reshape(side_length, side_length, 200, 200, 3)
          .transpose(0, 2, 1, 3, 4)
          .reshape(side_length*200, side_length*200, 3))

In [None]:
# Turn the tiled array back into a PIL image; as the last expression it is
# displayed as the cell output.
Image.fromarray(images)

# wikitext

In [None]:
# NOTE(review): hard-coded absolute path to a local copy of the corpus —
# point this at a configurable data directory before sharing the notebook.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale.
with open('/Users/pimh/datasets/wikitext-103/wiki.train.tokens',
          encoding='utf-8') as f:
    # Split the whole file on every '=' character into text chunks.
    articles = f.read().split('=')

In [None]:
# Progress-bar helper for the loops below, aliased to `tqdm`.
from tqdm import tqdm_notebook as tqdm
import spacy
# Load the spaCy pipeline used below to split text into sentences.
# NOTE(review): the bare 'en' shortcut is only resolvable on older spaCy
# releases — confirm against the installed version.
nlp = spacy.load('en')

In [None]:
sentences = []

# Run spaCy over the first 10,000 text chunks and keep, as plain strings,
# every detected sentence whose length is greater than one.
for article in tqdm(articles[:10000]):
    sentences.extend(
        str(span) for span in nlp(article).sents if len(span) > 1
    )

In [None]:
import torch
from torch import nn

class SentenceEncoder(nn.Module):
    """Bidirectional LSTM followed by a max over dimension 1.

    The single-layer LSTM takes 300-wide inputs and has 2048 hidden units
    per direction, so every output vector is 4096 wide.

    NOTE(review): the LSTM is built without ``batch_first=True`` while
    ``embed`` feeds it a ``(1, n_tokens, 300)`` tensor — confirm that this
    axis layout matches how the saved checkpoint was trained.
    """

    def __init__(self):
        super().__init__()
        # The attribute name must stay `enc_lstm`: it is part of the
        # state-dict keys that `load_state_dict` matches against.
        self.enc_lstm = nn.LSTM(
            input_size=300,
            hidden_size=2048,
            num_layers=1,
            bidirectional=True,
        )

    def forward(self, wv_batch):
        """Run the LSTM over ``wv_batch`` and max-pool its outputs over dim 1."""
        lstm_out, _state = self.enc_lstm(wv_batch)
        pooled, _argmax = lstm_out.max(dim=1)
        return pooled

In [None]:
# Build the encoder and fill it with the saved weights, mapped onto the CPU.
# `load_state_dict` is the last expression, so its return value is shown as
# the cell output.
model = SentenceEncoder()
model.load_state_dict(
    torch.load('../../../apps/devise/data/sentence-encoder-2018-10-16.pt', 
               map_location='cpu'))

In [None]:
import pickle 
# Load the word -> index lookup. The file is opened in a `with` block so the
# handle is closed deterministically (the bare `open(...)` call leaked it).
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
with open('../../../apps/devise/data/word_to_index.pkl', 'rb') as f:
    word_to_index = pickle.load(f)
# Load the array that `embed` indexes by word index to get word vectors.
index_to_wordvec = np.load('../../../apps/devise/data/index_to_wordvec.npy')

In [None]:
import torch
import numpy as np
from nltk.tokenize import word_tokenize


def sentence_to_indexes(sentence, word_to_index):
    """Convert a sentence into the list of indexes of its known words.

    The text is lower-cased and every non-alphabetic character is replaced
    by a space before tokenising. Tokens that are not keys of
    ``word_to_index`` are dropped silently.
    """
    cleaned = ''.join(ch if ch.isalpha() else ' ' for ch in sentence.lower())
    return [
        word_to_index[token]
        for token in word_tokenize(cleaned)
        if token in word_to_index
    ]


def embed(sentence, model, word_to_index, index_to_wordvec):
    """Encode one sentence as a single vector.

    Args:
        sentence: raw text to encode.
        model: callable applied to a float32 tensor holding the word vectors
            with a leading dimension of size 1 added.
        word_to_index: mapping from word to index; must contain the
            ``"<s>"`` and ``"</s>"`` boundary markers.
        index_to_wordvec: lookup from index to word vector.

    Returns:
        The model output as a numpy array with size-1 dimensions removed.
    """
    # Wrap the known words of the sentence in the start/end markers.
    indexes = (
        [word_to_index["<s>"]]
        + sentence_to_indexes(sentence, word_to_index)
        + [word_to_index["</s>"]]
    )
    wvs = np.stack([index_to_wordvec[i] for i in indexes])
    # Convert the stacked array directly rather than via `torch.Tensor([wvs])`:
    # building a tensor from a list of ndarrays takes a slow path.
    # The result is the same float32 tensor with a leading size-1 dimension.
    batch = torch.as_tensor(wvs, dtype=torch.float32).unsqueeze(0)
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        embedding = model(batch)
    return embedding.detach().numpy().squeeze()


def search(query_string, search_index, model, image_ids,
           word_to_index, index_to_wordvec, k=10):
    """Return the image ids of the ``k`` nearest neighbours of a text query.

    The query is embedded with ``embed`` and looked up through
    ``search_index.knnQuery``, which must return a pair whose first item
    holds the neighbour positions; those positions index into ``image_ids``.
    """
    query_vector = embed(query_string, model, word_to_index, index_to_wordvec)
    hits, _second = search_index.knnQuery(query_vector, k)
    return image_ids[hits]

In [None]:
# Embed every collected sentence (with a progress bar) and stack the
# resulting vectors into a single array, one row per sentence.
sentence_embeddings = np.array([
    embed(sentence, model, word_to_index, index_to_wordvec)
    for sentence in tqdm(sentences)
])

In [None]:
# Project the sentence embeddings to 2D for plotting.
fitter = UMAP(n_neighbors=10,
              n_components=2,
              metric='cosine')

sentence_embeddings_2d = fitter.fit_transform(sentence_embeddings)

# Aim for roughly 50 sentences per cluster. The count has to come from the
# sentence set itself: `n_samples` is the size of the *image* sample, so
# using it here gave the wrong cluster size and made the clustering fail
# whenever there were fewer sentences than requested clusters.
n_clusters = max(1, len(sentence_embeddings) // 50)

# As in the image section, clustering is fitted on the full-dimensional
# vectors rather than on the 2D projection.
labels_2d = (AgglomerativeClustering(n_clusters=n_clusters)
             .fit_predict(sentence_embeddings))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(sentence_embeddings_2d[:, 0],
           sentence_embeddings_2d[:, 1],
           s=40, alpha=0.8, c=labels_2d
          );

In [None]:
# Pick one sentence cluster at random and print the sentences it contains.
chosen_cluster = np.random.choice(np.unique(labels_2d))
in_cluster = labels_2d == chosen_cluster
print(np.array(sentences)[in_cluster])

# Grey, small markers everywhere; red, larger markers for the chosen cluster.
colours = np.full(len(sentences), fill_value='#606060')
colours[in_cluster] = '#f44242'

sizes = np.full(len(sentences), fill_value=40)
sizes[in_cluster] = 150

fig, ax = plt.subplots()
ax.scatter(*sentence_embeddings_2d[:, :2].T, s=sizes, alpha=0.8, c=colours);