In [1]:
import bokeh.io
import glasbey as gb
import json
import numpy as np
import pandas as pd
import panel as pn
import thisnotthat as tnt
import umap
import vectorizers as vz
import vectorizers.transformers as vzt
from zipfile import ZipFile

In [2]:
# Notebook display setup: switch Bokeh to notebook output and load the Panel
# extension. Both calls are made with their default arguments.
bokeh.io.output_notebook()
pn.extension()

In [3]:
# Pull both JSON members out of the zip archive, then stack them into a single
# frame indexed by the "id" column and ordered by that index.
with ZipFile("archive.zip") as file_data:
    train, test = (
        json.loads(file_data.read(member))
        for member in ("train.json", "test.json")
    )

data = (
    pd.DataFrame(train + test)
    .set_index("id")
    .sort_index()
)
data

In [4]:
%%time
# Fit an n-gram vectorizer (default settings) on the "ingredients" column.
# The bare name on the last line displays the fitted estimator.
vz_ngram = vz.NgramVectorizer().fit(data["ingredients"])
vz_ngram

In [5]:
# Display the matrix kept on the fitted vectorizer.
# NOTE(review): `_train_matrix` is underscore-prefixed, so presumably not part
# of the public API — confirm it is stable across `vectorizers` releases.
vz_ngram._train_matrix

In [6]:
# Number of entries in the fitted column-index dictionary
# (presumably one per learned column — TODO confirm).
len(vz_ngram.column_index_dictionary_)

In [7]:
# Peek at the entries stored under integer keys 0 through 9 of the fitted
# column-index dictionary (assumes at least ten columns were learned).
[vz_ngram.column_index_dictionary_[column] for column in range(10)]

In [34]:
# Per-column totals of the fitted count matrix, histogrammed on a log10 scale.
# NOTE(review): the bin edges are fixed at 0..4, so any log10 total outside
# [0, 4] (e.g. a column total above 10**4) is not counted in the histogram.
counts_raw = vz_ngram._train_matrix
distribution_tokens = np.asarray(counts_raw.sum(axis=0)).squeeze()
pd.Series(np.log10(distribution_tokens)).hist(bins=list(range(5)))

In [35]:
# Pass the raw count matrix through vectorizers' InformationWeightTransformer
# (default settings); the bare name on the last line displays the result.
counts_iwt = vzt.InformationWeightTransformer().fit_transform(counts_raw)
counts_iwt

In [36]:
# Per-column totals of the information-weighted matrix on a log10 scale.
# NOTE(review): the bin edges are fixed at 0..4, so log10 totals below 0
# (column total < 1) or above 4 are not counted in the histogram.
distribution_iwt = np.asarray(counts_iwt.sum(axis=0)).squeeze()
pd.Series(np.log10(distribution_iwt)).hist(bins=list(range(5)))

In [38]:
%%time
# Embed the information-weighted rows with UMAP using the Hellinger metric.
# `random_state` pins the otherwise stochastic layout so that Restart & Run All
# reproduces the same coordinates; UMAP falls back to single-threaded execution
# when a seed is set, so the fit is slower than the unseeded run.
# `unique=True` asks UMAP to de-duplicate identical rows before embedding.
u2 = umap.UMAP(metric="hellinger", unique=True, random_state=42).fit_transform(counts_iwt)
u2

In [12]:
# One cuisine label per row; rows without a cuisine value get the placeholder
# "UNKNOWN". `fillna` already returns a new Series, so `data` is left untouched.
cuisines = data["cuisine"].fillna("UNKNOWN")
# Distinct labels in sorted order, displayed as the cell output.
types_cuisine = sorted(set(cuisines))
types_cuisine

In [28]:
# Build a colour palette with glasbey, seeded with the light grey "#eeeeee" and
# sized by the number of distinct cuisine labels.
# NOTE(review): presumably the seed colour stays first in the returned palette,
# so the first entry of `types_cuisine` is drawn in grey — confirm in glasbey docs.
palette = gb.extend_palette(["#eeeeee"], len(types_cuisine))
palette

In [33]:
# Scatter plot pane over the 2-D embedding: one label per row taken from
# `cuisines`, hover text made of the cuisine plus the comma-joined ingredients,
# and colours assigned by pairing the sorted cuisine names with the palette.
plot = tnt.BokehPlotPane(
    u2,
    labels=cuisines.tolist(),
    hover_text=cuisines + " | " + data["ingredients"].map(", ".join),
    label_color_mapping={
        cuisine: colour for cuisine, colour in zip(types_cuisine, palette)
    },
    width=800,
    height=800,
    show_legend=False,
    marker_size=0.025,
)

# Label editor built from the plot's labels and the same palette, then
# attached to the plot through `link_to_plot`.
editor = tnt.LabelEditorWidget(
    labels=plot.labels,
    color_factors=types_cuisine,
    color_palette=palette,
    selectable_legend=True,
)
editor.link_to_plot(plot)

# Show the plot and the editor side by side.
pn.Row(plot, editor, height=1000)