In [1]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import CategoricalColorMapper, ColumnDataSource, HoverTool
from bokeh.palettes import Category10, Colorblind, Viridis, Viridis256
from bokeh.transform import linear_cmap
from bokeh.io import output_notebook, export_png
from bokeh.layouts import column, gridplot
output_notebook()

import numpy as np
import numba
import umap
from numpy.random import default_rng
rng = default_rng(seed=1234569)


In [2]:
# Attribute vocabulary: one inner list of level names per factor.
factors = [
    ["large", "small"],
    ["blue", "brown", "cyan", "gray", "green", "purple", "red", "yellow"],
    ["metal", "rubber"],
    ["cube", "cylinder", "sphere"],
]
# Every level of every factor owns one 0.0 / 1.0 slot in the concatenated
# one-hot vector, so an item has exactly len(factors) slots set.
nfactors = [len(levels) for levels in factors]   # number of levels per factor
ndistinct = np.prod(nfactors)                     # count of distinct level combinations
hid = np.sum(nfactors)                            # width of the one-hot vector
print("nfactors", nfactors, "hid", hid, "ndistinct",ndistinct)
# The dimensionality is already low

nfactors [2, 8, 2, 3] hid 15 ndistinct 96


In [3]:
# Uniform probability of each level within its own factor, laid out in the
# same slot order as the one-hot vector (one entry per slot).
facprobs = [1.0/width for width in nfactors for _slot in range(width)]
print(facprobs)
def create_pool(n=100, d=hid):
    """Draw `n` random items as concatenated one-hot rows.

    For every row, one level per factor is drawn with ``rng.choice`` and the
    corresponding slot is set to 1.0.  The array shape and every individual
    draw (row, factor, slot offset, level) are echoed to stdout.

    Returns a float32 array of shape (n, d).
    """
    pool = np.zeros((n, d), dtype=np.float32)
    print("ret.shape",pool.shape,"zeros")
    for row in range(n):
        offset = 0   # first slot of the current factor
        for fac, width in enumerate(nfactors):
            pick = rng.choice(width)
            print(row,fac,offset,pick)
            pool[row, offset + pick] = 1.0
            offset += width
    return pool
def code_up(code):
    """Advance `code` in place to the next value of a mixed-radix counter.

    Digit k counts in base ``nfactors[k]``; the last digit is the least
    significant one.  Stepping past the largest code wraps to all zeros.
    Nothing is returned.
    """
    ndigits = len(nfactors)
    assert( code.shape[0] == ndigits )
    for pos in reversed(range(ndigits)):
        nxt = code[pos] + 1
        if nxt < nfactors[pos]:
            # no overflow in this digit: store it and stop
            code[pos] = nxt
            return
        # digit overflowed: reset it and carry into the next more significant digit
        code[pos] = 0
def code2vec(code):
    """Expand a per-factor level code into its concatenated one-hot vector.

    ``code[k]`` is the level index chosen for factor k.  The result is a
    float32 vector of length ``hid`` with one slot set to 1.0 per factor.
    """
    vec = np.zeros(hid, dtype=np.float32)
    offset = 0   # first slot belonging to the current factor
    for k, level in enumerate(code):
        vec[offset + level] = 1.0
        offset += nfactors[k]
    return vec

def pool_str(x, d=hid):
    """Render one-hot item rows as human-readable level names.

    Parameters
    ----------
    x : ndarray of shape (hid,) or (n, hid)
        One item or a batch of items; any truthy slot contributes its level name.
    d : unused, kept only so existing callers keep working.

    Returns
    -------
    str
        One line per row.  Every level name is preceded by a single space
        (so a line starts with a space) and rows are separated by a newline,
        with no trailing newline.
    """
    if x.ndim == 1:
        x = x.reshape((1, x.shape[0]))
    lines = []
    for row in x:
        names = []
        dim = 0   # first slot of the current factor
        for levels, width in zip(factors, nfactors):
            for k in range(width):
                if row[dim + k]:
                    names.append(" " + levels[k])
            dim += width
        lines.append("".join(names))
    # Join *between* rows; appending the separator after each row glued the
    # first two rows together and left a dangling newline at the end.
    return "\n".join(lines)
            
# Draw two random items; show the first alone, then both as a batch.
a = create_pool(2)
print(pool_str(a[0]))
print(pool_str(a))

# Walk the first ten codes of the mixed-radix counter, starting at all zeros.
print("Generating sequential codes")
print("code      vector                                          string")
code = np.zeros(4, dtype=np.int32)
for step in range(10):
    v = code2vec(code)
    print(code, v, pool_str(v))
    code_up(code)   # advances `code` in place

def create_all():  # since ndistinct is only 96, create one of each "full universe"
    """Return the full universe of items, one row per distinct combination.

    Rows are produced in counter order (see `code_up`): row 0 is the all-zero
    code and the last factor varies fastest.

    Returns a float32 array of shape (ndistinct, hid).
    """
    ret = np.zeros((ndistinct,hid), dtype=np.float32)
    # Size the counter from the factor table instead of hard-coding four
    # digits, so adding or removing a factor does not trip code_up's assert.
    code = np.zeros(len(nfactors), dtype=np.int32)
    for i in range(ndistinct):
        ret[i,:] = code2vec(code)
        code_up(code)
    return ret
# Full universe of items plus a printable name for each row.
univ = create_all()
ustr = [pool_str(item) for item in univ]
print("\nfirst and last in universe:")
print(univ[0,:], pool_str(univ[0,:]), ustr[0])
print(univ[-1,:], pool_str(univ[-1,:]), ustr[-1])

# The snitch is a random row index into `univ`; `goods` starts out empty.
snitch = rng.choice(ndistinct)
goods = np.empty(0, dtype=np.int32)
print("snitch",snitch)
#

[0.5, 0.5, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.5, 0.5, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
ret.shape (2, 15) zeros
0 0 0 1
0 1 2 0
0 2 10 1
0 3 12 2
1 0 0 0
1 1 2 7
1 2 10 1
1 3 12 1
 small blue rubber sphere
 small blue rubber sphere large yellow rubber cylinder

Generating sequential codes
code      vector                                          string
[0 0 0 0] [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.]  large blue metal cube
[0 0 0 1] [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0.]  large blue metal cylinder
[0 0 0 2] [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]  large blue metal sphere
[0 0 1 0] [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]  large blue rubber cube
[0 0 1 1] [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]  large blue rubber cylinder
[0 0 1 2] [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.]  large blue rubber sphere
[0 1 0 0] [1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.]  large brown metal cube
[0 1 0 1] [1. 0. 0. 

### Universe distances
How would UMAP clustering organize these factors?

In [4]:
%%time
def epoch_normal(pts):
    """Centre `pts` and rescale it to unit mean squared radius, in place.

    `pts` is a 2-D float array with one point per row.  The column means are
    subtracted, the mean squared row norm is printed, and every coordinate is
    multiplied by the reciprocal of its square root.  Nothing is returned;
    the caller's array is modified.
    """
    # torch equivalent of the centring step: pts.sub_(pts.mean(axis=0))
    centroid = np.mean(pts, axis=0)
    pts -= centroid
    sq_radius = np.sum(np.square(pts), axis=1)
    d2 = np.average(sq_radius)
    print("avg d2", d2)
    scale = 1.0/np.sqrt(d2)
    pts *= scale
# note: we might also do an svd and rotate to some "standard" orientation

# Embed the whole universe in 2-D.
# NOTE(review): `output_constrain` is not a documented upstream umap-learn
# parameter; presumably this runs against a patched build that calls the
# 'final_pt' hook on the finished embedding -- confirm.
emb0 = umap.UMAP(
    n_neighbors=50,
    min_dist=0.001,
    learning_rate=0.5,
    init="random",
    random_state=12345,
    output_constrain={'final_pt': epoch_normal},
).fit_transform(univ)
# alternative, could just do it by hand...
#epoch_normal(emb0)

output_constrain keys dict_keys(['final_pt'])
avg d2 10.760185
CPU times: user 6.6 s, sys: 4.11 ms, total: 6.6 s
Wall time: 6.64 s


In [5]:
def plotit(emb0, snitch, goods):
    """Scatter-plot a 2-D embedding and outline the snitch and the good items.

    Parameters
    ----------
    emb0 : ndarray
        Embedding with one row per item; columns 0 and 1 are used as x and y.
    snitch : int
        Row index of the item to outline as the "snitch".
    goods : ndarray of int
        Row indices (possibly empty) of the items to outline as "good".

    Returns
    -------
    The bokeh figure; it is not shown here.

    Hover text for each item is read from the module-level list `ustr`.
    """
    source = ColumnDataSource(
        data = dict(
            x0=emb0[:,0],
            y0=emb0[:,1],
        )
    )
    source.data["text"] = [ustr[d] for d in range(emb0.shape[0])]
    tooltips = [
        ("(x,y)",  "(@x0,@y0)"),
        ("Text", "@text"),
    ]

    p1 = figure(title="Test UMAP on Clevr text",
                tooltips=tooltips)
    circles = p1.circle(source=source, x="x0", y="y0",
        size=8, fill_alpha=0.5,
    )
    # Restrict the hover tool to the item circles so that the marker squares
    # drawn below do not produce tooltips of their own.
    hover = p1.select_one(HoverTool)
    hover.renderers = [circles]

    # Square outlines around the snitch and the good items.  The mapper's
    # factors and the per-row labels are built from the same list: previously
    # the mapper knew "goods" while rows were labelled "good", so the good
    # markers never matched a factor and fell back to the mapper's NaN colour.
    marker_labels = ["snitch", "good"]
    cmap = CategoricalColorMapper(factors=marker_labels, palette=Category10[10])
    gb = np.vstack([emb0[snitch,], emb0[goods,]])
    gbcat = np.hstack((np.repeat(marker_labels[0],1),
                       np.repeat(marker_labels[1],goods.shape[0])))
    gbsource = ColumnDataSource( dict(
            x0 = gb[:,0],
            y0 = gb[:,1],
            label = gbcat,
        ))
    p1.square(source=gbsource, x="x0", y="y0",
              size=16, line_alpha=0.7, line_width=4, fill_alpha=0.0,
              color={"field": "label", "transform": cmap},
              legend_group="label",
    )

    return p1

# Baseline view: the embedding with only the snitch outlined (`goods` is
# still the empty index array at this point).
fig0 = plotit(emb0, snitch, goods)
show(fig0)

In [6]:
from scipy.spatial import distance_matrix
import numpy.ma as ma
# Pairwise Euclidean distances between all embedded points.
# NOTE(review): `rr` is not referenced again in the cells visible here
# (greedy_far recomputes its own matrix) -- confirm whether it is still needed.
rr = distance_matrix(emb0,emb0)
print("rr.shape",rr.shape)
def greedy_far(n, emb, perm0=None):
    """Greedy farthest-point selection of `n` more rows of `emb`.

    Each step picks the row whose distance to its nearest already-selected
    row is largest.

    Parameters
    ----------
    n : int, >= 1
        Number of rows to add.
    emb : ndarray, shape (m, k)
        Point coordinates, one point per row.
    perm0 : ndarray of int, optional
        Row indices that are already selected.  When omitted (or empty) the
        first pick is the row closest to the origin.

    Returns
    -------
    (perm, lens)
        perm : int32 array of length len(perm0) + n -- `perm0` followed by
            the newly selected row indices.
        lens : float32 array of length n -- lens[k] is the distance from the
            k-th newly added row to its nearest previously selected row
            (0.0 for a first pick made by closeness to the origin).
    """
    assert(n>=1)
    # nseed counts the caller-supplied indices; pbeg is the next slot to fill.
    nseed = 0 if perm0 is None else perm0.shape[0]
    pbeg = nseed
    pend = nseed + n
    perm = np.zeros(pend, dtype=np.int32)
    lens = np.zeros(n, dtype=np.float32)
    if nseed:
        perm[0:nseed] = perm0[0:nseed]
    else:
        # special case: choose first point "closest to origin"
        dists = np.sqrt(np.sum(np.square(emb), axis=1))
        print("dists",dists.shape, dists[0:5])
        i0 = np.argmin(dists)
        print("min dists to origin @", i0, "is", dists[i0])
        perm[pbeg] = i0
        pbeg = pbeg + 1
    print("pbeg",pbeg,"pend",pend)

    dists = distance_matrix(emb, emb)
    for i in range(pbeg,pend):
        print("i",i)
        # Find max of (min dist to any previously selected point)
        close = np.amin(dists[perm[0:i],:], axis=0)
        idx = np.argmax(close)
        perm[i] = idx
        # `lens` only covers the n new picks, so index it relative to the
        # seeds; indexing by `i` overran the array whenever perm0 was given.
        lens[i - nseed] = close[idx]
    return (perm,lens)
# instead of clustering, initial selection can come from "greedy furthest"
# For example, asking for n=12 such will give one point in each cluster
choices, lens = greedy_far(12, emb0)
print("choices",choices)
print("lens",lens)

rr.shape (96, 96)
dists (96,) [1.1206018 0.5575227 1.2683991 1.1240767 0.5968729]
min dists to origin @ 70 is 0.5086673
pbeg 1 pend 12
i 1
i 2
i 3
i 4
i 5
i 6
i 7
i 8
i 9
i 10
i 11
choices [70  2 39 48 31 65 79  5 69 80  6 46]
lens [0.         1.7722623  1.4980221  1.451209   0.79465044 0.7785066
 0.7550602  0.7433248  0.73706293 0.70710135 0.69869155 0.65252936]


In [7]:
# Same embedding, now with the greedy-farthest `choices` outlined as the
# "good" markers next to the snitch.
fig1 = plotit(emb0, snitch, choices)
show(fig1)

In [8]:
# Model a reasonably careful player. Choose nothing if no factors match,
# or select all/one of the options with the maximum number of matching factors.
# Factors are equally important.  Chooser never makes a mistake.
#
# Given a selection of hid-vectors (n x hid),
# select one at random that has the largest number of correct factors.
# If no factors are correct, nothing is selected (an empty index array is returned);
# else return a selection in [0,n-1].
def vec_select(x, cmp=None, all=True, verbose=False):
    """Return the indices of the rows of `x` that best match the snitch.

    A row's score is the number of slots set in both the row and `cmp`.

    Parameters
    ----------
    x : ndarray, shape (n, hid)
        Options presented to the player.
    cmp : ndarray, shape (hid,), optional
        The vector to compare against.  Defaults to ``univ[snitch,:]``,
        looked up at call time so that reassigning `snitch` (or `univ`) after
        this function was defined is honoured.
    all : bool, default True
        True returns every best-scoring row; False picks one of them at
        random with the module-level `rng`.
    verbose : bool
        Print the per-row scores and the winning score.

    Returns
    -------
    ndarray of int, or a single int
        With ``all=True``: the array of best-matching row indices.
        With ``all=False``: one index drawn from that array.
        In both modes an *empty* index array is returned when no row shares
        any slot with `cmp`, or when `x` has no rows.
    """
    if cmp is None:
        # Resolve the default late; binding it in the signature would freeze
        # whatever the snitch happened to be when the cell defining this ran.
        cmp = univ[snitch,:]
    assert(len(x.shape) == 2)
    if x.shape[0] == 0:
        # Nothing on offer, so nothing can be selected (np.amax would raise).
        return np.array([], dtype=np.intp)
    nmatch = np.sum(np.logical_and(cmp,x), axis=1)
    best = np.amax(nmatch)
    if best==0:
        # Sentinel that no score can equal, so the mask below selects nothing.
        best = -1
    if verbose:
        print("nmatch",nmatch, best, np.where(nmatch==best))
    sel = np.where(nmatch==best)[0]
    if all or best == -1:
        return sel
    selend = np.int32(sel.shape[0])
    return sel[rng.choice(selend)]

# NOTE(review): this print looks like a leftover scratch check -- confirm it can go.
print(np.arange(1))
#(choices,lens) = greedy_far(2,emb0)
# Offer the greedy-farthest picks to the simulated player.
options = univ[choices,:]
#print("options\n",options)
#print("snitch\n", univ[snitch,:])
# All best-matching options (indices into `options`, not into `univ`).
sel = vec_select(options)
print("sel",sel,"\n", options[sel,:])
# A single best-matching option, picked at random among the ties.
sel_just1 = vec_select(options,all=False)
print("sel_just1",sel_just1, options[sel_just1,:])

# Three hand-picked rows meant to share no factor with the snitch.
# NOTE(review): whether they really share none depends on which snitch was
# drawn; a non-empty `sel_none` means at least one factor did match -- confirm.
options = univ[[3,5,9],:]
#print("options\n",options)
#print("snitch\n", univ[snitch,:])
sel_none = vec_select(options)
print("sel_none",sel_none, options[sel_none,:])


[0]
sel [ 2 11] 
 [[1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0.]]
sel_just1 2 [1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0.]
sel_none [0 2] [[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]]
