### pin_mask demo
- We get a $1^{st}$ embedding (using random init)
- linearly transform the embedding so that the x values of the "good" and "bad" samples end up at -1 and +1, respectively
    - (in least-squares sense)
- then re-embed "pinning" those x values *exactly* at -1, +1
    - (could random-init by-hand, then fix x values for single pin_mask umap fit)
- umap applies no gradient to the pinned x values (but it does rescale the embedding)
- we determine the new linear rescaling
- and remap the embedding back to good|bad x-values -1|+1

### New
- We select a good/bad entry based on feature 0 "sepal length"
  for each iris species.  Lowest pinned to x=-1, highest pinned to x=+1
  
  
- still perhaps better to ROTATE (see Kabsch algorithm) the initial embedding
  such that good|bad points are aligned *towards* (-1,0)|(+1,0) idealized drag
  positions
   - then rescale and shift to put their x-centroids exactly at (-1,0)|(+1,0)
- Now we rotate, scale & shift

#### behavior is not *as expected*!
- There is still no *constraint* that keeps the x-values of unpinned points between (-1,+1)  


In [1]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Category10, Colorblind, Viridis
from bokeh.io import output_notebook
from bokeh.layouts import column
output_notebook()

import umap
import numpy as np
from sklearn.datasets import load_iris
import pandas as pd

In [2]:
# Load the iris dataset; later cells read its .data, .target, .target_names
# and .feature_names fields.
iris = load_iris()

In [3]:
# Quick look at the dataset: container type, shapes, first rows, class names
# and feature names.
print(type(iris.data))
print(iris.data.shape, iris.data[0:5,])
print(iris.target.shape, iris.target[0:5])
print(iris.target_names.size, iris.target_names)
print(iris.feature_names)
# The feature used to pick samples: feature 0 (sepal length).  The smallest
# value counts as "good", the largest as "bad".
feature_of_interest = 0
fi_name = iris.feature_names[feature_of_interest]
di = data_of_interest = iris.data[:, feature_of_interest]
if True:
    # This time, choose one good and one bad sample from each iris species.
    # NOTE: despite its name, nFeat is the number of good (and of bad) samples
    # selected; the plotting cells use it to build the "good"/"bad" labels.
    nFeat = iris.target_names.size
    good10 = np.zeros(nFeat, dtype=np.int32)
    bad10 = np.zeros(nFeat, dtype=np.int32)
    for t, name in enumerate(iris.target_names):
        print(t, name)
        # Row indices (into the full dataset) of the samples of species t.
        members = np.flatnonzero(iris.target == t)
        # argmin/argmax index into `members`, which maps back to dataset rows.
        good10[t] = members[np.argmin(di[members])]
        bad10[t] = members[np.argmax(di[members])]

    print("\nSelected shortest (good) and longest (bad)",
          fi_name, "of each iris species")
    print(fi_name, "good/bad values:")
    row_names = ["good", "bad"]
    col_names = iris.target_names
    # One column per species, sized from the selection itself rather than a
    # literal 3: row 0 holds the good values, row 1 the bad values.
    matrix = np.vstack((di[good10], di[bad10]))
    df = pd.DataFrame(matrix, columns=col_names, index=row_names)
    print(df)
    print("\n")


if False: # older case
    # Choose the nFeat smallest and nFeat largest samples over the whole
    # dataset, ignoring species.
    nFeat = 4
    print("feature_of_interest",feature_of_interest)
    best = np.argmin(data_of_interest)
    goods = np.argsort(data_of_interest)
    good10 = goods[0:nFeat]
    bad10 = goods[-nFeat:,]

print("good10",good10,"\ndata of goods:\n",iris.data[good10,])
print("bad10",bad10,  "\ndata of bads:\n",iris.data[bad10])
#

<class 'numpy.ndarray'>
(150, 4) [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
(150,) [0 0 0 0 0]
3 ['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
0 setosa
1 versicolor
2 virginica

Selected shortest (good) and longest (bad) sepal length (cm) of each iris species
sepal length (cm) good/bad values:
      setosa  versicolor  virginica
good     4.3         4.9        4.9
bad      5.8         7.0        7.9


good10 [ 13  57 106] 
data of goods:
 [[4.3 3.  1.1 0.1]
 [4.9 2.4 3.3 1. ]
 [4.9 2.5 4.5 1.7]]
bad10 [ 14  50 131] 
data of bads:
 [[5.8 4.  1.2 0.2]
 [7.  3.2 4.7 1.4]
 [7.9 3.8 6.4 2. ]]


In [4]:
# First embedding: UMAP started from a random layout, nothing pinned yet.
embedding = umap.UMAP(
    n_neighbors=50,
    learning_rate=0.5,
    random_state=12345,
    init="random",
    min_dist=0.001,
).fit_transform(iris.data)
print(embedding[:5])

[[6.943264 8.191586]
 [6.781186 9.38057 ]
 [7.418375 9.443502]
 [7.23914  9.680083]
 [7.12616  8.248806]]


In [5]:
#output_file("iris2a.html")

# Category names used for colouring: the species names followed by the two
# extra (fake) categories "good" and "bad".
targets = [str(d) for d in iris.target_names]
targets += ["good","bad"]
# One row per sample: embedding x/y plus the species name as its label.
source = ColumnDataSource(
    dict(
        x=[e[0] for e in embedding],
        y=[e[1] for e in embedding],
        #g=[i in good10 for e,i in enumerate(embedding) # ?
        #b=[i in bad10  for i in range(embedding.shape[0])] # equiv for bad ?
        label=[targets[d] for d in iris.target],
    )
)

# Colour mapper shared by both glyph layers, keyed on the "label" column.
cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

p1 = figure(title="Test UMAP on Iris dataset")
# All samples, coloured by species.
p1.circle(
    x="x",
    y="y",
    source=source,
    color={"field": "label", "transform": cmap},
    #legend_label="species",
    legend_group="label"
)

# Unfilled square outlines (fill_alpha=0.0) around the good/bad points,
# coloured by their (fake) "good"/"bad" category.
# gb stacks the good rows first, then the bad rows; gbcat labels them in the
# same order.
gb = np.vstack([embedding[good10,], embedding[bad10,]])
gbcat = np.hstack((np.repeat("good",nFeat), np.repeat("bad",nFeat)))
gbsource = ColumnDataSource( dict(
        x=[e[0] for e in gb],
        y=[e[1] for e in gb],
        label=gbcat,
    ))
p1.square(x="x", y="y", size=16, line_alpha=0.7, line_width=4, fill_alpha=0.0,
          source=gbsource, color={"field": "label", "transform": cmap},
          #legend_label="good/bad",
          legend_group="label",
)
#p1.add_layout(p1.legend[0], 'right') # outside, plot rectangular!
p1.legend.location = 'top_left'

show(p1)

In [6]:
# Embedding coordinates of the selected good and bad samples, before any
# pinning or transformation.
print("good10:\n", embedding[good10,])
print("bad10:\n",  embedding[bad10,])

good10:
 [[ 7.242771   9.893241 ]
 [-3.922997   1.2515678]
 [-4.067657   0.5437252]]
bad10:
 [[ 7.8313046  7.592354 ]
 [-4.7280955 -1.4341671]
 [-3.450067  -4.4144154]]


In [7]:
# Simulate drag'n'drop of goods to the left, bads to the right.

print("good10:\n",  embedding[good10,])
print("bad10:\n",   embedding[bad10,])
# Names of the available strategies for building the pinned initial layout;
# `method` indexes this list and selects which `if method==...` branch runs
# further down in this cell.
method_names=["force", "linear", "rot,scale,trans"]
method=2
#
# method 1: "best" linear transform of x-coords of embedding
#
# y = [e 1] @ [m c] st. e'[good10,][0] ~ -1 and e'[bad10,][0] ~ 1
def emb_linear(emb0, emb1, pt0, pt1, coord=0):
    """Fit ``m * v + c`` mapping one embedding coordinate onto two targets.

    Column ``coord`` of ``emb0`` should land near ``pt0[coord]`` and column
    ``coord`` of ``emb1`` near ``pt1[coord]``, in the least-squares sense.
    Intermediate values are printed for inspection.

    Parameters
    ----------
    emb0, emb1 : 2-D arrays, one embedded point per row.
    pt0, pt1 : indexable target points; only element ``coord`` is read.
    coord : which coordinate (column) to fit; defaults to 0 (x).

    Returns
    -------
    list ``[m, c]`` holding the fitted slope and offset.
    """
    # Observed coordinate values: the emb0 group first, then the emb1 group.
    observed = np.concatenate((emb0[:, coord], emb1[:, coord]))
    # Matching targets: pt0[coord] for each emb0 row, pt1[coord] for each emb1 row.
    wanted = np.concatenate((
        np.repeat(pt0[coord], emb0.shape[0]),
        np.repeat(pt1[coord], emb1.shape[0]),
    ))
    print(type(observed), type(wanted), observed.shape, wanted.shape)
    assert observed.size == wanted.size
    # Design matrix [value, 1] so the solution is (slope, offset).
    design = np.column_stack((observed, np.ones(len(observed))))
    print(type(design), design.shape, "\n")
    print("A", design)
    print("y", wanted)

    solution = np.linalg.lstsq(design, wanted, rcond=None)[0]
    m, c = solution
    print("m,c", np.round(m, 3), np.round(c, 3))
    print("fit", m * observed + c)
    return [m, c]

def emb_linear_apply(x, embedding):
    """Apply an ``[m, c]`` linear fit to a whole 2-D embedding.

    The x column becomes ``m * x + c``.  The y column is scaled by the same
    ``m`` (so relative distances are kept) and then shifted so that its mean
    is zero; the y shift itself is not considered meaningful.

    Parameters
    ----------
    x : sequence whose first two items are the slope ``m`` and offset ``c``.
    embedding : array of shape (N, 2).  It is NOT modified.

    Returns
    -------
    A new (N, 2) array with the transformed coordinates.
    """
    m, c = x[0], x[1]
    # Work on a copy: the caller's array must survive so the original layout
    # can still be compared with, or reused after, the transform.
    emb2 = np.array(embedding, copy=True)
    if not np.issubdtype(emb2.dtype, np.floating):
        # Integer input would silently truncate the scaled coordinates.
        emb2 = emb2.astype(float)
    emb2[:, 0] = m * emb2[:, 0] + c
    emb2[:, 1] = m * emb2[:, 1]
    # 'y' centroid --> zero: subtract the mean of the y column only.
    emb2[:, 1] -= emb2[:, 1].mean()
    return emb2

def opa(a, b):
    """Return rotation, scale, translation and rmsd that move `b` onto `a`.

    `a` and `b` are arrays of the same shape (N, D): N points in D dimensions,
    matched row by row.  Neither input is modified.

    Returns
    -------
    r : (D, D) proper rotation matrix (determinant +1).
    s : scalar scale, the ratio of the Frobenius norms of the centred `a`
        and centred `b`.
    t : (D,) translation.
    d : root-mean-square distance between `a` and the transformed `b`.

    To apply the recovered transform to other M D-dim vectors X, calculate
    `X.dot(r) * s + t`.
    """
    assert a.shape == b.shape
    a_centroid = a.mean(0)
    b_centroid = b.mean(0)
    # Centre both point sets and normalise them to unit Frobenius norm.
    A = a - a_centroid
    B = b - b_centroid
    a_norm = np.sum(A * A) ** 0.5
    b_norm = np.sum(B * B) ** 0.5
    A /= a_norm
    B /= b_norm
    # numpy's svd returns the third factor already transposed (Vh), with the
    # singular values in descending order.
    U, _, Vt = np.linalg.svd(np.dot(B.T, A))
    aR = np.dot(U, Vt)
    if np.linalg.det(aR) < 0:
        # Best orthogonal fit is a reflection.  Flip the direction with the
        # SMALLEST singular value (the last row of Vh) to get the closest
        # proper rotation; in 2-D this is row 1, but not in higher dimensions.
        Vt[-1] *= -1
        aR = np.dot(U, Vt)
    aS = a_norm / b_norm
    aT = a_centroid - b_centroid.dot(aR) * aS
    # rmsd of the full transform.  a - (b.dot(aR) * aS + aT) equals
    # a_norm * (A - B.dot(aR)), so the normalised residual is scaled back up.
    aD = a_norm * (np.sum((A - B.dot(aR)) ** 2) / len(a)) ** 0.5
    return aR, aS, aT, aD
        
def emb_opa(emb0, emb1, pt0, pt1, coord=0):
    """Find the rotate/scale/translate that moves two groups toward two targets.

    Every row of ``emb0`` is paired with target point ``pt0`` and every row of
    ``emb1`` with ``pt1``; the paired sets are handed to ``opa``.  Shapes and
    values are printed along the way.

    Parameters
    ----------
    emb0, emb1 : arrays of shape (n0, D) and (n1, D).
    pt0, pt1 : D-dimensional target points, one per group.
    coord : accepted but not used by this function.

    Returns
    -------
    Whatever ``opa(targets, points)`` returns.
    """
    print(len(pt0), emb0.shape)
    ndim = emb0.shape[1]
    # Both groups and both targets must share the same dimensionality.
    assert len(pt0) == ndim
    assert emb1.shape[1] == ndim
    assert len(pt1) == ndim
    points = np.vstack((emb0, emb1))
    print(points.shape, points)
    # Repeat each target once per row of its group, in the same order as
    # `points`.
    anchors = np.vstack((pt0, pt1))
    group_sizes = [emb0.shape[0], emb1.shape[0]]
    goals = np.repeat(anchors, group_sizes, axis=0)
    print(goals.shape, goals)

    print(type(points), type(goals), points.shape, goals.shape)
    assert points.size == goals.size
    return opa(goals, points)
def emb_opa_apply(x, embedding):
    """Apply a rotate/scale/translate transform to every row of `embedding`.

    `x` is indexable as ``[r, s, t, ...]`` (rotation, scale, translation);
    anything after the third item is ignored.  Returns
    ``embedding.dot(r) * s + t``.
    """
    rotation = x[0]
    scale = x[1]
    shift = x[2]
    rotated = embedding.dot(rotation)
    return rotated * scale + shift

#
# method 0: naive, brute force
#
if method==0:
    # Overwrite the x values of the good/bad points directly.  Work on a copy
    # so that `embedding` itself keeps the original layout.
    emb2 = embedding.copy()
    emb2[good10,0] = -1.0
    emb2[bad10,0] = +1.0
    # --- without clamping, we totally lose the "init" state

#
# method 1: "best" linear transform of x-coords of embedding
#
if method==1:
    print("good10:\n",  embedding[good10,])
    print("bad10:\n",  embedding[bad10,])
    x = emb_linear( embedding[good10,], embedding[bad10,], [-1,0], [1,0] )
    print("x", x)
    # Pass a copy so `embedding` is preserved whether or not
    # emb_linear_apply works in place.
    emb2 = emb_linear_apply( x, embedding.copy())

#
# method 2: "best" rotate, scale and translate
#
if method==2:
    x = emb_opa( embedding[good10,], embedding[bad10,], [-1,0], [1,0] )
    print("x", x)
    emb2 = emb_opa_apply( x, embedding)

# Report which strategy ran and where the good/bad points ended up.
print("UMAP pinning init method", method_names[method])
print("good10:\n",  emb2[good10,])
print("bad10:\n",  emb2[bad10,])
#embedding = emb2

good10:
 [[ 7.242771   9.893241 ]
 [-3.922997   1.2515678]
 [-4.067657   0.5437252]]
bad10:
 [[ 7.8313046  7.592354 ]
 [-4.7280955 -1.4341671]
 [-3.450067  -4.4144154]]
2 (3, 2)
(6, 2) [[ 7.242771   9.893241 ]
 [-3.922997   1.2515678]
 [-4.067657   0.5437252]
 [ 7.8313046  7.592354 ]
 [-4.7280955 -1.4341671]
 [-3.450067  -4.4144154]]
(6, 2) [[-1  0]
 [-1  0]
 [-1  0]
 [ 1  0]
 [ 1  0]
 [ 1  0]]
<class 'numpy.ndarray'> <class 'numpy.ndarray'> (6, 2) (6, 2)
x (array([[ 0.04029259,  0.99918792],
       [-0.99918792,  0.04029259]]), 0.13511954908302604, array([0.30324218, 0.01244518]), 1.245682673837354)
UMAP pinning init method rot,scale,trans
good10:
 [[-0.99301058  1.04415235]
 [ 0.11291019 -0.51038402]
 [ 0.20768833 -0.53376826]]
bad10:
 [[-0.67916406  1.11108338]
 [ 0.47112758 -0.63370222]
 [ 0.88044835 -0.47738123]]


In [8]:
#plot emb2

# Category names used for colouring: the species names followed by the two
# extra (fake) categories "good" and "bad".
targets = [str(d) for d in iris.target_names]
targets += ["good","bad"]
print([targets[d] for d in iris.target[0:5]])
# One row per sample: transformed-embedding (emb2) x/y plus the species label.
source = ColumnDataSource(
    dict(
        x=[e[0] for e in emb2],
        y=[e[1] for e in emb2],
        label=[targets[d] for d in iris.target],
    )
)

# Colour mapper shared by both glyph layers, keyed on the "label" column.
cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

# The title names the init strategy selected by `method`.
p2 = figure(title=("Iris "+method_names[method]+" drag'n'drop init"))
# All samples, coloured by species.
p2.circle(
    x="x",
    y="y",
    source=source,
    color={"field": "label", "transform": cmap},
    #legend_label="species",
    legend_group="label"
)
# Unfilled square outlines (fill_alpha=0.0) around the good/bad points,
# coloured by their (fake) "good"/"bad" category.
# gb stacks the good rows first, then the bad rows; gbcat labels them in the
# same order.
gb = np.vstack([emb2[good10,], emb2[bad10,]])
gbcat = np.hstack((np.repeat("good",nFeat), np.repeat("bad",nFeat)))
gbsource = ColumnDataSource( dict(
        x=[e[0] for e in gb],
        y=[e[1] for e in gb],
        label=gbcat,
    ))
#cmap = CategoricalColorMapper(factors=["good","bad"], palette=Category10[10])
p2.square(x="x", y="y", size=16, line_alpha=0.7, line_width=4, fill_alpha=0.0,
          source=gbsource, color={"field": "label", "transform": cmap},
          #legend_label="dragged",
          legend_group="label"
)

show(p2)

['setosa', 'setosa', 'setosa', 'setosa', 'setosa']


In [20]:
# Pinned UMAP (and undo UMAP internal rescaling)
# Start from a copy of emb2 so that forcing the pinned x values below does not
# also overwrite emb2.
emb3 = emb2.copy()
emb3[good10,0] = -1
emb3[bad10,0]  = +1
print("embedding.shape",emb3.shape)
print("good10:\n", emb3[good10,])
print("bad10:\n",  emb3[bad10,])
# Re-embed just with new init conditions.
# pin_mask has one entry per embedding coordinate: 1.0 everywhere except the
# x coordinate of the good/bad points, which is 0.0.
pin_mask = np.ones(emb3.shape, dtype=np.float32) # todo: allow float32
pin_mask[good10,0] = 0.0 # zero gradient, so zero movement of init embedding
pin_mask[bad10,0] = 0.0
print("pin_mask.shape",pin_mask.shape)
print("pin_mask good10:\n", pin_mask[good10,])
print("pin_mask bad10:\n",  pin_mask[bad10,])
#   NOTE: should have pin_mask in UMAP constructor !
embedder = umap.UMAP(
    n_neighbors=50, learning_rate=0.5, random_state=12346, init=emb3,
    negative_sample_rate=5, repulsion_strength=0.40,
    min_dist=0.001, spread=3.0, #a=0.1, b=0.9,
)
emb3 = embedder.fit_transform(iris.data, pin_mask=pin_mask)
print(emb3[0:10,])
print("good10:\n", emb3[good10,])
print("bad10:\n",  emb3[bad10,])

# UMAP has rescaled things "behind our back".
# Probably UMAP should avoid the rescale if pin_mask is not None
# (or maybe if "enough" points have been pinned?)

# Copy, so the re-projection below never writes into the raw UMAP output.
emb4 = emb3.copy()
if False: # old code (this coord-rescale method is actually what we want to do.)
    # Oh-oh.  umap is doing some internal rescaling -- let's undo that.
    goodx = emb3[good10[0],0]
    badx  = emb3[bad10[0],0]
    print("umap --> good,bad=",goodx,badx)
    x = np.array([goodx,badx])
    A = np.array([[goodx,1.0],[badx,1.0]])
    y = np.array([-1.0,1.0])
    print("A\n",A,"\ny\n",y)
    m, c = np.linalg.lstsq(A, y, rcond=None)[0]
    print("m,c",np.round(m,3),np.round(c,3))
    print("fit",m*x+c)
    # scaling factor applies to BOTH x and y
    emb4[:,0] = m*emb3[:,0] + c
    emb4[:,1] = m*emb3[:,1]
    # 'y' centroid --> zero: subtract the mean of the rescaled y column.
    emb4[:,1] -= np.average(emb4[:,1])

if True: # new: support several "re-project" methods
    # This reproject should ONLY SCALE
    rescale = 1
    if rescale==0:
        emb4[good10,0] = -1.0
        emb4[bad10,0] = +1.0
    if rescale==1: # use x-values to determine space scalings
        print("good10:\n",  emb3[good10,])
        print("bad10:\n",  emb3[bad10,])
        x = emb_linear( emb3[good10,], emb3[bad10,], [-1,0], [1,0] )
        print("x", x)
        # Pass a copy so emb3 is preserved whether or not emb_linear_apply
        # works in place.
        emb4 = emb_linear_apply( x, emb3.copy())
    if rescale==2:
        # rotate/scale/translate WILL NOT RE-PIN the x-values as desired!
        x = emb_opa( emb3[good10,], emb3[bad10,], [-1,0], [1,0] )
        print("x", x)
        emb4 = emb_opa_apply( x, emb3)
print("re-shift good10:\n", emb4[good10,])
print("re-shift bad10:\n",  emb4[bad10,])


embedding.shape (150, 2)
good10:
 [[-1.          1.04415235]
 [-1.         -0.51038402]
 [-1.         -0.53376826]]
bad10:
 [[ 1.          1.11108338]
 [ 1.         -0.63370222]
 [ 1.         -0.47738123]]
pin_mask.shape (150, 2)
pin_mask good10:
 [[0. 1.]
 [0. 1.]
 [0. 1.]]
pin_mask bad10:
 [[0. 1.]
 [0. 1.]
 [0. 1.]]
X.shape (150, 4)
pin_mask.shape (150, 2)
[[ 0.9433661  16.705515  ]
 [ 1.2101948  14.155561  ]
 [ 0.46222728 14.784152  ]
 [ 0.67124796 14.323999  ]
 [ 1.4296885  16.536469  ]
 [ 2.8384988  16.816952  ]
 [ 0.46702704 15.095856  ]
 [ 1.1209314  16.002743  ]
 [ 0.27782184 14.245778  ]
 [ 1.2493968  14.488393  ]]
good10:
 [[ 0.       14.581481]
 [ 0.       -8.128715]
 [ 0.       -7.381854]]
bad10:
 [[10.        16.818481 ]
 [10.        -7.398991 ]
 [10.        -3.5377245]]
good10:
 [[ 0.       14.581481]
 [ 0.       -8.128715]
 [ 0.       -7.381854]]
bad10:
 [[10.        16.818481 ]
 [10.        -7.398991 ]
 [10.        -3.5377245]]
<class 'numpy.ndarray'> <class 'numpy.nda

In [21]:
#output_file("iris4.html")

# Category names used for colouring: the species names followed by the two
# extra (fake) categories "good" and "bad".
targets = [str(d) for d in iris.target_names]
targets += ["good","bad"]
print([targets[d] for d in iris.target[0:5]])
# One row per sample: re-projected pinned embedding (emb4) x/y plus the
# species label.
source = ColumnDataSource(
    dict(
        x=[e[0] for e in emb4],
        y=[e[1] for e in emb4],
        label=[targets[d] for d in iris.target],
    )
)

# Colour mapper shared by both glyph layers, keyed on the "label" column.
cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

p3 = figure(title="Iris UMAP post drag'n'drop")
# All samples, coloured by species.
p3.circle(
    x="x",
    y="y",
    source=source,
    color={"field": "label", "transform": cmap},
    #legend_label="species",
    legend_group="label"
)
# Unfilled square outlines (fill_alpha=0.0) around the good/bad points,
# coloured by their (fake) "good"/"bad" category.
# gb stacks the good rows first, then the bad rows; gbcat labels them in the
# same order.
gb = np.vstack([emb4[good10,], emb4[bad10,]])
gbcat = np.hstack((np.repeat("good",nFeat), np.repeat("bad",nFeat)))
gbsource = ColumnDataSource( dict(
        x=[e[0] for e in gb],
        y=[e[1] for e in gb],
        label=gbcat,
    ))
#cmap = CategoricalColorMapper(factors=["good","bad"], palette=Category10[10])
p3.square(x="x", y="y", size=16, line_alpha=0.7, line_width=4, fill_alpha=0.0,
          source=gbsource, color={"field": "label", "transform": cmap},
          #legend_label="dragged",
          legend_group="label"
)

output_notebook()
show(p3)
#output_file("iris4.html")
#show(column(p1,p2,p3))

['setosa', 'setosa', 'setosa', 'setosa', 'setosa']


In [None]:
# Scratch example: least-squares fit of a line y ~ m*x + c to four points.
x = np.array([0, 1, 2, 3])
y = np.array([-1, 0.2, 0.9, 2.1])
# Design matrix: the x values beside a column of ones (for the intercept c).
A = np.column_stack((x, np.ones(len(x))))
print("A\n", A, "\ny\n", y)
m, c = np.linalg.lstsq(A, y, rcond=None)[0]
print("m,c", np.round(m, 3), np.round(c, 3))
# Fitted y value at each x.
print("fit", m * x + c)

In [None]:
# Scratch example: appending items to a plain list ...
a = [1, 2, 3]
a += [4, 5]
print(a)
# ... versus building a longer NumPy array with hstack.
a = np.array([1, 2, 3])
a = np.hstack((a, [4, 5]))
print(a)