### pin_mask demo
- We get a $1^{st}$ embedding (using random init)
- linearly transform the embedding so that the x values of the "good" and "bad" samples end up at -1 and +1
    - (in the least-squares sense)
- then re-embed "pinning" those x values *exactly* at -1, +1
    - (could random-init by-hand, then fix x values for single pin_mask umap fit)
- umap applies no gradient to the pinned x values (but does rescale the embedding)
- we determine the new linear rescaling
- and remap the embedding back to good|bad x-values -1|+1

### New
- We select a good/bad entry based on feature 0 "sepal length"
  for each iris species.  Lowest pinned to x=-1, highest pinned to x=+1
  
  
- still perhaps better to ROTATE (see Kabsch algorithm) the initial embedding
  such that good|bad points are aligned *towards* (-1,0)|(+1,0) idealized drag
  positions
   - then rescale and shift to put their x-centroids exactly at (-1,0)|(+1,0)

In [1]:
import calculate_rmsd

from bokeh.plotting import figure, output_file, show
from bokeh.models import CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Category10, Colorblind, Viridis
from bokeh.io import output_notebook
from bokeh.layouts import column
output_notebook()

import umap
import numpy as np
from sklearn.datasets import load_iris
import pandas as pd

In [2]:
# Load the iris dataset object; later cells read its .data, .target,
# .target_names and .feature_names attributes.
iris = load_iris()

In [3]:
# Quick look at what load_iris() returned.
print(type(iris.data))
print(iris.data.shape, iris.data[0:5,])
print(iris.target.shape, iris.target[0:5])
print(iris.target_names.size, iris.target_names)
print(iris.feature_names)

# We want to single out samples whose feature 0 (sepal length) is really
# small ("good") or really large ("bad").
feature_of_interest = 0
fi_name = iris.feature_names[feature_of_interest]
di = data_of_interest = iris.data[:,feature_of_interest]

# True  -> one good/bad sample per iris species (current behaviour)
# False -> older variant: the 4 globally smallest / largest samples
PICK_PER_SPECIES = True

if PICK_PER_SPECIES:
    # NOTE: despite its name, nFeat is the number of good (and of bad) samples
    # selected -- here one per species.  Later cells rely on this name.
    nFeat = iris.target_names.size
    good10 = np.zeros(nFeat,dtype=np.int32)
    bad10 = np.zeros(nFeat,dtype=np.int32)
    sample_ids = np.arange(di.shape[0])
    for t,name in enumerate(iris.target_names):
        print(t,name)
        # row indices (into the full dataset) of the samples of species t
        members = sample_ids[iris.target==t]
        # argmin/argmax index into the species subset; map back via `members`
        good10[t] = members[np.argmin(di[members])]
        bad10[t] = members[np.argmax(di[members])]

    print("Selected shortest (good) and longest (bad)",
          fi_name, "of each iris species")
    print(fi_name, "good/bad values:")
    row_names = ["good", "bad"]
    col_names = iris.target_names
    # Sized from the data instead of a hard-coded (2,3), so the summary table
    # still works if the number of classes changes.
    matrix = np.zeros((len(row_names), nFeat))
    matrix[0] = di[good10]
    matrix[1] = di[bad10]
    df = pd.DataFrame(matrix, columns=col_names, index=row_names)
    print(df)
else:
    # older case: globally smallest / largest nFeat samples, ignoring species
    nFeat = 4
    print("feature_of_interest",feature_of_interest)
    goods = np.argsort(data_of_interest)
    good10 = goods[0:nFeat]
    bad10 = goods[-nFeat:,]

print("good10",good10,"\ndata of goods:\n",iris.data[good10,])
print("bad10",bad10,  "\ndata of bads:\n",iris.data[bad10])
#

<class 'numpy.ndarray'>
(150, 4) [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
(150,) [0 0 0 0 0]
3 ['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
0 setosa
1 versicolor
2 virginica
Selected shortest (good) and longest (bad) sepal length (cm) of each iris species
sepal length (cm) good/bad values:
      setosa  versicolor  virginica
good     4.3         4.9        4.9
bad      5.8         7.0        7.9
good10 [ 13  57 106] 
data of goods:
 [[4.3 3.  1.1 0.1]
 [4.9 2.4 3.3 1. ]
 [4.9 2.5 4.5 1.7]]
bad10 [ 14  50 131] 
data of bads:
 [[5.8 4.  1.2 0.2]
 [7.  3.2 4.7 1.4]
 [7.9 3.8 6.4 2. ]]


In [4]:
# First-pass embedding of the raw iris features, starting from a random init.
first_pass = umap.UMAP(
    n_neighbors=50,
    learning_rate=0.5,
    random_state=12345,
    init="random",
    min_dist=0.001,
)
embedding = first_pass.fit_transform(iris.data)
print(embedding[:5])

umap-->optimize_layout_euclidean
TRIAL: opt+mask+version 1
constraints
[[14.760284   2.7686224]
 [12.777846   3.5021129]
 [13.122247   2.6024058]
 [12.780585   2.8045743]
 [14.829005   2.460272 ]]


In [5]:
#output_file("iris2a.html")

# Legend categories: the species names followed by the two fake
# "good" / "bad" categories used for the highlight boxes.
targets = list(map(str, iris.target_names)) + ["good", "bad"]
cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

# One circle per sample, coloured by species.
source = ColumnDataSource(
    dict(
        x=list(embedding[:, 0]),
        y=list(embedding[:, 1]),
        label=[targets[d] for d in iris.target],
    )
)

p1 = figure(title="Test UMAP on Iris dataset")
p1.circle(
    x="x",
    y="y",
    source=source,
    color=dict(field="label", transform=cmap),
    legend_group="label",
)

# Hollow squares around the selected good/bad points, labelled with the
# (fake) "good"/"bad" category so they get their own colours and legend rows.
gb = np.vstack((embedding[good10], embedding[bad10]))
gbcat = np.repeat(["good", "bad"], nFeat)
gbsource = ColumnDataSource(
    dict(
        x=list(gb[:, 0]),
        y=list(gb[:, 1]),
        label=gbcat,
    )
)
p1.square(
    x="x",
    y="y",
    size=16,
    line_alpha=0.7,
    line_width=4,
    fill_alpha=0.0,
    source=gbsource,
    color=dict(field="label", transform=cmap),
    legend_group="label",
)
p1.legend.location = 'top_left'

show(p1)

In [6]:
# Show where the selected good/bad samples landed in the first embedding.
for heading, picked in (("good10:\n", good10), ("bad10:\n", bad10)):
    print(heading, embedding[picked])

good10:
 [[12.413155    2.4463196 ]
 [-7.3454866   0.21340056]
 [-5.5902033   0.16069451]]
bad10:
 [[16.295218   3.0028875]
 [-5.13806    3.4190023]
 [-1.0692389  7.0088596]]


In [7]:
# simulate drag'n'drop of goods to left, bads to right

#
# method 0: naive, brute force (disabled)
#
if False:
    # Overwrite the x coordinate of the selected samples directly.
    embedding[good10,0] = -1.0
    embedding[bad10,0] = +1.0
    # --- without clamping, we totally lose the "init" state

#
# method 1: "best" linear transform of x-coords of embedding (disabled)
#
if False:
    #  On harder input data, this FAILS to "move good to left and bad to right"
    # Task: find a m,c linear transform ONLY of 'x' values
    # s.t.  y = [e 1] @ [m c] st. e'[good10,][0] ~ -1 and e'[bad10,][0] ~ 1
    e = np.hstack([embedding[good10,0], embedding[bad10,0]])
    print("e", e.shape, e)
    # hoping for first half ~ -1, rest ~ +1
    y = np.hstack((np.repeat(-1,nFeat), np.repeat(+1,nFeat)))
    print("y", y.shape, y)
    print(type(e), type(y), e.shape, y.shape)
    assert e.size == y.size
    A = np.vstack([e, np.ones(len(e))]).T  # add a one's column
    print(type(A), A.shape, "\n")

    # least-squares slope m and intercept c of  y ~ m*e + c
    m,c = np.linalg.lstsq(A, y, rcond=None)[0]
    print("m,c",np.round(m,3),np.round(c,3))
    print("fit",m*e+c)
    # re-embed all data w/ "best" linear transform of 'x' values
    # rescale 'y' too, (keep rel. distances, don't care about y shift)
    embedding[:,0] = m * embedding[:,0] + c
    embedding[:,1] = m * embedding[:,1]
    # 'y' centroid --> zero.  Must be the mean of the y column only;
    # np.average(embedding, axis=1) would subtract each row's own (x+y)/2.
    embedding[:,1] -= np.average(embedding[:,1])

#
# method 2: "best" rotate, scale and translate
#
def opa(a, b):
    """Return rotation, scale, translation and rmsd of shifting `b` to concord with `a`.

    `a` and `b` are two sets of N D-dimensional points with matching shape
    (N, D); row i of `b` corresponds to row i of `a`.

    Returns the tuple ``(r, s, t, d)``:

    r : (D, D) proper rotation (determinant +1), applied on the right.
    s : scalar scale, the ratio of the centred Frobenius norms of `a` and `b`.
    t : (D,) translation.
    d : root-mean-square distance between `a` and the fully transformed `b`.

    To apply the recovered transform to other M D-dim vectors X, calculate
    `X.dot(r) * s + t`.

    Raises ValueError if the shapes do not match, the inputs are not 2-D, or
    either point set has zero spread (all points identical), in which case no
    scale can be determined.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.ndim != 2 or a.shape != b.shape:
        raise ValueError(
            "opa expects two (N, D) arrays of equal shape, got %s and %s"
            % (a.shape, b.shape)
        )

    # Centre both point sets on their centroids (new arrays; inputs untouched).
    aT = a.mean(0)
    bT = b.mean(0)
    A = a - aT
    B = b - bT

    # Normalise each set to unit Frobenius norm.
    aS = np.sum(A * A) ** .5
    bS = np.sum(B * B) ** .5
    if aS == 0 or bS == 0:
        raise ValueError("opa needs point sets with non-zero spread")
    A = A / aS
    B = B / bS

    # Rotation from the SVD of the cross-covariance.
    U, _, Vt = np.linalg.svd(np.dot(B.T, A))
    aR = np.dot(U, Vt)
    if np.linalg.det(aR) < 0:
        # Improper rotation (reflection): flip the singular vector that belongs
        # to the SMALLEST singular value, i.e. the LAST row of Vt.  Flipping
        # row 1 is only equivalent when D == 2.
        Vt[-1] *= -1
        aR = np.dot(U, Vt)

    scale = aS / bS
    shift = aT - bT.dot(aR) * scale

    # rmsd between the normalised, rotated sets, scaled back to `a`'s units;
    # equal to sqrt(((a - (b.dot(aR) * scale + shift))**2).sum() / len(a)).
    aD = (np.sum((A - B.dot(aR)) ** 2) / len(a)) ** .5
    aD *= aS
    return aR, scale, shift, aD
        
# Method 2 applied: rotate/scale/translate the whole embedding so the good
# points move towards x=-1 and the bad points towards x=+1 (y target = 0).
# NOTE: this cell overwrites `embedding`, so it is not idempotent on re-run.

# Embedding axis that carries the good/bad separation.  This is an axis of the
# 2-D embedding, NOT an iris feature index: indexing with feature_of_interest
# only worked because that happens to be 0.
PIN_AXIS = 0

e = np.vstack([embedding[good10,], embedding[bad10,]])
print(e.shape, e)
# idealised targets: first half at -1, rest at +1 along PIN_AXIS, 0 elsewhere
y = np.zeros_like(e)
y[:,PIN_AXIS] = np.repeat([-1, +1], nFeat)
print("y[,feat]",y[:,PIN_AXIS])
print(type(e), type(y), e.shape, y.shape)
assert e.shape == y.shape
# fit the transform on the selected points only, then apply it to every point
r,s,t,d = opa(y,e)
embedding2 = embedding.dot(r) * s + t
embedding = embedding2
print("good10:\n",  embedding[good10,])
print("bad10:\n",   embedding[bad10,])

(6, 2) [[12.413155    2.4463196 ]
 [-7.3454866   0.21340056]
 [-5.5902033   0.16069451]
 [16.295218    3.0028875 ]
 [-5.13806     3.4190023 ]
 [-1.0692389   7.0088596 ]]
y[,feat] [-1. -1. -1.  1.  1.  1.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'> (6, 2) (6, 2)
good10:
 [[ 0.7803862  -0.8191435 ]
 [-0.84529793  0.47639716]
 [-0.71943796  0.34274626]]
bad10:
 [[ 1.1085033  -1.0649713 ]
 [-0.44515228  0.5501872 ]
 [ 0.12099853  0.5147841 ]]


In [8]:
# Legend categories: species names plus the fake "good"/"bad" categories.
targets = list(map(str, iris.target_names)) + ["good", "bad"]
print([targets[d] for d in iris.target[:5]])
cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

# One circle per sample of the transformed embedding, coloured by species.
source = ColumnDataSource(
    dict(
        x=list(embedding[:, 0]),
        y=list(embedding[:, 1]),
        label=[targets[d] for d in iris.target],
    )
)

p2 = figure(title="Iris linear drag'n'drop mods")
p2.circle(
    x="x",
    y="y",
    source=source,
    color=dict(field="label", transform=cmap),
    legend_group="label",
)

# Hollow squares around the selected good/bad points, using the (fake)
# "good"/"bad" category for colour and legend.
gb = np.vstack((embedding[good10], embedding[bad10]))
gbcat = np.repeat(["good", "bad"], nFeat)
gbsource = ColumnDataSource(
    dict(
        x=list(gb[:, 0]),
        y=list(gb[:, 1]),
        label=gbcat,
    )
)
p2.square(
    x="x",
    y="y",
    size=16,
    line_alpha=0.7,
    line_width=4,
    fill_alpha=0.0,
    source=gbsource,
    color=dict(field="label", transform=cmap),
    legend_group="label",
)

show(p2)

['setosa', 'setosa', 'setosa', 'setosa', 'setosa']


In [9]:
# NOTE: this cell only holds notes and should really be a markdown cell.
# The x-only "linear" transform (method 1) gave poor results.
# We probably want an RMSD-style superposition (cf. calculate_rmsd),
# extended to also allow an arbitrary scale factor:
# translate to origin, rotate, scale (then rotate again?)

In [10]:
# Snap the x coordinate of the selected samples exactly onto -1 (good) / +1 (bad).
embedding[good10,0] = -1
embedding[bad10,0]  = +1
print("embedding.shape",embedding.shape)
print("good10:\n", embedding[good10,])
print("bad10:\n",  embedding[bad10,])

# re-embed just with new init conditions
# pin_mask has the embedding's shape: 1.0 everywhere except 0.0 on the
# x coordinate of the pinned samples.
pin_mask = np.ones(embedding.shape, dtype=np.float32) # todo: allow float32
pin_mask[good10,0] = 0.0 # zero gradient, so zero movement of init embedding
pin_mask[bad10,0] = 0.0
print("pin_mask.shape",pin_mask.shape)
print("pin_mask good10:\n", pin_mask[good10,])
print("pin_mask bad10:\n",  pin_mask[bad10,])

#   NOTE: should have pin_mask in UMAP constructor !
# NOTE(review): fit_transform(pin_mask=...) is presumably provided by a
# modified umap build -- confirm against the installed package.
embedder = umap.UMAP(
    n_neighbors=50, learning_rate=0.5, random_state=12346, init=embedding,
    negative_sample_rate=5, repulsion_strength=0.40,
    min_dist=0.001, spread=3.0, #a=0.1, b=0.9,
)
embedding = embedder.fit_transform(iris.data, pin_mask=pin_mask)
print(embedding[0:10,])
print("good10:\n", embedding[good10,])
print("bad10:\n",  embedding[bad10,])

# Oh-oh.  umap is doing some internal rescaling -- let's undo that.
# Solve m*x + c for the line through (goodx, -1) and (badx, +1), using the
# first good and first bad sample as representatives.
goodx = embedding[good10[0],0]
badx  = embedding[bad10[0],0]
print("umap --> good,bad=",goodx,badx)
x = np.array([goodx,badx])
A = np.array([[goodx,1.0],[badx,1.0]])
y = np.array([-1.0,1.0])
print("A\n",A,"\ny\n",y)
m, c = np.linalg.lstsq(A, y, rcond=None)[0]
print("m,c",np.round(m,3),np.round(c,3))
print("fit",m*x+c)
# scaling factor applies to BOTH x and y
embedding[:,0] = m*embedding[:,0] + c
embedding[:,1] = m*embedding[:,1]
# 'y' centroid --> zero.  Use the mean of the y column only; the previous
# np.average(embedding, axis=1) subtracted each row's own (x+y)/2 instead.
embedding[:,1] -= np.average(embedding[:,1])

embedding.shape (150, 2)
good10:
 [[-1.         -0.8191435 ]
 [-1.          0.47639716]
 [-1.          0.34274626]]
bad10:
 [[ 1.        -1.0649713]
 [ 1.         0.5501872]
 [ 1.         0.5147841]]
pin_mask.shape (150, 2)
pin_mask good10:
 [[0. 1.]
 [0. 1.]
 [0. 1.]]
pin_mask bad10:
 [[0. 1.]
 [0. 1.]
 [0. 1.]]
X.shape (150, 4)
pin_mask.shape (150, 2)
TRIAL: opt+mask+version 1
pin_mask
sample 13 pin head[ 0 ] begins at -1.0
sample 14 pin head[ 0 ] begins at 1.0
sample 50 pin head[ 0 ] begins at 1.0
sample 57 pin head[ 0 ] begins at -1.0
sample 106 pin head[ 0 ] begins at -1.0
sample 131 pin head[ 0 ] begins at 1.0
[[ -0.60654604 -13.212102  ]
 [ -2.7192879  -14.408144  ]
 [ -1.7846605  -14.635337  ]
 [ -2.2235272  -14.771394  ]
 [ -0.45547208 -13.350984  ]
 [  0.4202119  -12.577012  ]
 [ -1.467135   -14.478303  ]
 [ -1.1410079  -13.44253   ]
 [ -1.937371   -15.152409  ]
 [ -2.4106376  -14.1264305 ]]
good10:
 [[ -1.       -15.041074]
 [ -1.         4.906268]
 [ -1.         5.658761]]


In [11]:
#output_file("iris3.html")

# Legend categories: species names plus the fake "good"/"bad" categories.
targets = list(map(str, iris.target_names)) + ["good", "bad"]
print([targets[d] for d in iris.target[:5]])
cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

# One circle per sample of the re-embedded data, coloured by species.
source = ColumnDataSource(
    dict(
        x=list(embedding[:, 0]),
        y=list(embedding[:, 1]),
        label=[targets[d] for d in iris.target],
    )
)

p3 = figure(title="Iris UMAP post drag'n'drop")
p3.circle(
    x="x",
    y="y",
    source=source,
    color=dict(field="label", transform=cmap),
    legend_group="label",
)

# Hollow squares around the pinned good/bad points, using the (fake)
# "good"/"bad" category for colour and legend.
gb = np.vstack((embedding[good10], embedding[bad10]))
gbcat = np.repeat(["good", "bad"], nFeat)
gbsource = ColumnDataSource(
    dict(
        x=list(gb[:, 0]),
        y=list(gb[:, 1]),
        label=gbcat,
    )
)
p3.square(
    x="x",
    y="y",
    size=16,
    line_alpha=0.7,
    line_width=4,
    fill_alpha=0.0,
    source=gbsource,
    color=dict(field="label", transform=cmap),
    legend_group="label",
)

# Show the final plot inline, then send all three plots, stacked, to an
# HTML file.
output_notebook()
show(p3)
output_file("iris3.html")
show(column(p1,p2,p3))

['setosa', 'setosa', 'setosa', 'setosa', 'setosa']


In [12]:
# Stand-alone sanity check of np.linalg.lstsq on a straight-line fit
# y ~ m*x + c through four points.
x = np.array([0, 1, 2, 3])
y = np.array([-1, 0.2, 0.9, 2.1])
ones = np.ones(len(x))
A = np.vstack([x, ones]).T  # design matrix: columns are [x, 1]
print("A\n",A,"\ny\n",y)
solution = np.linalg.lstsq(A, y, rcond=None)
m, c = solution[0]  # first element holds the least-squares coefficients
print("m,c",np.round(m,3),np.round(c,3))
print("fit",m*x+c)

A
 [[0. 1.]
 [1. 1.]
 [2. 1.]
 [3. 1.]] 
y
 [-1.   0.2  0.9  2.1]
m,c 1.0 -0.95
fit [-0.95  0.05  1.05  2.05]


In [13]:
# Scratch check: in-place list concatenation (as used to build `targets`).
a = [1, 2, 3]
a.extend([4, 5])
a

[1, 2, 3, 4, 5]