### pin_mask demo
- We get a $1^{st}$ embedding (using random init)
- linearly transform so "good" and "bad" samples so their x values end up at [-1,+1]
- then re-embed "pinning" those x values
    - (could random-init by-hand, then fix x values for single pin_mask umap fit)
- umap applies no gradient (but does apply rescaling)
- we determine the new linear rescaling
- and remap the embedding back to good|bad x-values -1|+1

### Next (other notebooks)
- Kinda' works, but perhaps want additional constraints, like
  clipping ALL x values to range -1,+1 ?
   - Alt. select the lowest|highest feature *of each species* (target)
     and add one each to good|bad sets  (should look way different)
- perhaps better to ROTATE (see Kabsch algorithm)
  such that good|bad points are aligned *towards* (-1,0)|(+1,0)
   - then rescale and shift to put their x-centroids exactly at (-1,0)|(+1,0)

In [1]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Category10
from bokeh.io import output_notebook
from bokeh.layouts import column
output_notebook()

import umap
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()

In [3]:
print(type(iris.data))
print(iris.data[0:10,])
print(iris.target[0:10])
print(iris.target_names)
print(iris.feature_names)
# I'm interested in feature 0 (sepal length) really small.
# choose 2 "interesting" examples and 2 uninteresting
feature_of_interest = 0
nFeat = 4
print("feature_of_interest",feature_of_interest)
data_of_interest = iris.data[:,feature_of_interest]
import numpy as np
best = np.argmin(data_of_interest)
#good3 = np.argpartition(iris.data[:,0], 3)
#print("good",good, "good3",good3)
#print(iris.data[good3,])
goods = np.argsort(data_of_interest)
good10 = goods[0:nFeat]
print("good10",good10,
      "\ndata of goods:\n",iris.data[good10,])
bad10 = goods[-nFeat:,]
print("bad10",bad10,
      "\ndata of bads:\n",iris.data[bad10])
#

<class 'numpy.ndarray'>
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]
[0 0 0 0 0 0 0 0 0 0]
['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
feature_of_interest 0
good10 [13 42 38  8] 
data of goods:
 [[4.3 3.  1.1 0.1]
 [4.4 3.2 1.3 0.2]
 [4.4 3.  1.3 0.2]
 [4.4 2.9 1.4 0.2]]
bad10 [118 117 135 131] 
data of bads:
 [[7.7 2.6 6.9 2.3]
 [7.7 3.8 6.7 2.2]
 [7.7 3.  6.1 2.3]
 [7.9 3.8 6.4 2. ]]


In [4]:
embedding = umap.UMAP(
    n_neighbors=50, learning_rate=0.5, init="random", min_dist=0.001
).fit_transform(iris.data)
print(embedding[0:5,])

[[9.079593   0.44307488]
 [9.639339   1.4087708 ]
 [9.110716   1.6770729 ]
 [9.093158   1.8146694 ]
 [9.052745   0.43679297]]


In [5]:
#output_file("iris2a.html")

targets = [str(d) for d in iris.target_names]

source = ColumnDataSource(
    dict(
        x=[e[0] for e in embedding],
        y=[e[1] for e in embedding],
        #g=[i in good10 for e,i in enumerate(embedding) # ?
        #b=[i in bad10  for i in range(embedding.shape[0])] # equiv for bad ?
        label=[targets[d] for d in iris.target],
    )
)

cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

p1 = figure(title="Test UMAP on Iris dataset")
p1.circle(
    x="x",
    y="y",
    source=source,
    color={"field": "label", "transform": cmap},
    legend_label="label",
)

# gray boxes around good/bad points and (fake) category
gb = np.vstack([embedding[good10,], embedding[bad10,]])
gbcat = np.hstack((np.repeat("good",nFeat), np.repeat("bad",nFeat)))
#print(embedding[good10,])
#print("gb   ",gb)
#print("gbcat",gbcat)
#print([e[0] for e in gb])
#print([e[1] for e in gb])
gbsource = ColumnDataSource( dict(
        x=[e[0] for e in gb],
        y=[e[1] for e in gb],
        label=gbcat,
    ))
p1.square(x="x", y="y", size=16, line_alpha=0.5, line_width=1, fill_alpha=0.3,
          source=gbsource, color={"field": "label", "transform": cmap},
          legend_label="dragged",
)

show(p1)

In [6]:
print("good10:\n", embedding[good10,])
print("bad10:\n",  embedding[bad10,])

good10:
 [[9.307444  1.8785704]
 [9.183707  1.8469243]
 [9.253241  1.9231378]
 [9.276938  1.9048102]]
bad10:
 [[-7.2268677  1.236402 ]
 [-7.2965145  1.6139722]
 [-7.0130916  1.2190703]
 [-7.293201   1.5624129]]


In [7]:
# simulate drag'n'drop of goods to left, bads to right

#
# method 0: naive, brute force
#
#embedding[good10,0] = -1.0
#embedding[bad10,0] = +1.0
# --- without clamping, we totally lose the "init" state

#
# method 1: "best" linear transform of x-coords of embedding
#
# y = [e 1] @ [m c] st. e'[good10,][0] ~ -1 and e'[bad10,][0] ~ 1
e = np.hstack([embedding[good10,0], embedding[bad10,0]])
# hoping for first half ~ -1, rest ~ +1
y = np.hstack((np.repeat(-1,nFeat), np.repeat(+1,nFeat)))
print(type(e), type(y), e.shape, y.shape)
assert( e.size == y.size )
A = np.vstack([e, np.ones(len(e))]).T  # add a one's column
print(type(A), A.shape, "\n")
#print(A)
#print(y)

#x, residuals, rank, s = np.linalg.lstsq(A, y, rcond=None)
#print("lstsq -> x=",x)
#m,c = x
m,c = np.linalg.lstsq(A, y, rcond=None)[0]
print("m,c",np.round(m,3),np.round(c,3))
print("fit",m*e+c)

# re-embed all data w/ "best" linear transform of 'x' values
# rescale 'y' too, (keep rel. distances, don't care about y shift)
embedding[:,0] = m * embedding[:,0] + c
embedding[:,1] = m * embedding[:,1]
embedding[:,1] -= np.average(embedding, axis=1) # 'y' centroid --> zeroprint("good10:\n", embedding[good10,])
print("bad10:\n",  embedding[bad10,])

<class 'numpy.ndarray'> <class 'numpy.ndarray'> (8,) (8,)
<class 'numpy.ndarray'> (8, 2) 

m,c -0.121 0.124
fit [-1.0062162  -0.99118567 -0.999632   -1.0025107   1.0022486   1.0107088
  0.9762807   1.0103062 ]
bad10:
 [[ 1.0022486  -0.5762187 ]
 [ 1.0107088  -0.60338104]
 [ 0.9762807  -0.5621821 ]
 [ 1.0103062  -0.60004824]]


In [8]:
targets = [str(d) for d in iris.target_names]
targets += ["good","bad"]
print([targets[d] for d in iris.target[0:5]])
source = ColumnDataSource(
    dict(
        x=[e[0] for e in embedding],
        y=[e[1] for e in embedding],
        label=[targets[d] for d in iris.target],
    )
)

cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

p2 = figure(title="Iris linear drag'n'drop mods")
p2.circle(
    x="x",
    y="y",
    source=source,
    color={"field": "label", "transform": cmap},
    #legend_label="species",
    legend_group="label"
)
# gray boxes around good/bad points and (fake) category
gb = np.vstack([embedding[good10,], embedding[bad10,]])
gbcat = np.hstack((np.repeat("good",nFeat), np.repeat("bad",nFeat)))
#print(embedding[good10,])
#print("gb   ",gb)
#print("gbcat",gbcat)
#print([e[0] for e in gb])
#print([e[1] for e in gb])
gbsource = ColumnDataSource( dict(
        x=[e[0] for e in gb],
        y=[e[1] for e in gb],
        label=gbcat,
    ))
#cmap = CategoricalColorMapper(factors=["good","bad"], palette=Category10[10])
p2.square(x="x", y="y", size=16, line_alpha=0.7, line_width=4, fill_alpha=0.0,
          source=gbsource, color={"field": "label", "transform": cmap},
          #legend_label="dragged",
          legend_group="label"
)

show(p2)

['setosa', 'setosa', 'setosa', 'setosa', 'setosa']


In [9]:
embedding[good10,0] = -1
embedding[bad10,0]  = +1
print("embedding.shape",embedding.shape)
print("good10:\n", embedding[good10,])
print("bad10:\n",  embedding[bad10,])# re-embed just with new init conditions
pin_mask = np.ones(embedding.shape, dtype=np.float32) # todo: allow float32
pin_mask[good10,0] = 0.0 # zero gradient, so zero movement of init embedding
pin_mask[bad10,0] = 0.0
print("pin_mask.shape",pin_mask.shape)
print("pin_mask good10:\n", pin_mask[good10,])
print("pin_mask bad10:\n",  pin_mask[bad10,])# re-embed just with new init conditions
#   NOTE: should have pin_mask in UMAP constructor !
embedder = umap.UMAP(
    n_neighbors=50, learning_rate=0.5, init=embedding, min_dist=0.001
)
embedding = embedder.fit_transform(iris.data, pin_mask=pin_mask)
print(embedding[0:10,])
print("good10:\n", embedding[good10,])
print("bad10:\n",  embedding[bad10,])
# Oh-oh.  umap is doing some internal rescaling -- let's undo that.
goodx = embedding[good10[0],0]
badx  = embedding[bad10[0],0]
print("umap --> good,bad=",goodx,badx)
x = np.array([goodx,badx])
A = np.array([[goodx,1.0],[badx,1.0]])
y = np.array([-1.0,1.0])
print("A\n",A,"\ny\n",y)
m, c = np.linalg.lstsq(A, y, rcond=None)[0]
print("m,c",np.round(m,3),np.round(c,3))
print("fit",m*x+c)
# scaling factor applies to BOTH x and y
embedding[:,0] = m*embedding[:,0] + c
embedding[:,1] = m*embedding[:,1]
embedding[:,1] -= np.average(embedding, axis=1) # 'y' centroid --> zero

embedding.shape (150, 2)
good10:
 [[-1.          0.38901073]
 [-1.          0.38341755]
 [-1.          0.38301176]
 [-1.          0.38556427]]
bad10:
 [[ 1.         -0.5762187 ]
 [ 1.         -0.60338104]
 [ 1.         -0.5621821 ]
 [ 1.         -0.60004824]]
pin_mask.shape (150, 2)
pin_mask good10:
 [[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]
pin_mask bad10:
 [[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]
X.shape (150, 4)
pin_mask.shape (150, 2)
[[-1.38334179e+00  1.20152388e+01]
 [-8.28161016e-02  1.17050772e+01]
 [-1.18898004e-01  1.23327494e+01]
 [ 7.08470959e-03  1.22110300e+01]
 [-1.40753102e+00  1.22448177e+01]
 [-2.03768373e+00  1.26921816e+01]
 [-1.63150296e-01  1.23846407e+01]
 [-9.16360319e-01  1.20397882e+01]
 [ 2.42479712e-01  1.21629543e+01]
 [-2.43698925e-01  1.17875061e+01]]
good10:
 [[ 0.24247971 12.371516  ]
 [ 0.24247971 12.323803  ]
 [ 0.24247971 12.268821  ]
 [ 0.24247971 12.162954  ]]
bad10:
 [[10.        -1.1500739]
 [10.        -1.3904124]
 [10.        -1.1325891]
 [10.        -

In [10]:
#output_file("iris2.html")

targets = [str(d) for d in iris.target_names]
print([targets[d] for d in iris.target[0:5]])
source = ColumnDataSource(
    dict(
        x=[e[0] for e in embedding],
        y=[e[1] for e in embedding],
        label=[targets[d] for d in iris.target],
    )
)

cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

p3 = figure(title="Iris UMAP post drag'n'drop")
p3.circle(
    x="x",
    y="y",
    source=source,
    color={"field": "label", "transform": cmap},
    legend_label="label",
)
# gray boxes around good/bad points and (fake) category
gb = np.vstack([embedding[good10,], embedding[bad10,]])
gbcat = np.hstack((np.repeat("good",nFeat), np.repeat("bad",nFeat)))
#print(embedding[good10,])
#print("gb   ",gb)
#print("gbcat",gbcat)
#print([e[0] for e in gb])
#print([e[1] for e in gb])
gbsource = ColumnDataSource( dict(
        x=[e[0] for e in gb],
        y=[e[1] for e in gb],
        label=gbcat,
    ))
p3.square(x="x", y="y", size=16, line_alpha=0.5, line_width=1, fill_alpha=0.3,
          source=gbsource, color={"field": "label", "transform": cmap},
          legend_label="dragged",
)

output_notebook()
show(p2)
output_file("iris2.html")
show(column(p1,p2))

['setosa', 'setosa', 'setosa', 'setosa', 'setosa']


In [11]:
x = np.array([0, 1, 2, 3])
y = np.array([-1, 0.2, 0.9, 2.1])
A = np.vstack([x, np.ones(len(x))]).T
print("A\n",A,"\ny\n",y)
m, c = np.linalg.lstsq(A, y, rcond=None)[0]
print("m,c",np.round(m,3),np.round(c,3))
print("fit",m*x+c)

A
 [[0. 1.]
 [1. 1.]
 [2. 1.]
 [3. 1.]] 
y
 [-1.   0.2  0.9  2.1]
m,c 1.0 -0.95
fit [-0.95  0.05  1.05  2.05]
