### pin_mask demo
- We get a $1^{st}$ embedding (using random init)
- linearly transform the embedding so that the x values of the "good" and "bad" samples end up at -1 and +1 respectively
- then re-embed "pinning" those x values
    - (could random-init by-hand, then fix x values for single pin_mask umap fit)
- umap applies no gradient to the pinned x values (but does apply a rescaling to the whole embedding)
- we determine the new linear rescaling
- and remap the embedding back to good|bad x-values -1|+1

### Next (other notebooks)
- Kinda' works, but perhaps want additional constraints, like
  clipping ALL x values to range -1,+1 ?
   - Alt. select the lowest|highest feature *of each species* (target)
     and add one each to good|bad sets  (should look way different)
- perhaps better to ROTATE (see Kabsch algorithm)
  such that good|bad points are aligned *towards* (-1,0)|(+1,0)
   - then rescale and shift to put their x-centroids exactly at (-1,0)|(+1,0)

In [1]:
import numpy as np
import umap
from bokeh.models import CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Category10
from bokeh.plotting import figure, output_file, show
from sklearn.datasets import load_iris

In [2]:
# Load scikit-learn's bundled Iris dataset.  Later cells read iris.data,
# iris.target, iris.target_names and iris.feature_names from this object.
iris = load_iris()

In [3]:
# Quick look at the raw dataset: container type, first rows, labels, names.
print(type(iris.data))
print(iris.data[0:10,])
print(iris.target[0:10])
print(iris.target_names)
print(iris.feature_names)

# We are interested in samples where feature 0 (sepal length) is extreme.
# The nFeat smallest values become the "good" set and the nFeat largest the
# "bad" set; later cells pin these to x = -1 and x = +1 respectively.
# NOTE: the names good10/bad10 are historical -- each holds nFeat indices.
feature_of_interest = 0
nFeat = 4
print("feature_of_interest",feature_of_interest)
data_of_interest = iris.data[:,feature_of_interest]

# Ascending argsort: the head holds the smallest values, the tail the largest.
# Ties are broken arbitrarily by the default sort.
goods = np.argsort(data_of_interest)
good10 = goods[:nFeat]
print("good10",good10,
      "\ndata of goods:\n",iris.data[good10,])
bad10 = goods[-nFeat:]
print("bad10",bad10,
      "\ndata of bads:\n",iris.data[bad10])

<class 'numpy.ndarray'>
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]
[0 0 0 0 0 0 0 0 0 0]
['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
feature_of_interest 0
good10 [13 42 38  8] 
data of goods:
 [[4.3 3.  1.1 0.1]
 [4.4 3.2 1.3 0.2]
 [4.4 3.  1.3 0.2]
 [4.4 2.9 1.4 0.2]]
bad10 [118 117 135 131] 
data of bads:
 [[7.7 2.6 6.9 2.3]
 [7.7 3.8 6.7 2.2]
 [7.7 3.  6.1 2.3]
 [7.9 3.8 6.4 2. ]]


In [4]:
# First embedding: random initialisation, nothing pinned yet.
# random_state is fixed so that Restart & Run All reproduces the same layout
# (and therefore the same linear fit in the cells that follow).
embedding = umap.UMAP(
    n_neighbors=50,
    learning_rate=0.5,
    init="random",
    min_dist=0.001,
    random_state=42,
).fit_transform(iris.data)
print(embedding[0:5,])

[[13.903257   6.715585 ]
 [12.864181   5.927591 ]
 [12.816177   6.367345 ]
 [12.600025   6.5241337]
 [13.792113   6.9806633]]


In [5]:
# Where did the good / bad samples land in the first embedding?
for heading, rows in (("good10:\n", good10), ("bad10:\n", bad10)):
    print(heading, embedding[rows])

good10:
 [[12.468941   6.4441166]
 [12.493552   6.447857 ]
 [12.429651   6.2999654]
 [12.539996   6.233395 ]]
bad10:
 [[0.4277385  0.5804448 ]
 [0.2932484  0.8172438 ]
 [0.66964686 0.7818548 ]
 [0.35314932 0.8300789 ]]


In [6]:
# Simulate a drag'n'drop: goods should move to the left (x = -1),
# bads to the right (x = +1).
#
# Method 0 (rejected) -- naive, brute force overwrite of the x-coords:
#     embedding[good10,0] = -1.0
#     embedding[bad10,0]  = +1.0
# Without clamping, we totally lose the "init" state.
#
# Method 1 (used) -- "best" linear transform of the embedding's x-coords:
# least-squares solve  [x 1] @ [m c]^T ~ target  so that the good samples
# map close to -1 and the bad samples close to +1.
pinned_x = np.concatenate([embedding[good10, 0], embedding[bad10, 0]])
# desired values: first half ~ -1, second half ~ +1
target_x = np.concatenate((np.full(nFeat, -1), np.full(nFeat, +1)))
print(type(pinned_x), type(target_x), pinned_x.shape, target_x.shape)
assert pinned_x.size == target_x.size
# design matrix: one row [x, 1] per pinned sample
design = np.vstack([pinned_x, np.ones(pinned_x.size)]).T
print(type(design), design.shape, "\n")

solution = np.linalg.lstsq(design, target_x, rcond=None)
m, c = solution[0]
print("m,c", np.round(m, 3), np.round(c, 3))
print("fit", m * pinned_x + c)

# Apply the fitted map to every sample's x.  y gets the same scale factor
# (keeps relative distances) but no shift -- we don't care where y sits.
embedding[:, 0] = m * embedding[:, 0] + c
embedding[:, 1] = m * embedding[:, 1]
print("good10:\n", embedding[good10])
print("bad10:\n", embedding[bad10])

<class 'numpy.ndarray'> <class 'numpy.ndarray'> (8,) (8,)
<class 'numpy.ndarray'> (8, 2) 

m,c -0.166 1.072
fit [-0.99735665 -1.0014412  -0.9908359  -1.0091493   1.001058    1.0233786
  0.9609097   1.0134372 ]
good10:
 [[-0.99735665 -1.0694959 ]
 [-1.0014412  -1.0701166 ]
 [-0.9908359  -1.0455719 ]
 [-1.0091493  -1.0345236 ]]
bad10:
 [[ 1.001058   -0.09633335]
 [ 1.0233786  -0.13563363]
 [ 0.9609097  -0.12976031]
 [ 1.0134372  -0.1377638 ]]


In [7]:
# --- 1. Initial conditions: snap the pinned samples exactly onto x = -1 / +1.
# Work on a copy so that a failed fit does not leave the previous cell's
# embedding half-modified.
init_embedding = embedding.copy()
init_embedding[good10,0] = -1
init_embedding[bad10,0]  = +1
print("embedding.shape",init_embedding.shape)
print("good10:\n", init_embedding[good10,])
print("bad10:\n",  init_embedding[bad10,])

# --- 2. pin_mask: same shape as the embedding, 1.0 everywhere except the
# x-coordinate of the pinned samples.
# (original TODO kept: "allow float32" -- the mask is already float32.)
pin_mask = np.ones(init_embedding.shape, dtype=np.float32)
pin_mask[good10,0] = 0.0 # zero gradient, so zero movement of init embedding
pin_mask[bad10,0] = 0.0
print("pin_mask.shape",pin_mask.shape)
print("pin_mask good10:\n", pin_mask[good10,])
print("pin_mask bad10:\n",  pin_mask[bad10,])

# --- 3. Re-embed just with the new init conditions.
#   NOTE: should have pin_mask in UMAP constructor !
# random_state is fixed so the re-embedding is reproducible on re-run.
embedder = umap.UMAP(
    n_neighbors=50,
    learning_rate=0.5,
    init=init_embedding,
    min_dist=0.001,
    random_state=42,
)
embedding = embedder.fit_transform(iris.data, pin_mask=pin_mask)
print(embedding[0:10,])
print("good10:\n", embedding[good10,])
print("bad10:\n",  embedding[bad10,])

# --- 4. Oh-oh.  umap is doing some internal rescaling -- let's undo that.
# The first sample of each pinned group stands in for the whole group, so
# check that each group really stayed on one common x value.
goodx = embedding[good10[0],0]
badx  = embedding[bad10[0],0]
assert np.allclose(embedding[good10,0], goodx), "good pins no longer share one x"
assert np.allclose(embedding[bad10,0], badx), "bad pins no longer share one x"
print("umap --> good,bad=",goodx,badx)
x = np.array([goodx,badx])
A = np.array([[goodx,1.0],[badx,1.0]])
y = np.array([-1.0,1.0])
print("A\n",A,"\ny\n",y)
# two points, two unknowns: the map x' = m*x + c sending goodx -> -1, badx -> +1
m, c = np.linalg.lstsq(A, y, rcond=None)[0]
print("m,c",np.round(m,3),np.round(c,3))
print("fit",m*x+c)
# scaling factor applies to BOTH x and y
embedding[:,0] = m*embedding[:,0] + c
embedding[:,1] = m*embedding[:,1]

embedding.shape (150, 2)
good10:
 [[-1.        -1.0694959]
 [-1.        -1.0701166]
 [-1.        -1.0455719]
 [-1.        -1.0345236]]
bad10:
 [[ 1.         -0.09633335]
 [ 1.         -0.13563363]
 [ 1.         -0.12976031]
 [ 1.         -0.1377638 ]]
pin_mask.shape (150, 2)
pin_mask good10:
 [[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]
pin_mask bad10:
 [[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]
X.shape (150, 4)
pin_mask.shape (150, 2)
[[ 0.04604377 -6.580897  ]
 [ 1.5367963  -7.3100147 ]
 [ 1.1492391  -6.6632104 ]
 [ 1.4486405  -6.932836  ]
 [ 0.1381167  -6.481616  ]
 [-0.58508676 -7.459905  ]
 [ 1.0509185  -6.6677833 ]
 [ 0.31262717 -6.913281  ]
 [ 1.3265676  -6.698701  ]
 [ 1.2562423  -7.2469254 ]]
good10:
 [[ 1.3265676 -6.6372375]
 [ 1.3265676 -6.705507 ]
 [ 1.3265676 -6.738461 ]
 [ 1.3265676 -6.698701 ]]
bad10:
 [[8.238885  6.6861544]
 [8.238885  6.484263 ]
 [8.238885  6.662115 ]
 [8.238885  6.5353355]]
umap --> good,bad= 1.3265676 8.238885
A
 [[1.32656765 1.        ]
 [8.23888493 1.        ]] 

In [8]:
# Write the interactive scatter plot of the final embedding to iris2.html.
output_file("iris2.html")

# Species names as plain Python strings (used as categorical factors).
targets = [str(d) for d in iris.target_names]

source = ColumnDataSource(
    dict(
        x=embedding[:, 0],
        y=embedding[:, 1],
        label=[targets[d] for d in iris.target],
    )
)

cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

p = figure(title="Test UMAP on Iris dataset")
p.circle(
    x="x",
    y="y",
    source=source,
    color={"field": "label", "transform": cmap},
    # legend_field groups the legend by the values of the "label" column;
    # legend_label would show the literal text "label" as a single entry.
    legend_field="label",
)

show(p)

In [9]:
# Sanity check of np.linalg.lstsq on a tiny line-fit problem:
# four points, fit y ~ m*x + c using an [x, 1] design matrix.
x = np.arange(4)
y = np.array([-1, 0.2, 0.9, 2.1])
A = np.vstack([x, np.ones(x.size)]).T
print("A\n",A,"\ny\n",y)
coeffs, *_ = np.linalg.lstsq(A, y, rcond=None)
m, c = coeffs
print("m,c",np.round(m,3),np.round(c,3))
print("fit",m*x+c)

A
 [[0. 1.]
 [1. 1.]
 [2. 1.]
 [3. 1.]] 
y
 [-1.   0.2  0.9  2.1]
m,c 1.0 -0.95
fit [-0.95  0.05  1.05  2.05]
