In [None]:
import umap
import numpy as np
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
%matplotlib inline



In [None]:
# Load the labelled test set.  The first line of the file is skipped as a
# header; every later row is parsed as an ID followed by a delimited list of
# comma-separated numbers.
IDs = []
X = []
with open("./iain_label_test.csv", "r") as f:
    f.readline()  # discard the header row
    for s in f:
        # Strip the newline up front so the last row parses the same way
        # whether or not the file ends with a trailing newline.
        s = s.rstrip("\n")
        if not s:
            continue  # tolerate blank lines instead of producing a ragged row
        ID = s.split(",")[0]
        # Drop the ID, the comma and two opening delimiter characters at the
        # front, and two closing delimiter characters at the back.
        # NOTE(review): presumably rows look like  ID,"[v0,v1,...]"  -- confirm.
        x = s[len(ID) + 3:-2].split(",")
        IDs.append(ID)
        # Store real floats rather than strings so downstream maths is explicit.
        X.append(np.array(x, dtype=np.float64))
X = np.vstack(X)
print(X.shape)
        
        
    

In [None]:
# Scale the data before embedding.
# NOTE(review): axis=0 gives every feature *column* unit L2 norm; axis=1 would
# give unit-length item (row) vectors -- confirm which one is intended.
Xn = normalize(X, norm='l2', axis=0)
nclass = 2

# Labelling scenario.
#   1: easy case  -- UMAP is somewhat usable here
#   2: few labels -- UMAP does not work so well here
#which_labels = 1
which_labels = 2

if which_labels == 1:  # easy case
    # Roughly 1/3 class 0, 1/3 class 1 and 1/3 unlabelled (-1).  The draw is
    # over {-1, ..., nclass-1} because -1 is the "unlabelled" marker and every
    # non-negative label must be a valid class index below nclass.  A seeded
    # generator keeps the labelling reproducible across runs.
    target = np.random.RandomState(42).randint(-1, nclass, size=(X.shape[0], 1))
elif which_labels == 2:  # very few labels, different class cardinalities
    target = np.full((X.shape[0], 1), -1, dtype=np.int8)
    target[-10:-3] = 0  # seven items of class 0 ...
    target[-3:] = 1     # ... and the last three items of class 1

print("target shape ",target.shape)
print("target[-20:]", target[-20:])

# try defaults - like tutorial
# Euclidean embedding of the scaled data (labels are not used here)
Euc_map = umap.UMAP(random_state=42,
                    metric='euclidean',
                    #target_weight = 0.9,
                    n_neighbors=9,
                    #target_n_neighbors=3,
                    #verbose=True
   ).fit( Xn ) #, y=target )
# Categorical embedding of the label column alone
Cat_map = umap.UMAP(random_state=42,
                    metric='categorical',
                    n_neighbors=9 # horrible (and slow) int(X.shape[0]/2),
             ).fit( target )

#      print("UMAP ends SemiSupervised")


In [None]:
# Dump the coordinates of both fitted embeddings, Euclidean first.
for fitted in (Euc_map, Cat_map):
    print(fitted.embedding_)

In [None]:
# Display the label column (-1 marks an unlabelled item).
target

In [None]:
# Colour value per item, derived from its label: the unlabelled marker (-1)
# gets the largest value and label nclass-1 gets -0.5.
tcolor = 12*((nclass-(target+1))/(nclass+1)) - 0.5

print(Euc_map.embedding_[-3:])
fig, ax = plt.subplots(1, figsize=(14, 10))
points = ax.scatter(Euc_map.embedding_[:, 0], Euc_map.embedding_[:, 1],
                    s=64.0, c=tcolor, cmap='viridis', alpha=1.0)
ax.set(xticks=[], yticks=[])  # embedding axes carry no meaningful units
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
#cbar.set_ticklabels(classes)
ax.set_title('UMAP Euclidean-only embedding');


Now with so few categorical labels, *everything* will essentially have distance 1.0
to everything else.  Also, if class cardinalities differ widely, there might be
trouble in finding a decent knn setting.

The unfortunate consequence is that, although same-label points should cluster, most of the *Cat_map* connectivity is just "random noise".  I worry that this noise will *infect* the combined
graphs, later on!

In [None]:
# Scatter the label-only (categorical) embedding, coloured by label.
print(Cat_map.embedding_[-3:])
fig, ax = plt.subplots(1, figsize=(14, 10))
points = ax.scatter(Cat_map.embedding_[:, 0], Cat_map.embedding_[:, 1],
                    s=60.0, c=tcolor, cmap='viridis', alpha=1.0)
ax.set(xticks=[], yticks=[])
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
#cbar.set_ticklabels(classes)
ax.set_title('UMAP Categorical-only embedding');


### Combining the two umap embeddings
This can be done in a variety of ways.

The default method is *intersection* of the two connectivity graphs.
This means edges that are present in BOTH graphs get used.
So if the Categorical labels are not within 9 knn of the Euclidean embedding,
they essentially do nothing.

You can see [umap examples of composing models](https://umap-learn.readthedocs.io/en/latest/composing_models.html) where the 'diamonds' dataset
is most similar -- there are continuous variables like price, that can be combined
with several ordinal classes for cut, grade, ...

Let's see this in action for this toy problem.

In [None]:
# Compose the two fitted models; the operators are
#   *  intersection,   +  union,   -  difference
intersection_mapper = Euc_map * Cat_map
#
print(intersection_mapper.embedding_[-3:])
fig, ax = plt.subplots(1, figsize=(14, 10))
points = ax.scatter(intersection_mapper.embedding_[:, 0],
                    intersection_mapper.embedding_[:, 1],
                    s=60.0, c=tcolor, cmap='viridis', alpha=1.0)
ax.set(xticks=[], yticks=[])
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
#cbar.set_ticklabels(classes)
ax.set_title('UMAP Intersection (Euc_map * Cat_map) embedding');
#

OK, that wasn't as bad as I expected.  Maybe adjusting Euc knn can help?

So how about union?  Roughly, edges in either graph get used in the result.

In [None]:
# Union of the two graphs: roughly, an edge present in either graph is kept.
union_mapper = Euc_map + Cat_map  # *, +, - for intersection, union, difference
#
print(union_mapper.embedding_[-3:])
fig, ax = plt.subplots(1, figsize=(14, 10))
# Scatter the union embedding itself, so the figure matches its title and the
# coordinates printed above.
plt.scatter(*union_mapper.embedding_.T, s=60.0, c=tcolor, cmap='viridis', alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
cbar = plt.colorbar(boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
#cbar.set_ticklabels(classes)
plt.title('UMAP Union (Euc_map + Cat_map) embedding');
#
# Difference of the two graphs -- disabled; flip the guard to try it.
if False:
    diff_mapper = Euc_map - Cat_map  # *, +, - for intersection, union, difference
    #
    print(diff_mapper.embedding_[-3:])
    fig, ax = plt.subplots(1, figsize=(14, 10))
    # Scatter the difference embedding itself, matching the title below.
    plt.scatter(*diff_mapper.embedding_.T, s=10.0, c=target*100.0, cmap='prism', alpha=1.0)
    plt.setp(ax, xticks=[], yticks=[])
    cbar = plt.colorbar(boundaries=np.arange(11)-0.5)
    cbar.set_ticks(np.arange(10))
    #cbar.set_ticklabels(classes)
    plt.title('UMAP Difference (Euc_map - Cat_map) embedding');
#

So not much difference.  Intersection does cluster, but really the weighting
between Euclidean and Categorical is not so great.

Looking into the '\*' intersection operator  (method *\_\_mul\_\_*), it basically
runs *general_simplicial_set_intersection* (with default weight 0.5) and then
fixes up the resulting graph.   It's a bit complicated to do by hand.  So I
added a weight parameter to a new *mul\_helper* version of *\_\_mul\_\_*, that
takes an optional *weight* parameter to pass into the general intersection.

In [None]:
# Weighted union.  This relies on a local modification of the graph combiners
# so that they also accept a weight.
if which_labels == 1:
    union_weight = 1
elif which_labels == 2:
    union_weight = 0.999
uw_map = Euc_map.__add__(Cat_map, weight=union_weight)
# There is no noticeable effect from changing the weight.
# With so many unlabelled data, the categorical graph pushes almost everything unlabelled to "far".
#
print(uw_map.embedding_[-3:])
fig, ax = plt.subplots(1, figsize=(14, 10))
points = ax.scatter(uw_map.embedding_[:, 0], uw_map.embedding_[:, 1],
                    s=60.0, c=tcolor, cmap='viridis', alpha=1.0)
ax.set(xticks=[], yticks=[])
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
#cbar.set_ticklabels(classes)
ax.set_title('UMAP a weighted union embedding');
#

In [None]:
# Weighted intersection.  This relies on a local modification of the graph
# combiners so that they also accept a weight.
if which_labels == 1:
    intersection_weight = 0.42
elif which_labels == 2:
    intersection_weight = 0.43
iw_map = Euc_map.__mul__(Cat_map, weight=intersection_weight)
# There is no noticeable effect from changing the weight.
# With so many unlabelled data, the categorical graph pushes almost everything unlabelled to "far".
#
print(iw_map.embedding_[-3:])
fig, ax = plt.subplots(1, figsize=(14, 10))
points = ax.scatter(iw_map.embedding_[:, 0], iw_map.embedding_[:, 1],
                    s=60.0, c=tcolor, cmap='viridis', alpha=1.0)
ax.set(xticks=[], yticks=[])
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
#cbar.set_ticklabels(classes)
ax.set_title('UMAP Weighted intersection embedding');
#

In [None]:
# UMAP's built-in semi-supervised mode: the labels are passed to fit() as y
# and compared with a categorical target metric.
if which_labels == 1:
    tw = 1.1
elif which_labels == 2:
    # there is NO GOOD VALUE for the umap built-in to get clusters
    tw = 0.99

u_map = umap.UMAP(
    random_state=42,
    metric='euclidean',
    n_neighbors=9,
    target_metric='categorical',
    target_weight=tw,
    target_n_neighbors=9,
    #verbose=True
).fit(Xn, y=target)
print(u_map.embedding_[-3:])
fig, ax = plt.subplots(1, figsize=(14, 10))
points = ax.scatter(u_map.embedding_[:, 0], u_map.embedding_[:, 1],
                    s=60.0, c=tcolor, cmap='viridis', alpha=1.0)
ax.set(xticks=[], yticks=[])
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
#cbar.set_ticklabels(classes)
ax.set_title('UMAP semi-supervised embedding');
#

So the UMAP result looks *the same*, and "washes out" too much of the Euclidean graph. There is almost
no effect from playing with n_neighbors, target_n_neighbors, or target_weight.

### OK, so now for something different.
A custom metric can be constructed as follows.

1. append a one-hot class vector to X dimensions
    - if no label, append a zero vector
    - if label, set value in that dim to LBL (default 1.0)
2. now euclidean metric can weight euclidean and categorical data *before* UMAP
   constructs its knn graph.
    - a *high LBL* value means the label data is effectively *more important*.


In [None]:
# Build the augmented matrix Xx: the columns of Xn followed by nclass extra
# columns, all zero for now (one slot per class).
n_items, n_features = Xn.shape
print("Xn.shape", Xn.shape)
print("nclasses", nclass)
xxshape = (n_items, n_features + nclass)
print("xxshape", xxshape)
Xx = np.hstack([
    Xn.astype(np.float64),
    np.zeros((n_items, nclass), dtype=np.float64),
])
print("Xx.shape", Xx.shape)
print("target shape", target.shape)

def xxlabel(lbl_value=None):
    """Write the one-hot label columns of ``Xx`` from ``target``.

    The columns of ``Xx`` after the first ``Xn.shape[1]`` are treated as one
    slot per class.  For every row whose label in ``target[:, 0]`` is
    non-negative, the slot of that class is set to the marker value; all other
    label slots are set to zero.  ``Xx`` is modified in place.

    Parameters
    ----------
    lbl_value : float, optional
        Marker value to store.  When omitted, the module-level ``LBL`` is
        used, so existing ``LBL = ...; xxlabel()`` call sites keep working.

    Returns
    -------
    None
    """
    if lbl_value is None:
        lbl_value = LBL
    n_features = Xn.shape[1]
    # Widen the labels before doing index arithmetic: target may be int8, and
    # adding the column offset to an int8 label can overflow.
    labels = target[:, 0].astype(np.intp)
    labelled = np.flatnonzero(labels >= 0)  # i.e. one of {0, ..., nclass-1}
    # Clear every label slot first so a repeated call never leaves stale
    # markers behind (e.g. after target has been changed).
    Xx[:, n_features:] = 0.0
    Xx[labelled, n_features + labels[labelled]] = lbl_value

# Set the marker value that xxlabel() reads, write it into the label columns
# of Xx, and show the last four rows as a sanity check.
LBL = 1.11111111111
xxlabel()
print("Xx[-4:]",Xx[-4:])

*Xn* has normalized item vectors.

First we have extended dimensionality adding *nclass* zeros.
Then if we had label C, we set the C'th added zero to a big value *LBL*.

A big *LBL* puts same-class items *very far* from all other points, and
close (original euclidean distance) to all same-class points.

Distances between unlabeled items also retain their original euclidean distance.

Distances from a labeled to other-label will tend to be higher than distances to unlabelled.

(I think)  Let's see how this works out:

In [None]:
# Very small one-hot marker -- not enough to force clusters.
LBL = 0.1
xxlabel()
# Euclidean embedding of the augmented matrix Xx
xxmap = umap.UMAP(
    random_state=42,
    metric='euclidean',
    n_neighbors=9,
).fit(Xx)
fig, ax = plt.subplots(1, figsize=(14, 10))
points = ax.scatter(xxmap.embedding_[:, 0], xxmap.embedding_[:, 1],
                    s=60.0, c=tcolor, cmap='viridis', alpha=1.0)
ax.set(xticks=[], yticks=[])
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
#cbar.set_ticklabels(classes)
ax.set_title(f'UMAP Euc + {LBL=} embedding');
#

In [None]:
# Since Xn was normalized, even a marker of 1.0 is enough to cluster well.
LBL = 1.0
xxlabel()
# Euclidean embedding of the augmented matrix Xx
xxmap = umap.UMAP(
    random_state=42,
    metric='euclidean',
    n_neighbors=9,
).fit(Xx)
fig, ax = plt.subplots(1, figsize=(14, 10))
points = ax.scatter(xxmap.embedding_[:, 0], xxmap.embedding_[:, 1],
                    s=60.0, c=tcolor, cmap='viridis', alpha=1.0)
ax.set(xticks=[], yticks=[])
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
#cbar.set_ticklabels(classes)
ax.set_title(f'UMAP Euc + {LBL=} embedding');
#