In [None]:
import pandas as pd
import numpy as np
from scipy.cluster.vq import whiten
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_distances
import seaborn as sns
sns.set(style="white")
import cmcrameri.cm as cmc
import pacmap
import trimap
import umap
import random

### BOKEH
from bokeh.models import HoverTool, ColumnDataSource, Plot, Scatter, BoxZoomTool, ResetTool
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.resources import INLINE
output_notebook(INLINE)

#### Description
This notebook is used to visualise the embedding space of concepts.

In [None]:
# Version tag of the run; it is formatted into the figure output folder below.
v = 4.02
# Folder that the plt.savefig calls in this notebook write to (here "../plots/4.02/").
# It is computed once: re-binding `v` afterwards does not update `save_path`.
save_path = r"../plots/%s/" %v
def rgb(hex):
    """Convert an ``RRGGBB`` hex colour into an opaque RGBA tuple.

    Parameters
    ----------
    hex : str
        Hexadecimal colour such as ``"#cc79a7"``. The leading ``#`` is
        optional; only the first six digits are read.

    Returns
    -------
    tuple
        ``(r, g, b, 1)`` with each colour channel scaled to [0, 1].

    Raises
    ------
    ValueError
        If fewer than six digits are supplied or a digit pair is not
        valid hexadecimal.
    """
    digits = hex[1:] if hex.startswith("#") else hex
    if len(digits) < 6:
        raise ValueError("expected a colour of the form '#RRGGBB', got %r" % (hex,))
    # An 8-bit channel tops out at 255, so dividing by 255 (not 256) maps
    # "ff" to exactly 1.0.
    channels = [int(digits[start:start + 2], 16) / 255 for start in (0, 2, 4)]
    return tuple(channels) + (1,)
def return_distances(token: str, x):
    """Dot product of the row belonging to `token` with every row of `x`.

    The row index is the position of `token` in the notebook-level
    ``vocab["TOKEN"]`` column; `.item()` raises unless exactly one entry
    matches. Returns a 1-D array with one value per row of `x`.
    """
    matches = np.flatnonzero(vocab["TOKEN"].values == token)
    anchor = x[matches.item()]
    return np.einsum("j,ij -> i", anchor, x)

def cosine_distances(token: str, x):
    """Cosine distance from the row belonging to `token` to every row of `x`.

    The row index is the position of `token` in the notebook-level
    ``vocab["TOKEN"]`` column; `.item()` raises unless exactly one entry
    matches. Returns a 1-D array with one distance per row of `x`.

    NOTE: this definition shadows the `cosine_distances` imported from
    `sklearn.metrics.pairwise` at the top of the notebook.
    """
    anchor_row = np.argwhere(vocab["TOKEN"].values == token).item()
    anchor = x[anchor_row]
    # scipy's `cosine` is evaluated one pair at a time, in row order.
    return np.array([cosine(anchor, x[row]) for row in range(x.shape[0])])


def plot_region(r):
    """Outline a rectangle on the current axes with dotted black lines.

    `r[0]` holds the two y-coordinates and `r[1]` the two x-coordinates of
    the rectangle's edges.
    """
    y_first, y_second = r[0][0], r[0][1]
    x_first, x_second = r[1][0], r[1][1]
    outline = dict(linewidth=1.5, linestyle="dotted", color="black")
    # Two horizontal edges first, then the two vertical edges.
    for y_edge in (y_first, y_second):
        plt.hlines(y=y_edge, xmin=x_first, xmax=x_second, **outline)
    for x_edge in (x_first, x_second):
        plt.vlines(x=x_edge, ymin=y_first, ymax=y_second, **outline)
    
def plot_square(s, width):
    """Outline a square on the current axes with dotted black lines.

    `s` is the (x, y) position of the square's top-left corner; the square
    extends `width` to the right and `width` downwards from it.
    """
    left, top = s[0], s[1]
    right, bottom = left + width, top - width
    outline = dict(linewidth=2, linestyle="dotted", color="black")
    # Top and bottom edges first, then the left and right edges.
    for y_edge in (top, bottom):
        plt.hlines(y=y_edge, xmin=left, xmax=right, **outline)
    for x_edge in (left, right):
        plt.vlines(x=x_edge, ymin=bottom, ymax=top, **outline)

In [None]:
# Manual switch between the relative (Linux) layout and the absolute Windows
# network-drive locations of the input files.
on_linux = False
vocab_path, data_path = (
    (
        r"../vocab/global_set/result.tsv",
        r"../token_embeddings/tensors.tsv",
    )
    if on_linux
    else (
        r"K:\22SSI\Germans\processed\vocab\global_set\result.tsv",
        r"O:\projekter\PY000017_D\logs\v15\pre_training\version_1.33\00029\token_embeddings\tensors.tsv",
    )
)

# Vocabulary table, indexed by its "ID" column.
vocab = pd.read_csv(vocab_path, sep="\t").set_index("ID")
# Embedding table read without a header row.
data = pd.read_csv(data_path, sep="\t", header=None)
data.head()

In [None]:
# Encode the vocabulary's CATEGORY column as integer labels. The printed
# index/name pairs are what the colour assignment's numeric labels refer to.
le = LabelEncoder()
labels = le.fit_transform(vocab["CATEGORY"])
print(le.classes_)
for position, category in enumerate(le.classes_):
    print(position, category)

In [None]:
# One colour per vocabulary entry, looked up by its encoded category label.
# The integer keys are positions in `le.classes_`; the category names in the
# comments are the author's annotations and depend on that ordering.
cmap = cmc.batlowS
category_colors = {
    2: "#009E73",   # C_ADIAG       -> bluish green
    3: "#009E73",   # C_INDM        -> bluish green
    4: "#009E73",   # C_PATTYPE     -> bluish green
    11: cmap(0),    # SALARY        -> first colour of batlowS (RGBA tuple)
    7: "#cc79a7",   # MUNICIPALITY  -> reddish purple
    6: "#0072b2",   # MONTH         -> blue
    15: "#0072b2",  # YEAR          -> blue
    1: "#e69f00",   # BACKGROUND, SOC -> orange
    8: "#e69f00",
    9: "#e69f00",
    12: "#999999",  # WORK_INDUSTRY -> grey
    13: "#d55e00",  # WORK_POSITION -> vermilion
}
# Every label without an entry above is drawn in black.
c = [category_colors.get(label, "#000000") for label in labels]


In [None]:
# Centre the embeddings on the column-wise mean of all rows except
# 0 and 4-8, then scale every column to unit variance.
# (Presumably the excluded rows are special / non-concept tokens - TODO confirm.)
x_raw = data.values
# axis=0 removes whole rows. Without `axis`, np.delete flattens the array and
# drops six single values, so `mean(0)` collapses to one scalar instead of one
# mean per embedding dimension.
x_ = np.delete(x_raw, [0, 4, 5, 6, 7, 8], axis=0)
# Subtract out-of-place: `data.values` may share memory with `data`, and an
# in-place `-=` would shift the data again each time this cell is re-run.
x = x_raw - x_.mean(0)
# NOTE(review): per-column centring changes the projections computed further
# down, so hand-tuned plot coordinates (highlighted squares) may need re-tuning.
xh = whiten(x)

### Visualisation with different Projection Methods

In [None]:
# PaCMAP projection of the whitened embeddings to 2-D (angular distance).
projector = pacmap.PaCMAP(
    n_components=2,
    n_neighbors=None,
    random_state=0,
    MN_ratio=1,
    FP_ratio=10,
    distance="angular",
    lr=0.5,
)
xp = projector.fit_transform(xh)

# Scatter of the projected concepts, coloured by category, saved as SVG.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(xp[:, 0], xp[:, 1], c=c, s=50, edgecolors="white", linewidths=0.5, alpha=0.8)
ax.autoscale()
fig.savefig(save_path + "life_space_pacmap.svg", format="svg")
plt.show()

In [None]:
### DENSMAP
# UMAP with the densMAP option switched on, projecting to 2-D.
param = {
    "n_components": 2,
    "densmap": True,
    "dens_lambda": 0.2,
    "random_state": 0,
}
prj = umap.UMAP(**param)
xp = prj.fit_transform(xh)

# Scatter of the projected concepts, coloured by category, saved as SVG.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(xp[:, 0], xp[:, 1], c=c, s=50, edgecolors="white", linewidths=0.5, alpha=0.8)
ax.autoscale()
fig.savefig(save_path + "life_space_densmap.svg", format="svg")
plt.show()

In [None]:
### UMAP
# Plain UMAP projection to 2-D (n_neighbors is left at the library default).
param = {
    "n_components": 2,
    #"n_neighbors": 50,
    "random_state": 0,
}
prj = umap.UMAP(**param)
xp = prj.fit_transform(xh)

# Scatter of the projected concepts, coloured by category, saved as SVG.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(xp[:, 0], xp[:, 1], c=c, s=50, edgecolors="white", linewidths=0.5, alpha=0.8)
#fig.tight_layout()
ax.autoscale()
fig.savefig(save_path + "life_space_umap.svg", format="svg")
plt.show()

In [None]:
#prj = TSNE(n_components=2, metric="cosine", perplexity=30, early_exaggeration=25, n_jobs=5)
#xp = prj.fit_transform(x)
#fig = plt.figure(figsize=(10,10))
#plt.scatter(xp[:,0], xp[:,1], c=c,  s=50, edgecolors="white", linewidths=0.5, alpha=0.8)
#plt.tight_layout()
#plt.savefig(save_path + "life_space_tsne.svg", format="svg")
#plt.show()

## Bokeh

In [None]:
# Interactive view of the PaCMAP projection. The projection is fitted again
# in this cell, so `xp` holds PaCMAP coordinates from here on.
projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, random_state=0, MN_ratio=1, FP_ratio=10, distance="angular", lr=0.5)
xp = projector.fit_transform(xh)

# `c` mixes hex strings with a matplotlib RGBA tuple of 0-1 floats (`cmap(0)`).
# NOTE(review): Bokeh is expected to read numeric colour tuples on a 0-255
# scale, so every entry is normalised to a "#rrggbb" string for the data source.
bokeh_colors = [
    col if isinstance(col, str)
    else "#%02x%02x%02x" % tuple(int(round(255 * channel)) for channel in col[:3])
    for col in c
]

source = ColumnDataSource(dict(x=xp[:,0], y=xp[:,1], color=bokeh_colors, concept=vocab["TOKEN"].values))
# Hover box: token name plus its projected coordinates.
tooltips = [("Token", "@concept"),
            ("X", "@x"),
            ("Y", "@y")]
# Title spelling fixed ("Embediing" -> "Embedding").
plot = Plot(title = "Life Event Embedding Space",
            width=600, toolbar_location="below")
glyph = Scatter(x="x", y="y", fill_color="color", line_color="white", line_width=0.5, size = 7)

plot.add_glyph(source, glyph)
plot.add_tools(HoverTool(tooltips=tooltips))
plot.add_tools(BoxZoomTool())
plot.add_tools(ResetTool())

show(plot)

In [None]:
# Static scatter of the current projection `xp` with a category legend.
# Legend entries: (patch colour, label), in display order.
legend_spec = [
    (rgb("#009E73"), "Health"),
    (cmap(0), "Income"),
    (rgb("#cc79a7"), "Municipality"),
    (rgb("#0072b2"), "Birthday"),
    (rgb("#e69f00"), "Social"),
    (rgb("#999999"), "Industry"),
    (rgb("#d55e00"), "Position"),
    (rgb("#000000"), "Others"),
]

fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(xp[:, 0], xp[:, 1], c=c, s=30, edgecolors="white", linewidths=0.5, alpha=0.8)
ax.axis("scaled")
p = [patches.Patch(color=patch_color, label=patch_label) for patch_color, patch_label in legend_spec]
ax.legend(handles=p)
fig.tight_layout()

# plt.savefig(save_path + "life_space.svg", format="svg")

plt.show()

In [None]:
# Full projection with dotted squares marking the regions of interest.
# Each region is a top-left corner `r*` = [x, y] plus a side length `w*`;
# the values are hand-picked coordinates in the projected space.
rS, wS = [-1, 18.2], 2.6
rP3, wP3 = [3.2, 0.5], 2
rP4, wP4 = [-9.9,-6.3], 2
rI1, wI1 = [-3.6,-17.6], 2
rINC, wINC = [17, -5], 3
rO, wO = [-17.3,-0.4], 1.2

# Legend entries: (patch colour, label), in display order.
legend_spec = [
    (rgb("#009E73"), "Health"),
    (cmap(0), "Income"),
    (rgb("#cc79a7"), "Municipality"),
    (rgb("#0072b2"), "Birthday"),
    (rgb("#e69f00"), "Social"),
    (rgb("#999999"), "Industry"),
    (rgb("#d55e00"), "Position"),
    (rgb("#000000"), "Others"),
]

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(xp[:, 0], xp[:, 1], c=c, s=50, edgecolors="white", linewidths=0.5, alpha=0.8)
p = [patches.Patch(color=patch_color, label=patch_label) for patch_color, patch_label in legend_spec]

# plot_square draws on the current axes, i.e. the ones created above.
for corner, side in ((rS, wS), (rP3, wP3), (rP4, wP4), (rI1, wI1), (rINC, wINC), (rO, wO)):
    plot_square(corner, width=side)

ax.legend(handles=p)

ax.axis("scaled")
ax.axis("off")
fig.tight_layout()
fig.savefig(save_path + "life_space.svg", format="svg")
plt.show()

In [None]:
### save areas separately
# Region to zoom into: top-left corner `r` = [x, y], side length `w`, and the
# tag used in the output file name. Swap in another r*/w* pair to export a
# different region.
r = rP4
w = wP4
title = "P4"

# plt.subplots returns a (figure, axes) pair, so unpack it instead of binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(xp[:, 0], xp[:, 1], c=c, s=500, edgecolors="white", linewidths=1, alpha=0.75)
ax.set_xlim(r[0], r[0] + w)
ax.set_ylim(r[1] - w, r[1])
# The loop variable is called `token` rather than `v`, so the notebook-level
# version number `v` is not overwritten.
for row, token in enumerate(vocab["TOKEN"]):
    ax.annotate(token, (xp[row, 0], xp[row, 1] + 0.05), annotation_clip=True, c="gray", fontsize=15)
ax.axis("off")
fig.tight_layout()
fig.savefig(save_path + "life_areas_%s.svg" % title, format="svg")
plt.show()

In [None]:
#### Cosine Distance to Concept
# Print the `top_n` tokens with the smallest cosine distance to "C16" in the
# whitened embedding space.
top_n = 10
dist = cosine_distances("C16", xh)
idx = np.argsort(dist)[:top_n]
# Loop variables are named `token`/`distance` rather than `k`/`v`, so the
# notebook-level version number `v` is not overwritten.
for token, distance in zip(vocab["TOKEN"].values[idx], dist[idx]):
    print(token, "%.3f" % distance)
