# Visualizing language embeddings

In this notebook we will visualize UMAP representations of language embeddings together with word clouds generated from the underlying text data. This is a game: you can draw outlines around point clouds and the word cloud will update. From the word cloud of PhD topics, you can guess at which institute the researchers might work. You can check whether your guess is right by clicking the `Show solution` button at the bottom.

In [1]:
import pandas as pd
#import umap
import stackview
import numpy as np
import yaml

In [2]:
# Load the PhD-topic records from the YAML file into a DataFrame.
# The encoding is pinned to UTF-8 because the institute names contain
# non-ASCII characters (e.g. German umlauts), and the platform default
# encoding used by open() is not UTF-8 on every system.
with open("phd_topics.yml", "r", encoding="utf-8") as file:
    data_dict = yaml.safe_load(file)
df = pd.DataFrame(data_dict)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,TSNE0,TSNE1,UMAP0,UMAP1,embedding,name,research_field,selection,topic
0,2.548442,8.969955,-1.020463,9.660064,"[0.019196026027202606, 0.010897933505475521, -...",Taylor Reed,FIZ-KA - Leibniz-Institut für Informationsinfr...,1,"Digital Archives, Embodied Knowledge, and the ..."
1,-12.006737,-13.473205,1.41924,6.479001,"[0.016622617840766907, -0.009818249382078648, ...",Riley Jain,HKI - Leibniz-Institut für Naturstoff-Forschun...,1,Microbial Secondary Metabolites and Narrative:...
2,-10.38535,12.543368,4.321546,4.795953,"[-0.016480615362524986, 0.014093692414462566, ...",Taylor Adams,IÖR - Leibniz-Institut für ökologische Raument...,1,Spatial Imaginaries of Ecological Transition: ...
3,8.713301,-10.254994,0.332513,10.49652,"[0.008821303024888039, 0.005257884040474892, -...",Devon Thomas,"IWM - Leibniz-Institut für Wissensmedien, Tübi...",1,Algorithmic Storytelling and the Evolution of ...
4,-12.952323,3.004944,2.245961,3.266387,"[-0.02530881017446518, 0.004650192800909281, -...",Alex Lee,MfN - Museum für Naturkunde - Leibniz-Institut...,1,The Literary Ecology of Scientific Illustratio...


In [4]:
import ipywidgets as widgets

# Word-cloud plot widget from stackview, fed with the UMAP coordinates
# and the topic text of every row in df.
wc = stackview.wordcloudplot(
    df,
    column_x="UMAP0",
    column_y="UMAP1",
    column_text="topic",
)

# Extra controls: two buttons plus a text area for the solution.
reset_btn = widgets.Button(description="Reset")
show_btn = widgets.Button(description="Show solution")
label = widgets.Textarea(
    value="",
    placeholder="",
    description="",
    disabled=True,  # the text area is output-only
    layout=widgets.Layout(width="70%", height="150px"),
)

# String holding a single newline; nothing else in this cell reads it.
code_text = "\n"

def on_show(b):
    """Write the research fields of the selected rows into the text area.

    Rows of the notebook-level ``df`` whose ``selection`` column equals 1
    are kept. Their ``research_field`` values are stripped of missing
    entries, de-duplicated and sorted, then joined one per line into
    ``label.value``.

    Parameters
    ----------
    b : object
        Argument supplied by the widget callback machinery; not used.
    """
    is_selected = df['selection'] == 1
    selected_fields = df.loc[is_selected, 'research_field'].dropna()

    # np.unique returns the distinct values in sorted order.
    distinct_fields = np.unique(selected_fields)
    label.value = "\n".join(distinct_fields)

def on_reset(b):
    """Clear the solution text area.

    Parameters
    ----------
    b : object
        Argument supplied by the caller (a button click or an observed
        change); not used.
    """
    label.value = ""

# Button clicks: reveal the solution or clear it.
reset_btn.on_click(on_reset)
show_btn.on_click(on_show)

# Clear the solution whenever an observed trait of the plot widget changes.
# NOTE(review): no `names` argument is given, so this listens to every trait
# of `wc` - confirm that drawing a new selection actually triggers it.
wc.observe(on_reset)

# Layout: plot on top, controls in a single row underneath.
controls = widgets.HBox([show_btn, reset_btn, label])
ui = widgets.VBox([wc, controls])

# A bare trailing expression makes the notebook render the combined UI.
ui

VBox(children=(VBox(children=(HBox(children=(HBox(children=(VBox(children=(VBox(children=(HBox(children=(VBox(…