# Visualizing language embeddings

In this notebook we will visualize UMAP representations of language embeddings together with word clouds generated from the underlying text data. This is a game: you can draw outlines around point clouds and the word cloud will update. From the word cloud of PhD topics you can guess at which institute the researchers might work. You can test if your guess is right by clicking the `Show solution` button at the bottom.

In [1]:
import pandas as pd
#import umap
import stackview
import numpy as np
import yaml

In [2]:
# Load the PhD-topic records from YAML into a DataFrame.
# The encoding is given explicitly: the topics contain non-ASCII characters
# (e.g. non-breaking hyphens), and relying on the platform default encoding
# can mis-decode them or raise UnicodeDecodeError on non-UTF-8 locales.
with open("phd_topics.yml", 'r', encoding="utf-8") as file:
    # safe_load only builds plain Python objects (no arbitrary object construction).
    data_dict = yaml.safe_load(file)
df = pd.DataFrame(data_dict)
df.head()

Unnamed: 0,TSNE0,TSNE1,UMAP0,UMAP1,embedding,name,research_field,selection,topic
0,2.162428,-7.150111,2.770866,7.708063,"[-0.0071924785152077675, 0.0039014238864183426...",Taylor Reed,Biodiversity Synthesis,1,Integrative Modeling of Multi‑Taxon Functional...
1,12.824485,10.727262,1.838338,1.169984,"[-0.005492590367794037, 0.022543391212821007, ...",Riley Jain,Biodiversity Economics,1,Quantifying the Economic Valuation of Pollinat...
2,-2.38446,5.856971,1.017747,4.52293,"[-0.0024650206323713064, 0.019827308133244514,...",Taylor Adams,Biodiversity Conservation,1,Integrative Landscape Genomics for Enhancing A...
3,3.304832,11.290336,0.807963,3.035258,"[-0.00911727361381054, 0.0035786619409918785, ...",Devon Thomas,Biodiversity & People,1,Integrating Traditional Ecological Knowledge a...
4,-7.001156,0.154177,-0.031597,6.630692,"[-0.0033709630370140076, 0.018772806972265244,...",Alex Lee,Biodiversity in the Anthropocene,1,"Integrating Genomic, Functional, and Landscape..."


In [3]:
import ipywidgets as widgets

# Scatter plot of the UMAP coordinates linked to a word cloud built from the
# "topic" column.
wc = stackview.wordcloudplot(
    df,
    column_x="UMAP0",
    column_y="UMAP1",
    column_text="topic",
)

# Extra controls placed below the plot: two buttons and a text area that
# displays the solution.
show_btn = widgets.Button(description="Show solution")
reset_btn = widgets.Button(description="Reset")

solution_layout = widgets.Layout(width='70%', height='150px')
label = widgets.Textarea(
    value="",
    placeholder="",
    description="",
    layout=solution_layout,
    disabled=True,  # read-only: the user cannot type into the solution box
)

# Placeholder string (a single newline); it is not referenced within this cell.
code_text = """
"""

def on_show(b):
    """Fill the solution box with the research fields of the selected rows.

    Parameters
    ----------
    b
        Object passed in by the caller (the clicked button when used as an
        ``on_click`` handler); it is not used.

    Notes
    -----
    Reads the notebook-level ``df`` and writes to the notebook-level ``label``.
    Rows count as selected when their ``selection`` value equals 1
    (presumably maintained by the word cloud plot -- confirm against stackview).
    """
    # Research fields of the rows currently flagged as selected.
    selected_fields = df.loc[df['selection'] == 1, 'research_field']

    # Drop missing entries, de-duplicate and sort, then show one field per line.
    distinct_fields = sorted(selected_fields.dropna().unique())
    label.value = "\n".join(distinct_fields)

def on_reset(b):
    """Clear the solution box.

    Registered both as the click handler of the reset button and as an
    observer on the word cloud widget, so ``b`` may be a button or a change
    notification; it is ignored either way.
    """
    label.value = ""

# Button callbacks: reveal the solution, or clear it again.
show_btn.on_click(on_show)
reset_btn.on_click(on_reset)

# Also clear the solution box whenever the word cloud widget reports a change,
# so that a previously shown solution does not linger.
wc.observe(on_reset)

# Layout: the plot on top, the row of controls underneath.
controls = widgets.HBox(children=[show_btn, reset_btn, label])
ui = widgets.VBox(children=[wc, controls])

# A bare expression on the last line of the cell makes the notebook render it.
ui

VBox(children=(VBox(children=(HBox(children=(HBox(children=(VBox(children=(VBox(children=(HBox(children=(VBox(…