# Prototype: research-area t-SNE plot
Embed each research description and plot the embeddings in 2D with t-SNE.

In [None]:
from dotenv import load_dotenv
from community_map import download_datafile, parse, extract_metadata
import numpy as np
import pandas as pd
import altair as alt

from langchain.vectorstores import Annoy
from langchain.embeddings import OpenAIEmbeddings
from sklearn.manifold import TSNE

In [None]:
# Load environment variables from a local .env file (presumably the OpenAI
# API key used by OpenAIEmbeddings below — confirm).
load_dotenv()
# One-time bootstrap: download the source data and cache it as parquet so the
# next cell can read it from disk. Uncomment to refresh the cache.
# df = download_datafile()
# df.to_parquet("tmp/df.parquet")

In [None]:
# Read the cached data file and let the project parser split it into the
# texts to embed and their accompanying metadata.
df = pd.read_parquet("tmp/df.parquet")
parsed = parse(df)
texts, metadatas = parsed

### Annoy Document Store

In [None]:
# Embedding client handed to the Annoy store (presumably used to embed query
# strings at search time — confirm against the langchain Annoy docs).
embeddings = OpenAIEmbeddings()

# The document vectors are cached on disk. To regenerate them, run the line
# below instead of the np.load block, then save with the last cell.
# embedded = embeddings.embed_documents(texts)

with np.load("tmp/embedded.npz") as data:
    embedded = data["embedded"]

# zip() stops at the shorter input, so a cache that is out of sync with the
# parquet data would silently drop rows. Fail loudly instead.
if len(embedded) != len(texts):
    raise ValueError(
        f"Cached embeddings ({len(embedded)}) do not match texts ({len(texts)}); "
        "regenerate tmp/embedded.npz."
    )

# Push to Annoy
text_and_embedded = list(zip(texts, embedded))
doc_store = Annoy.from_embeddings(text_and_embedded, embeddings, metadatas=metadatas)

In [None]:
# Try to search with the default cosine
# Sanity-check the store: fetch the 3 closest documents (with scores) for a
# sample query, using the store's default metric.
sample_query = "Soil health"
doc_store.similarity_search_with_score(sample_query, k=3)

In [None]:
# Persist the Annoy store to disk. It can be restored later with:
# doc_store = Annoy.load_local("doc_store.ann", embeddings=embeddings)
store_path = "doc_store.ann"
doc_store.save_local(store_path)

## T-SNE

In [None]:
# Project the embeddings down to 2D for plotting.
# t-SNE is stochastic: without a fixed seed every run yields a different
# layout, so random_state is pinned to keep the saved plot reproducible.
tsne = TSNE(n_components=2, random_state=42)
embedded = np.array(embedded, dtype=np.float32)
embedded_2d = tsne.fit_transform(embedded)

In [None]:
# Pull both name fields out of the metadata, then join them into one display
# label per entry ("<first> <last>").
first_names, last_names = (
    extract_metadata(metadatas, field) for field in ("first_name", "last_name")
)
names = [f"{first} {last}" for first, last in zip(first_names, last_names)]

In [None]:
# Plotting table: the 2D t-SNE coordinates plus the label and source text
# shown for each point.
plot_df = pd.DataFrame(embedded_2d, columns=["x", "y"]).assign(
    name=names,
    text=texts,
)

In [None]:
from pathlib import Path

# Scatter layer: one circle per row, with name and text shown on hover.
base = alt.Chart(plot_df).mark_circle().encode(x="x", y="y", tooltip=["name", "text"])
# Label layer: the name drawn just to the right of each point.
name = base.mark_text(align="left", dx=5).encode(text="name")
plot = (
    (base + name)
    .interactive()
    .properties(width=800, height=600, title="Community Map Users T-SNE")
)

# Create the output directory first; saving fails if "plots/" does not exist.
plot_path = "plots/user_tsne.html"
Path(plot_path).parent.mkdir(parents=True, exist_ok=True)
plot.save(plot_path)

Save the embeddings to an `.npz` file so later runs can load them instead of re-embedding.

In [None]:
# Write the embedding matrix to the on-disk cache under the "embedded" key,
# the same file and key the np.load cell reads.
embedded_cache = "tmp/embedded.npz"
np.savez(embedded_cache, embedded=embedded)