In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import altair as alt

load_dotenv()


True

In [2]:
def get_datafile():
    """Get raw data file from community map API."""

    api_key = os.getenv("COMMUNITY_MAP_KEY")
    url = f"https://maps.datascience.wisc.edu/apis/authentication/public/api/contact-info?password={api_key}&format=csv"
    return pd.read_csv(url)


def parse(df) -> tuple[list, list[dict]]:
    """Parse dataframe to get texts for embeddings and its metadatas."""

    data = df[["First name", "Last name", "Email address", "Research summary"]].dropna()
    data["name"] = data["First name"] + " " + data["Last name"]
    data["text"] = data["name"] + "; " + data["Research summary"]
    data.drop(["First name", "Last name", "Research summary"], axis=1, inplace=True)
    data.rename(columns={"Email address": "email"}, inplace=True)

    texts = data["text"].tolist()
    metadatas = data[["email", "name"]].to_dict(orient="records")
    return texts, metadatas


def extract_metadata(metadatas: list[dict], key: str) -> list:
    """Extract metadata values."""

    values = []
    for metadata in metadatas:
        values.append(metadata[key])
    return values

In [3]:
df = get_datafile()
texts, metadatas = parse(df)


### Annoy Document Store

In [4]:
from langchain.vectorstores import Annoy
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
embedded = embeddings.embed_documents(texts)

# Push to Annoy
text_and_embedded = list(zip(texts, embedded))
doc_store = Annoy.from_embeddings(text_and_embedded, embeddings)


In [5]:
doc_store.similarity_search_with_score("Soil health", k=3)


[(Document(page_content='Jamie Patton; Outreach Specialist with the Nutrient and Pest Management Program.  In this role, I provide education to farmers, technical service providers, and agency professionals on conservation cropping systems, focusing on soil health and management.  My research is focused on the agronomic and environmental impact of integrating cover crops and alternative forages into annual cropping systems.', metadata={}),
  0.6141745448112488),
 (Document(page_content='Jingyi Huang; remote sensing; proximal soil sensing; digital soil mapping; soil physics; soil vegetation atmosphere interaction; geospatial big data analytics;', metadata={}),
  0.6238724589347839),
 (Document(page_content='Kent Weigel; Whole-genome selection selection of dairy cattle for increasing feed efficiency, reducing methane emissions, and improving animal health', metadata={}),
  0.6330299973487854)]

In [6]:
doc_store.save_local("doc_store.ann")
doc_store = Annoy.load_local("doc_store.ann", embeddings=embeddings)


## T-SNE

In [7]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2)
embedded_2d = tsne.fit_transform(np.array(embedded))


In [8]:
plot_df = pd.DataFrame(embedded_2d, columns=["x", "y"])
plot_df["name"] = extract_metadata(metadatas, "name")
plot_df["text"] = texts

In [9]:
base = alt.Chart(plot_df).mark_circle().encode(
    x="x", y="y", tooltip=["name", "text"])
name = base.mark_text(align="left", dx=5).encode(text="name")
(base + name).interactive().properties(
    width=800, height=600, title="Community Map Users T-SNE"
)
