In [None]:
import pandas as pd
from io import BytesIO
from PIL import Image
from base64 import b64decode
from langchain.schema import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.embeddings.fastembed import FastEmbedEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DataFrameLoader
from langchain.schema.runnable import RunnablePassthrough

from utils import format_as_doc, format_as_doc, llm, decode_image

# 1. Data Exploration

In [None]:
# Remote JSON file holding the organizations dataset used by the rest of the notebook.
ORGANIZATIONS_URL = "https://storage.googleapis.com/swe-workshop-23/organizations.json"

df = pd.read_json(ORGANIZATIONS_URL)
df

In [None]:
# Boolean mask marking the row(s) whose title is exactly "Nebula Labs".
is_nebula = df["title"].eq("Nebula Labs")
nebula = df[is_nebula]
nebula

In [None]:
# Take the picture_data value of the first selected row and hand it to the
# decode_image helper imported from utils; the helper's result is the cell output.
picture_column = nebula["picture_data"]
encoded_image = picture_column.iloc[0]
decode_image(encoded_image)

In [None]:
# Apply format_as_doc to every row (axis=1) and keep the result in a new
# "content" column on the same frame.
df["content"] = df.apply(format_as_doc, axis=1)

# Select the Nebula Labs row again so the selection carries the new column.
nebula = df[df["title"].eq("Nebula Labs")]
nebula_content = nebula["content"]

print(nebula_content.iloc[0])

# 2. Interacting with the LLM

In [None]:
# Call the model directly with an unfinished piece of text and print what comes back.
opening_text = "Hi! My name is Amrit! I'm a member of Nebula Labs, a student organization on campus dedicated to Open-Source projects built by students, for students. Today I will be talking to you about "
response = llm(opening_text)
print(response)

In [None]:
# Template with a single {interests} placeholder; no search results are
# supplied to the model at this stage.
basic_template = """You are a student organization recommendation assistant. Given the user's interests, recommend a student organization on campus.

User interests: {interests}

Recommendation: """

prompt = PromptTemplate.from_template(basic_template)

# Pipe the filled-in prompt into the model, then through the string output parser.
chain = prompt | llm | StrOutputParser()

sample_input = {"interests": "computer science, soccer"}
response = chain.invoke(sample_input)
print(response)

# 3. Retrieving Context

In [None]:
# Only the title and content columns go to the loader; "content" is used as
# each document's page content.
org_text_frame = df[["title", "content"]]
loader = DataFrameLoader(org_text_frame, page_content_column="content")
documents = loader.load()

# Embed the loaded documents and index them in a Chroma vector store.
# NOTE(review): re-running this cell in the same kernel may add the same
# documents to an already-populated collection — confirm against the
# installed Chroma version.
embeddings = FastEmbedEmbeddings(max_length=512)
docsearch = Chroma.from_documents(documents, embeddings)

In [None]:
# Retrieval check: ask the vector store for the 5 documents closest to a sample query.
sample_query = "computer science, soccer"
docsearch.similarity_search(sample_query, k=5)

# 4. Integrating Context

In [None]:
# Expose the vector store as a retriever configured with k=5 results per query.
retriever = docsearch.as_retriever(search_kwargs={"k": 5})

# Prompt with two placeholders: {interests} (the user's input) and {context}
# (the formatted search results). The template ends on an unfinished sentence,
# presumably so the model continues it with the organization's name.
# Spelling fixes in the prompt text: "relevent"/"relvent" -> "relevant",
# "users interests" -> "user's interests".
prompt = PromptTemplate.from_template(
    """You are a student organization recommendation assistant. Given the user's
    interests and some relevant search results from the campus student
    organization directory, recommend a student organization on campus as plain
    text, and give a short description of the organization.

User Interests: {interests}

Search Results:
=============
{context}
=============

Given the user's interests and some relevant search results from the campus student organization directory, the recommended organization is """
)


def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in their original order, separated by
        three newline characters. An empty input yields an empty string.
    """
    page_texts = []
    for doc in docs:
        page_texts.append(doc.page_content)
    separator = "\n\n\n"
    return separator.join(page_texts)


# Inputs for the prompt: "context" is the retriever's output run through
# format_docs, and "interests" is filled by RunnablePassthrough from the
# value the chain is invoked with.
prompt_inputs = {
    "context": retriever | format_docs,
    "interests": RunnablePassthrough(),
}

chain = prompt_inputs | prompt | llm | StrOutputParser()

In [None]:
# Run the retrieval-backed chain on a sample interest string and print the answer.
user_interests = "women in stem"
response = chain.invoke(user_interests)
print(response)