In [1]:
import random
import numpy as np
import pandas as pd
import torch 
from sentence_transformers import SentenceTransformer, util
from time import perf_counter as timer


# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Sentence-embedding model used to encode queries, placed on the chosen device.
embedding_model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2",
    device=device,
)



In [2]:
embeddings_df_save_path = "text_chunk_and_embeddings_df.csv"

# Load the saved chunks; the "embedding" column comes back from CSV as text.
text_chunks_and_embedding_df = pd.read_csv(embeddings_df_save_path)

# Parse each embedding string (brackets stripped) back into a numeric array.
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(
    lambda x: np.fromstring(x.strip("[]"), sep="  ")
)

# One dict per row (with the parsed embedding); row order matches `embeddings` below,
# so an index returned by retrieval can be used directly to look up its record.
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Stack the per-row arrays into a single float32 tensor on the target device.
embeddings = torch.tensor(
    np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0),
    dtype=torch.float32,
).to(device)

In [3]:
# Sanity check: (number of stored chunk embeddings, embedding dimension).
embeddings.shape

torch.Size([1680, 768])

In [4]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model: SentenceTransformer = embedding_model,
                                n_resources_to_return: int = 5,
                                print_time: bool = True):
    """
    Embeds `query` with `model` and returns the best-matching rows of `embeddings`.

    Args:
        query: Text to search for.
        embeddings: Tensor of stored embeddings, one row per chunk.
        model: Embedding model used to encode the query.
        n_resources_to_return: Maximum number of results. Clamped to the number
            of stored embeddings so that `torch.topk` cannot raise on small corpora.
        print_time: When True, print how long the similarity computation took.

    Returns:
        A `(scores, indices)` tuple as produced by `torch.topk` over the
        dot-product scores; `indices` are row positions in `embeddings`.
    """
    # Keep the query on the same device as the stored embeddings so the dot product is valid.
    query_embedding = model.encode(query, convert_to_tensor=True).to(embeddings.device)

    start_time = timer()
    # dot_score returns a (queries x embeddings) matrix; take the row for the single query.
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

    # topk raises if k exceeds the number of available scores.
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores, k=k)
    return scores, indices

In [5]:
# Smoke-test retrieval on an example query; the cell displays the returned (scores, indices).
query = "foods high in fiber"
retrieve_relevant_resources(query=query, embeddings=embeddings)

(tensor([0.6964, 0.6810, 0.5566, 0.5344, 0.5187], device='cuda:0'),
 tensor([ 418,  360,  358, 1047,  412], device='cuda:0'))

In [6]:
import os
from dotenv import load_dotenv

load_dotenv()  # loads from .env file
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")

# Fail fast with a clear message instead of passing None on to the API client.
if not TOGETHER_API_KEY:
    raise RuntimeError(
        "TOGETHER_API_KEY is not set. Add it to your environment or to a .env file."
    )

In [7]:
from together import Together

# Together AI client, authenticated with the key read from the environment.
client = Together(api_key=TOGETHER_API_KEY)

In [8]:
def prompt_formatter(query: str, 
                     context_items: list[dict]) -> list[dict]:
    """
    Augments query with text-based context from context_items.

    Args:
        query: The user question to answer.
        context_items: Retrieved records; each must contain a "sentence_chunk" key
            holding the text to show the model.

    Returns:
        Chat messages (a system message followed by a user message) ready to be
        passed to a chat-completion endpoint.
    """
    # Render the context as a bulleted list, one "- " item per chunk
    context = "- " + "\n- ".join(item["sentence_chunk"] for item in context_items)

    # Base prompt with three worked examples showing the desired answer style.
    # This is freely customizable and could also be loaded from a text file.
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Update base prompt with context items and query   
    full_prompt = base_prompt.format(context=context, query=query)

    # Wrap as chat messages for an instruction-tuned model
    return [
        {"role": "system", "content": "You are a helpful assistant that answers based only on the given context."},
        {"role": "user", "content": full_prompt}
    ]

In [9]:
def ask(query, 
        temperature=0.7,
        max_tokens=512,
        format_answer_text=True, 
        return_answer_only=True,
        llm_model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"):
    """
    Queries Together AI's chat completion endpoint with retrieved context.

    Args:
        query: The user question.
        temperature: Sampling temperature forwarded to the completion request.
        max_tokens: Maximum number of tokens forwarded to the completion request.
        format_answer_text: When True, strip surrounding whitespace from the answer.
        return_answer_only: When True, return just the answer text; otherwise
            return `(answer_text, context_items)`.
        llm_model: Name of the Together chat model to call.

    Returns:
        The answer string, or a tuple of the answer string and the list of
        context records (each with an added "score" entry).
    """

    # Retrieve top context
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Shallow-copy each record so attaching the score below does not mutate the
    # shared pages_and_chunks entries; .tolist() yields plain ints for indexing.
    context_items = [dict(pages_and_chunks[i]) for i in indices.tolist()]

    # Attach each similarity score (moved to CPU) to its record
    for item, score in zip(context_items, scores.cpu()):
        item["score"] = score

    # Format into Together-style chat messages
    messages = prompt_formatter(query=query, context_items=context_items)

    # Query Together AI API
    response = client.chat.completions.create(
        model=llm_model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens
    )

    output_text = response.choices[0].message.content

    # Optionally format output
    if format_answer_text:
        output_text = output_text.strip()

    if return_answer_only:
        return output_text

    return output_text, context_items

In [13]:
# End-to-end demo: retrieve context for one question and generate an answer.
query = "is protein dangerous for the body"
print(f"Query: {query}")

# Request the context alongside the answer so both can be inspected.
answer, context_items = ask(query=query, return_answer_only=False)

print("Answer:\n")
print(answer)
print("Context items:")
context_items

Query: is protein dangerous for the body
Answer:

Protein is not inherently dangerous for the body, but excessive consumption can have negative health consequences. High-protein diets, defined as those that derive more than 30 percent of calories from protein, have been linked to a higher risk of kidney stones, kidney disease, liver malfunction, colorectal cancer, and osteoporosis, particularly when the primary protein source is red meat. However, it is essential to note that diets high in red meat are also often high in saturated fat and cholesterol, making it challenging to determine if the high protein content is the primary culprit.

Additionally, some studies suggest that high-protein diets may accelerate bone-tissue loss by blocking calcium absorption in the gut and promoting calcium loss from bone. The Nurses' Health Study found that women who consumed more than 95 grams of protein per day had a 20 percent higher risk of wrist fracture. Nevertheless, the scientific data on high 

[{'page_number': 400,
  'sentence_chunk': 'as those that derive more than 30 percent of calories from protein. Many people follow high-protein diets because marketers tout protein’s ability to stimulate weight loss. It is true that following high-protein diets increases weight loss in some people. However the number of individuals that remain on this type of diet is low and many people who try the diet and stop regain the weight they had lost. Additionally, there is a scientific hypothesis that there may be health consequences of remaining on high-protein diets for the long-term, but clinical trials are ongoing or scheduled to examine this hypothesis further. As the high-protein diet trend arose so did the intensely debated issue of whether there are any health consequences of eating too much protein. Observational studies conducted in the general population suggest diets high in animal protein, specifically those in which the primary protein source is red meat, are linked to a higher 