This notebook follows these reference notebooks:
https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_rag_agent_llama3_local.ipynb
https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_self_rag_local.ipynb

TODO: switch to a different embedding model; adapt the JSON output for Groq by using `json_mode` in `with_structured_output`: https://api.python.langchain.com/en/latest/chat_models/langchain_groq.chat_models.ChatGroq.html#langchain_groq.chat_models.ChatGroq.with_structured_output


Using `python 3.11.9`

In [1]:
# Project working directory (a plain string; later cells append "/<name>" to it).
# Set the OPEN_MODULAR_RAG_DIR environment variable to point at your own checkout;
# when it is unset or empty, the original author's local path is used as a fallback.
import os

working_dir = (
    os.environ.get("OPEN_MODULAR_RAG_DIR")
    or "/Users/pietromascheroni/open-modular-rag"
)

In [2]:
from dotenv import load_dotenv
import os
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from torch import cuda
from typing import Callable
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import chromadb

import pandas as pd
import re
import string

In [3]:
# Let python-dotenv populate the process environment, then look up the Groq API key
# (the lookup yields None when the variable is not set).
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

### Utils

In [4]:
def parse_metadata(metadata_str: str) -> dict:
    """Parse one metadata cell formatted as "key: value, key: value" into a dict.

    The function works on a single value (it is meant to be passed to
    ``Series.apply``), not on a whole column.

    Args:
        metadata_str: The raw metadata string of one row. Missing values
            (``None``/``NaN``) are accepted. A dict that was already parsed is
            accepted too, so applying the function twice to the same column
            does not fail.

    Returns:
        dict: Mapping of stripped keys to stripped string values. Empty when the
        input is missing or contains no ``"key: value"`` pairs.
    """
    # Already parsed: hand back a shallow copy instead of failing on `.split`.
    if isinstance(metadata_str, dict):
        return dict(metadata_str)

    metadata_dict = {}
    if pd.notna(metadata_str):
        # Pairs are separated by ", "; parts without a ": " separator are skipped.
        for part in metadata_str.split(", "):
            if ": " in part:
                # Split on the first ": " only, so values may contain ": " themselves.
                key, value = part.split(": ", 1)
                metadata_dict[key.strip()] = value.strip()
    return metadata_dict

## Load the docs

In [5]:
# store preprocessed and chunked data to a defined directory
combined_df = pd.read_parquet(working_dir + '/moreAgentsPaper.parquet', engine='fastparquet')
combined_df.head()


Unnamed: 0,Chunk_ID,Content,Metadata
0,more_agents_arxiv_paper 1 - Chunk 1,4 2 0 2 b e F 3 ] L C . s c [ 1 v 0 2 1 5 0 . ...,Source: /Users/pietromascheroni/open-modular-r...
1,more_agents_arxiv_paper 1 - Chunk 2,"LLMs, while the degree of enhancement is cor- ...",Source: /Users/pietromascheroni/open-modular-r...
2,more_agents_arxiv_paper 1 - Chunk 3,"in variety of applications (Zhao et al., 2023)...",Source: /Users/pietromascheroni/open-modular-r...
3,more_agents_arxiv_paper 1 - Chunk 4,"Wu et al., 2023). In these works, multiple LLM...",Source: /Users/pietromascheroni/open-modular-r...
4,more_agents_arxiv_paper 1 - Chunk 5,"to using one single agent. Similarly, CoT-SC (...",Source: /Users/pietromascheroni/open-modular-r...


In [6]:
# Replace each raw metadata string with its parsed dict (overwrites the column in place)
parsed_metadata = combined_df["Metadata"].apply(parse_metadata)
combined_df["Metadata"] = parsed_metadata
# Show the first two parsed entries as a sanity check
combined_df["Metadata"].tolist()[:2]

[{'Source': '/Users/pietromascheroni/open-modular-rag/docs/2402.05120v1.pdf',
  'Page': '1',
  'Last Modified': '2024-05-02T21:13:10'},
 {'Source': '/Users/pietromascheroni/open-modular-rag/docs/2402.05120v1.pdf',
  'Page': '1',
  'Last Modified': '2024-05-02T21:13:10'}]

In [7]:
# extract elements from dataframe and put them in a format suitable for chromadb
metadatas = combined_df['Metadata'].tolist()
ids = combined_df[['Chunk_ID']].apply(lambda x: ' '.join(x.dropna().values.tolist()), axis=1).tolist()
documents_all = combined_df[['Content']].apply(lambda x: ' '.join(x.dropna().values.tolist()), axis=1).tolist() 

### Initialize embedding model and embed chunks

In [8]:
# Identifier of the embedding model to load
embed_model_id = 'sentence-transformers/all-mpnet-base-v2'

# Use the current CUDA device when one is available, otherwise fall back to the CPU
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Keyword arguments handed to HuggingFaceEmbeddings for loading and for encoding
embed_model_kwargs = {'device': device}
embed_encode_kwargs = {'device': device, 'batch_size': 32}

# Initialize the embedding model; model files are cached under the working directory
embedding_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=embed_model_kwargs,
    encode_kwargs=embed_encode_kwargs,
    cache_folder=working_dir + '/emb_model',
)



In [9]:
# Perform Embedding
embeddings = embedding_model.embed_documents(documents_all)

print(f"We have {len(embeddings)} doc embeddings, each with "
      f"a dimensionality of {len(embeddings[0])}.")

We have 139 doc embeddings, each with a dimensionality of 768.


In [10]:
# Set up ChromaDB with a persistent client stored under the working directory;
# the collection holding the indices of all documents is created from this client.
# (In case of errors, run `pip uninstall chromadb` followed by `pip install chromadb`.)
vectordb_path = working_dir + "/vectordb"
chroma_client = chromadb.PersistentClient(path=vectordb_path)

In [11]:
# provide a name to setup and reference the vector index
collection_name = "more_agents_paper_self_rag"
# initialize the vector index with the respective similarity search metric
vectorstore = chroma_client.get_or_create_collection(collection_name, metadata={"hnsw:space": "cosine"})

In [12]:
# Update the vector index with the prepared data (ids, texts, metadata, embeddings)
upsert_payload = {
    "ids": ids,
    "documents": documents_all,
    "metadatas": metadatas,
    "embeddings": embeddings,
}
vectorstore.upsert(**upsert_payload)

In [14]:
# Report how many chunks the collection currently holds
count_message = f"We have {vectorstore.count()} chunks in the vector store"
print(count_message)

We have 139 chunks in the vector store
