This notebook follows these reference notebooks:
https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_rag_agent_llama3_local.ipynb
https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_self_rag_local.ipynb

TODO: change the embedding model; adapt the JSON output for Groq by using `json_mode` with `with_structured_output`: https://api.python.langchain.com/en/latest/chat_models/langchain_groq.chat_models.ChatGroq.html#langchain_groq.chat_models.ChatGroq.with_structured_output


In [1]:
# Project root used to build every input/output path in this notebook.
# Set the OPEN_MODULAR_RAG_DIR environment variable before starting the kernel
# to point at your own checkout; the default is the original author's location.
# (The variable must come from the process environment: the .env file is only
# loaded in a later cell.)
import os

working_dir = os.environ.get("OPEN_MODULAR_RAG_DIR", "/Users/pietro/open-modular-rag")

In [2]:
from dotenv import load_dotenv
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain.document_loaders import UnstructuredFileLoader
from unstructured.cleaners.core import group_broken_paragraphs
from langchain_community.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pandas as pd
import re
import string

In [3]:
# Pull variables from a local .env file (if present) into the environment,
# then read the Groq key; the value is None when the variable is not set.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

In [4]:
# Groq-hosted Llama 3 70B chat model, authenticated with the key read above.
chat = ChatGroq(
    model_name="llama3-70b-8192",
    groq_api_key=GROQ_API_KEY,
    temperature=0,
)

In [5]:
# Smoke test: a two-message prompt (system + human) piped into the chat model.
system = "You are a helpful assistant."
human = "{text}"
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", human),
    ]
)

# The `|` operator composes prompt and model into one runnable chain.
chain = prompt | chat
response = chain.invoke(
    {"text": "What NFL team won the Super Bowl in the year Justin Bieber was born?"}
)
print(response.content)

Justin Bieber was born on March 1, 1994. The NFL team that won the Super Bowl that year was the Dallas Cowboys, who defeated the Buffalo Bills in Super Bowl XXVIII on January 30, 1994.


### Define some useful functions

In [6]:
def split_pdf_pages_with_metadata(file_path, ID_name, loader_factory=None):
    """Load a PDF page by page into a DataFrame with per-page metadata.

    Args:
        file_path (str): Path to the PDF file. It is handed unchanged to the
            loader and echoed in every row's ``Metadata`` string.
        ID_name (str): Prefix for the page IDs; the n-th page (1-based) gets
            the ID ``"<ID_name> <n>"``.
        loader_factory (callable, optional): Called as
            ``loader_factory(file_path)``; must return an object whose
            ``load()`` returns an iterable of page documents, each exposing
            ``page_content`` (str) and ``metadata`` (dict). Defaults to
            ``PDFMinerLoader(file_path, concatenate_pages=False)``.

    Returns:
        pandas.DataFrame | None: One row per loaded page with the columns
        ``ID``, ``Content``, ``Metadata`` and ``DocumentType``. The columns
        are present even when the loader yields no pages. ``None`` is
        returned (after printing the error) if loading or processing raises.
    """
    columns = ["ID", "Content", "Metadata", "DocumentType"]

    try:
        if loader_factory is None:
            # PDFMiner is used instead of UnstructuredFileLoader(mode="paged",
            # strategy="fast"), which did not work on the author's system.
            loader = PDFMinerLoader(file_path, concatenate_pages=False)
        else:
            loader = loader_factory(file_path)
        pages = loader.load()

        records = []
        for page_number, page in enumerate(pages, start=1):
            # Collapse every run of whitespace (including newlines) to one space.
            page_content = " ".join(page.page_content.split())
            # "28Apr2024" is a fixed placeholder used when the loader supplies
            # no "last_modified" entry (it replaced an earlier "N/A").
            last_modified = page.metadata.get("last_modified", "28Apr2024")

            records.append({
                "ID": f"{ID_name} {page_number}",  # unique per page
                "Content": page_content,
                "Metadata": f"Source: {file_path}, Page: {page_number}, Last Modified: {last_modified}",
                "DocumentType": "Content: True",
            })

        # Passing `columns` keeps the schema stable for an empty page list, so
        # callers can always access df["Content"].
        return pd.DataFrame(records, columns=columns)

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"Error processing PDF file {file_path}: {e}")
        return None


def clean_document_content(content: str, words_to_remove=None):
    """Strip boilerplate phrases, repeated words and extra whitespace from text.

    Args:
        content (str): Raw text, e.g. one value of ``df["Content"]``.
        words_to_remove (str | list[str], optional): Phrase(s) to delete,
            matched case-insensitively and only when not embedded in a longer
            word. Defaults to ``["More Agents Is All You Need"]``. An empty
            list disables phrase removal; a single string is treated as one
            phrase.

    Returns:
        str: The cleaned text.
    """
    if words_to_remove is None:
        words_to_remove = ["More Agents Is All You Need"]
    elif isinstance(words_to_remove, str):
        # Avoid iterating over the characters of a lone string.
        words_to_remove = [words_to_remove]

    # Empty entries would make the alternation match everywhere; drop them.
    escaped_phrases = [re.escape(phrase) for phrase in words_to_remove if phrase]

    if escaped_phrases:
        # Lookarounds behave exactly like \b for phrases that start and end
        # with a word character, and also work for phrases that start or end
        # with punctuation (where \b would never match).
        pattern = r'(?<!\w)(?:' + '|'.join(escaped_phrases) + r')(?!\w)'
        content = re.sub(pattern, '', content, flags=re.IGNORECASE)

    # Collapse a word repeated consecutively (separated by one space,
    # case-sensitive) to a single occurrence.
    content = re.sub(r'\b(\w+)( \1\b)+', r'\1', content)

    # Collapse whitespace runs and trim both ends.
    content = re.sub(r'\s+', ' ', content).strip()

    return content


def preprocess_text(text: str):
    """Normalise text: lowercase, no punctuation, single spaces.

    Args:
        text (str): Raw input text.

    Returns:
        str: ``text`` lowercased, with every character listed in
        ``string.punctuation`` deleted, whitespace runs collapsed to a single
        space, and leading/trailing whitespace removed.
    """
    # Translation table that deletes all ASCII punctuation characters.
    drop_punctuation = str.maketrans('', '', string.punctuation)

    lowered = text.lower()
    unpunctuated = lowered.translate(drop_punctuation)
    return re.sub(r'\s+', ' ', unpunctuated).strip()


def create_chunks_with_unique_ids(df, text_splitter):
    """Split each row's content into chunks and give every chunk a unique ID.

    Args:
        df (pandas.DataFrame): Frame with the columns ``ID``, ``Content`` and
            ``Metadata`` (one row per document/page).
        text_splitter: Any object with a ``split_text(str) -> list[str]``
            method, e.g. a ``RecursiveCharacterTextSplitter``.

    Returns:
        pandas.DataFrame: One row per chunk with the columns ``Chunk_ID``
        (``"<ID> - Chunk <n>"``, n starting at 1 within each input row),
        ``Content`` (the chunk text) and ``Metadata`` (copied from the source
        row). The columns are present even when no chunks are produced.
    """
    chunks = []

    for _, row in df.iterrows():
        doc_id = row['ID']
        metadata = row['Metadata']

        # Chunk numbering restarts at 1 for every input row.
        for chunk_index, chunk_content in enumerate(text_splitter.split_text(row['Content']), start=1):
            chunks.append({
                'Chunk_ID': f"{doc_id} - Chunk {chunk_index}",
                'Content': chunk_content,
                'Metadata': metadata,
            })

    # Explicit columns keep the schema stable when `chunks` is empty.
    return pd.DataFrame(chunks, columns=['Chunk_ID', 'Content', 'Metadata'])

### Load the docs

In [7]:
# Source PDF, expected under <working_dir>/docs.
doc_path = working_dir + "/docs/2402.05120v1.pdf"

# Load the PDF and split it into one row per page, each with its metadata
# (source path, page number, last-modified date).
article_df = split_pdf_pages_with_metadata(doc_path, "more_agents_arxiv_paper")

# The loader reports failures by printing and returning None. Stop here with a
# clear message instead of failing later with an opaque error on a None frame.
if article_df is None:
    raise RuntimeError(f"Could not load {doc_path}; see the error printed above.")

In [8]:
# Clean the page text in two passes: regroup broken paragraphs, then apply
# clean_document_content (phrase removal, duplicate words, extra whitespace).
article_df['Content'] = (
    article_df['Content']
    .apply(group_broken_paragraphs)
    .apply(clean_document_content)
)
article_df.head()

Unnamed: 0,ID,Content,Metadata,DocumentType
0,more_agents_arxiv_paper 1,Junyou Li * 1 Qin Zhang * 1 Yangbin Yu 1 Qiang...,Source: /Users/pietro/open-modular-rag/docs/24...,Content: True
1,more_agents_arxiv_paper 2,"paths. In fact, it can be used as a plug-in to...",Source: /Users/pietro/open-modular-rag/docs/24...,Content: True
2,more_agents_arxiv_paper 3,Algorithm 1 Sampling-and-voting Require: Query...,Source: /Users/pietro/open-modular-rag/docs/24...,Content: True
3,more_agents_arxiv_paper 4,Table 1. Comparing the conducted experiments w...,Source: /Users/pietro/open-modular-rag/docs/24...,Content: True
4,more_agents_arxiv_paper 5,Figure 3. The accuracy scales with the ensembl...,Source: /Users/pietro/open-modular-rag/docs/24...,Content: True


In [9]:
# Splitter configuration: chunk size 400 with an overlap of 25 between
# neighbouring chunks.
text_splitter_400 = RecursiveCharacterTextSplitter(chunk_overlap=25, chunk_size=400)

In [10]:
# Split every page into chunks with the splitter configured above.
# NOTE: this rebinds `article_df` from the page-level frame (column "ID") to
# the chunk-level frame (column "Chunk_ID"), so re-running this cell without
# first re-running the load and clean cells raises a KeyError on "ID".
article_df = create_chunks_with_unique_ids(
    df=article_df,
    text_splitter=text_splitter_400,
)
article_df.head()

Unnamed: 0,Chunk_ID,Content,Metadata
0,more_agents_arxiv_paper 1 - Chunk 1,Junyou Li * 1 Qin Zhang * 1 Yangbin Yu 1 Qiang...,Source: /Users/pietro/open-modular-rag/docs/24...
1,more_agents_arxiv_paper 1 - Chunk 2,"LLMs, while the degree of enhancement is cor- ...",Source: /Users/pietro/open-modular-rag/docs/24...
2,more_agents_arxiv_paper 1 - Chunk 3,"in variety of applications (Zhao et al., 2023)...",Source: /Users/pietro/open-modular-rag/docs/24...
3,more_agents_arxiv_paper 1 - Chunk 4,"Wu et al., 2023). In these works, multiple LLM...",Source: /Users/pietro/open-modular-rag/docs/24...
4,more_agents_arxiv_paper 1 - Chunk 5,"to using one single agent. Similarly, CoT-SC (...",Source: /Users/pietro/open-modular-rag/docs/24...


In [11]:
# Persist the preprocessed, chunk-level frame as a parquet file in the working directory.
article_df.to_parquet(f"{working_dir}/moreAgentsPaper.parquet")

### Retriever