In [None]:

!pip install langchain
!pip install unstructured # The unstructured library provides open-source components for pre-processing text documents such as PDFs, HTML and Word Documents.
!pip install openai
!pip install pybind11 # pybind11 is a lightweight header-only library that exposes C++ types in Python
!pip install chromadb # the AI-native open-source embedding database
!pip install Cython # Cython is an optimising static compiler for both the Python programming language
!pip3 install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI" # COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation
!pip install unstructured[local-inference]
!CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" pip install 'git+https://github.com/facebookresearch/detectron2.git' # Detectron2 is Facebook AI Research's next generation library that provides state-of-the-art detection and segmentation algorithms.
!pip install layoutparser[layoutmodels,tesseract] # A Unified Toolkit for Deep Learning Based Document Image Analysis
!pip install pytesseract # Python-tesseract is an optical character recognition (OCR) tool for python.
!pip install Pillow==9.0.0 # The Python Imaging Library adds image processing capabilities to your Python interpreter. Need this version, otherwise errors occur.

In [None]:
# Install the Pinecone client that the later cells import (`from pinecone import Pinecone`).
%pip install pinecone-client

In [82]:
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage

# Chat model served by Ollama, configured with a low sampling temperature.
llm = ChatOllama(model="llama3.1", temperature=0.2)

# Demo conversation: one system instruction followed by one user question.
system_message = SystemMessage(content="Act as helpful assistant")
human_message = HumanMessage(content="what is logic behind calculus?")
messages = [system_message, human_message]
# model_response = llm.invoke(messages)

In [4]:
# model_response

AIMessage(content='Calculus is a branch of mathematics that deals with the study of continuous change, particularly in the context of functions and limits. The logic behind calculus can be broken down into several key concepts:\n\n1. **Limits**: Calculus starts with the concept of limits, which allows us to study how a function behaves as its input gets arbitrarily close to a certain value. Limits are used to define the basic operations of calculus, such as differentiation and integration.\n2. **Differentiation**: Differentiation is the process of finding the rate at which a function changes as its input changes. This is done by calculating the derivative of the function, which represents the slope of the tangent line to the graph of the function at a given point.\n3. **Integration**: Integration is the process of finding the area under a curve or the accumulation of a quantity over an interval. This is done by calculating the definite integral of the function, which represents the tot

In [3]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

In [8]:
!wget https://s21.q4cdn.com/399680738/files/doc_financials/2022/q4/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf

--2024-08-11 22:12:33--  https://s21.q4cdn.com/399680738/files/doc_financials/2022/q4/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf
Resolving s21.q4cdn.com (s21.q4cdn.com)... 139.99.123.118
Connecting to s21.q4cdn.com (s21.q4cdn.com)|139.99.123.118|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 185815 (181K) [application/pdf]
Saving to: ‘Meta-12.31.2022-Exhibit-99.1-FINAL.pdf’


2024-08-11 22:12:37 (64.5 KB/s) - ‘Meta-12.31.2022-Exhibit-99.1-FINAL.pdf’ saved [185815/185815]



In [13]:
from langchain_community.document_loaders import PyPDFLoader

# Load the downloaded earnings release with PyPDFLoader and split it into documents.
pdf_path = "Meta-12.31.2022-Exhibit-99.1-FINAL.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

In [4]:
from langchain_community.document_loaders import DirectoryLoader
# Load every PDF found (recursively) under the directory below.
# NOTE(review): this is an absolute, machine-specific path — adjust it before running elsewhere.
pdf_directory = '/Users/shaonsikder/Downloads/Horizon/Generative AI/'
loader = DirectoryLoader(pdf_directory, glob="**/*.pdf", show_progress=True)
pdf_docs = loader.load()

  0%|          | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 1/1 [00:02<00:00,  2.84s/it]


In [17]:
# Display the loaded documents (full repr, including all page content — output is long).
pdf_docs

[Document(metadata={'source': '/Users/shaonsikder/Downloads/Horizon/Generative AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf'}, page_content='Meta Reports Fourth Quarter and Full Year 2022 Results\n\nMENLO PARK, Calif. – February 1, 2023 – Meta Platforms, Inc. (Nasdaq: META) today reported financial results for the quarter and full year ended December 31, 2022.\n\n"Our community continues to grow and I\'m pleased with the strong engagement across our apps. Facebook just reached the milestone of 2 billion daily actives," said Mark Zuckerberg, Meta founder and CEO. "The progress we\'re making on our AI discovery engine and Reels are major drivers of this. Beyond this, our management theme for 2023 is the \'Year of Efficiency\' and we\'re focused on becoming a stronger and more nimble organization."\n\nFourth Quarter and Full Year 2022 Financial Highlights\n\nThree Months Ended December 31,\n\nYear Ended December 31,\n\nIn millions, except percentages and per share amounts Revenue Costs and e

In [5]:
# Number of documents produced by the directory loader.
len(pdf_docs)

1

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Break the loaded documents into overlapping chunks sized for embedding and retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,    # upper bound on chunk length
    chunk_overlap=50,  # overlap carried over between neighbouring chunks
)
splits = text_splitter.split_documents(pdf_docs)

In [7]:
# Number of chunks produced by the text splitter.
len(splits)

60

In [14]:
from pinecone import Pinecone
import os

# Read the key from the environment rather than committing it to the notebook. The
# placeholder stays as the fallback so the cell behaves as before when the variable is unset.
api_key = os.environ.get("PINECONE_API_KEY", "YOUR_API_KEY")
# configure client
pc = Pinecone(api_key=api_key)

In [15]:
from pinecone import ServerlessSpec

# Placement of the serverless index: cloud provider and region.
cloud, region = 'aws', 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

In [16]:
# Name of the Pinecone index that is created (if missing) and queried by the cells below.
index_name = 'rag-example'

In [37]:
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384,  # dimensionality of all-MiniLM-L6-v2
        metric='cosine',
        spec=spec
    )
    # creation is not instantaneous: poll until the index reports ready before using it
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)
# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [39]:
from langchain_huggingface import HuggingFaceEmbeddings 
# Embedding model used for both indexing and querying; its output size has to match the
# dimension of the Pinecone index (384 in this notebook).
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")



In [91]:
# Display every chunk produced by the splitter (full repr — output is long).
splits

[Document(metadata={'source': '/Users/shaonsikder/Downloads/Horizon/Generative AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf'}, page_content='Meta Reports Fourth Quarter and Full Year 2022 Results\n\nMENLO PARK, Calif. – February 1, 2023 – Meta Platforms, Inc. (Nasdaq: META) today reported financial results for the quarter and full year ended December 31, 2022.'),
 Document(metadata={'source': '/Users/shaonsikder/Downloads/Horizon/Generative AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf'}, page_content='"Our community continues to grow and I\'m pleased with the strong engagement across our apps. Facebook just reached the milestone of 2 billion daily actives," said Mark Zuckerberg, Meta founder and CEO. "The progress we\'re making on our AI discovery engine and Reels are major drivers of this. Beyond this, our management theme for 2023 is the \'Year of Efficiency\' and we\'re focused on becoming a stronger and more nimble organization."\n\nFourth Quarter and Full Year 2022 Financial Highlights'

In [54]:
from tqdm.auto import tqdm

batch_size = 100  # Choose a batch size that works for your system

# Embed the chunks one batch at a time and upsert (id, vector, metadata) records.
for start in tqdm(range(0, len(splits), batch_size)):
    stop = min(start + batch_size, len(splits))
    chunk_docs = splits[start:stop]

    chunk_texts = [doc.page_content for doc in chunk_docs]
    chunk_vectors = embeddings.embed_documents(chunk_texts)

    # Ids follow the chunk's global position; the chunk text is stored in the metadata
    # so that query results can show it.
    records = [
        (f"doc_{start + offset}", vector, {**doc.metadata, "text": doc.page_content})
        for offset, (doc, vector) in enumerate(zip(chunk_docs, chunk_vectors))
    ]
    index.upsert(vectors=records)

100%|██████████| 1/1 [00:03<00:00,  3.46s/it]


In [95]:
# embed_documents expects a list of texts, so a bare string is iterated character by
# character. embed_query embeds a single string and returns one vector.
vector1 = embeddings.embed_query("Hello, world!")

In [96]:
# embed_documents expects a list of texts, so a bare string is iterated character by
# character. embed_query embeds a single string and returns one vector.
vector2 = embeddings.embed_query("Hello, Bangladesh!")

In [92]:
# The index was created and connected in the earlier index-creation cell and has already
# been written to by the upsert loop, so only the stats need to be refreshed here.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 60}},
 'total_vector_count': 60}

In [97]:
query = "What is the revenue of the company?"
query_embedding = embeddings.embed_query(query)  # query vector
results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

# For each hit show the similarity score and a 100-character preview of the stored text.
for hit in results['matches']:
    preview = hit['metadata']['text'][:100]
    print(f"Score: {hit['score']:.2f}", f"Text: {preview}...", "---", sep="\n")

Score: 0.57
Text: Twelve Months Ended December 31,

Revenue Costs and expenses: Cost of revenue Research and developme...
---
Score: 0.56
Text: The following table presents our segment information of revenue and income (loss) from operations:

...
---
Score: 0.56
Text: 2

Total

1,462

1,488

564

684

4,198

CFO Outlook Commentary

We expect first quarter 2023 total ...
---
Score: 0.54
Text: $

32,165 $

33,671 $

116,609 $

117,929

Income (loss) from operations:

Family of Apps

$

10,678...
---
Score: 0.52
Text: Three Months Ended December 31,

Year Ended December 31,

In millions, except percentages and per sh...
---


In [57]:
# Display the raw query response, including ids and metadata of the matches.
results

{'matches': [{'id': 'doc_39',
              'metadata': {'source': '/Users/shaonsikder/Downloads/Horizon/Generative '
                                     'AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf',
                           'text': 'Twelve Months Ended December 31,\n'
                                   '\n'
                                   'Revenue Costs and expenses: Cost of '
                                   'revenue Research and development Marketing '
                                   'and sales General and administrative Total '
                                   'costs and expenses\n'
                                   '\n'
                                   'Income from operations Interest and other '
                                   'income (expense), net Income before '
                                   'provision for income taxes Provision for '
                                   'income taxes Net income Earnings per share '
                                   'attr

In [101]:
from langchain_pinecone import PineconeVectorStore
# Wrap the existing Pinecone index as a LangChain vector store and run one retrieval.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)
# Query text corrected from "met's" to "Meta's" so it names the company in the document.
vector_store.as_retriever().invoke("What is Meta's revenue?")

[Document(metadata={'source': '/Users/shaonsikder/Downloads/Horizon/Generative AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf'}, page_content='The following table presents our segment information of revenue and income (loss) from operations:\n\nSegment Information\n\n(In millions)\n\n(Unaudited)\n\nThree Months Ended December 31,\n\nTwelve Months Ended December 31,\n\n2022\n\n2021\n\n2022\n\n2021\n\nRevenue:\n\nAdvertising\n\n$\n\n31,254 $\n\n32,639 $\n\n113,642 $\n\n114,934\n\nOther revenue\n\n184\n\n155\n\n808\n\n721\n\nFamily of Apps Reality Labs\n\n31,438 727\n\n32,794 877\n\n114,450 2,159\n\n115,655 2,274\n\nTotal revenue\n\n$\n\n32,165 $\n\n33,671 $\n\n116,609 $\n\n117,929'),
 Document(metadata={'source': '/Users/shaonsikder/Downloads/Horizon/Generative AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf'}, page_content='$\n\n$\n\n4,990 $ 1,117 1,367 19,552 27,026 15,301 9,923 7,764 60,014\n\n$\n\n64,444 (3,530) 64,799 125,713 185,727 $\n\nDecember 31, 2021\n\n16,601 31,397 14,039 4,629 66,666 

In [90]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_pinecone import PineconeVectorStore

# LangChain vector store backed by the Pinecone index populated above.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

# Set up a custom prompt: answer strictly from the retrieved context and admit when
# the answer is not known.
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

# Create the RAG chain; source documents are returned so the answer can be traced back.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

# Use the RAG system. Calling the chain object directly is deprecated in LangChain;
# `invoke` takes the same input dict and returns the same result dict.
query = "What is the revenue of the company?"
result = qa_chain.invoke({"query": query})
print(result['result'])

# Print source documents
print("\nSource Documents:")
for doc in result['source_documents']:
    print(f"Content: {doc.page_content[:100]}...")  # Print first 100 chars
    print(f"Source: {doc.metadata.get('source', 'N/A')}")
    print("---")

$32,165 million for the twelve months ended December 31, 2022.

Source Documents:
Content: Twelve Months Ended December 31,

Revenue Costs and expenses: Cost of revenue Research and developme...
Source: /Users/shaonsikder/Downloads/Horizon/Generative AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf
---
Content: The following table presents our segment information of revenue and income (loss) from operations:

...
Source: /Users/shaonsikder/Downloads/Horizon/Generative AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf
---
Content: 2

Total

1,462

1,488

564

684

4,198

CFO Outlook Commentary

We expect first quarter 2023 total ...
Source: /Users/shaonsikder/Downloads/Horizon/Generative AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf
---
Content: $

32,165 $

33,671 $

116,609 $

117,929

Income (loss) from operations:

Family of Apps

$

10,678...
Source: /Users/shaonsikder/Downloads/Horizon/Generative AI/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf
---


In [None]:
import json
import os
from typing import Annotated, Optional, Dict, List, Tuple
import tiktoken
from fastapi import APIRouter, Form
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
from langchain_community.document_transformers import LongContextReorder
from langchain_openai.embeddings import OpenAIEmbeddings
from openai import OpenAI
from starlette.responses import StreamingResponse

from app.configs import settings
from app.logging import logger

# Router on which the RAG endpoint of this module is registered.
improved_rag = APIRouter(tags=["V3_Rag_app"])
# Module-level history of recent user messages: generate_response_chunks appends each
# incoming message and trims the list to the four most recent entries.
# NOTE(review): because this list lives at module level it is shared by every call in
# the process — confirm that mixing messages from different users/requests is intended.
previous_messages = []
# Reorders the retrieved documents before they are added to the prompt context.
reordering = LongContextReorder()


def fetch_examples(
    query: str,
    k: int,
    prefilter: Optional[Dict[str, List[str]]] = None,
    min_score: float = 0.7,
) -> List[Tuple[str, float]]:
    """
    Fetches example documents from a MongoDB Atlas vector search based on the provided
    query, and drops documents whose similarity score is below ``min_score``.

    Args:
        query (str): The search query to find similar documents.
        k (int): The number of similar documents to retrieve.
        prefilter (Optional[Dict[str, List[str]]]): Optional dictionary to specify
          pre-filter criteria for the search. The keys are field names and the values
          are lists of acceptable values for those fields. Fields whose list is empty
          are ignored; if no field has any value the search runs unfiltered.
        min_score (float): Minimum similarity score a document must reach to be
          returned. Defaults to 0.7, the threshold this function has always applied
          (the previous docstring incorrectly stated 0.5).

    Returns:
        List[Tuple[str, float]]: A list of tuples where each tuple contains the page
        content of a similar document and its corresponding similarity score. Only
        documents with a similarity score of ``min_score`` or higher are included.
    """
    fewshot_prompt_search = MongoDBAtlasVectorSearch.from_connection_string(
        settings.MONGO_URL,
        f"{settings.LITERATURE_DATABASE}.{settings.PROMPT_COLLECTION}",
        OpenAIEmbeddings(
            model=settings.OPENAI_EMBEDDING_MODEL,
            dimensions=settings.OPENAI_EMBEDDING_DIMENSIONS,
            disallowed_special=(),
        ),
        index_name=settings.VECTOR_INDEX,
    )

    # Translate {"field": [values]} into a MongoDB "$in" filter, skipping empty lists.
    formatted_prefilter = {
        key: {"$in": value} for key, value in (prefilter or {}).items() if value
    }
    if formatted_prefilter:
        results = fewshot_prompt_search.similarity_search_with_score(
            query=query, k=k, pre_filter=formatted_prefilter
        )
    else:
        results = fewshot_prompt_search.similarity_search_with_score(query=query, k=k)

    return [
        (result.page_content, score) for result, score in results if score >= min_score
    ]


def generate_response_chunks(
    message: str,
    project_id: Optional[str],
    flag_id: Optional[str],
    user_id: Optional[str],
    title: Optional[str],
    description: Optional[str],
    fewshot_prefilter: Optional[Dict[str, List[str]]] = None,
):
    """
    Stream an answer to ``message`` from the OpenAI chat model named by
    ``settings.GPT_4_TEXT_MODEL``, grounded in documents retrieved from the vector store.

    The retrieval filter is chosen by precedence: ``flag_id`` if given, otherwise
    ``user_id`` if given, otherwise ``project_id``.

    Args:
        message (str): The user's query or message.
        project_id (str | None): The ID of the project.
        flag_id (str | None): The ID of the flag.
        user_id (str | None): The ID of the user.
        title (str | None): The title of the project.
        description (str | None): The description of the project.
        fewshot_prefilter (Optional[Dict[str, List[str]]]): Optional dictionary to
          specify pre-filter criteria for the few-shot examples search. The keys
          are field names and the values are lists of acceptable values for those
          fields.

    Yields:
        str: The non-empty content pieces of the model's streamed answer, in order.
        str: The literal marker "😊".
        str: JSON string ``{"source_info": {"context": [...]}}`` describing the
          retrieved documents.
        str: The literal marker "🔥".
        str: JSON string with input/output token counts and their USD cost.
    """

    vector_search = MongoDBAtlasVectorSearch.from_connection_string(
        settings.MONGO_URL,
        f"{settings.LITERATURE_DATABASE}.{settings.EMBEDDING_COLLECTION}",
        OpenAIEmbeddings(
            model=settings.OPENAI_EMBEDDING_MODEL,
            dimensions=settings.OPENAI_EMBEDDING_DIMENSIONS,
            disallowed_special=(),
        ),
        index_name=settings.VECTOR_INDEX,
    )

    prefilter = (
        {"flag_id": flag_id}
        if flag_id
        else (
            {"user_id": {"$in": [user_id]}}
            if user_id
            else {"project_id": {"$in": [project_id]}}
        )
    )
    examples = fetch_examples(query=message, k=5, prefilter=fewshot_prefilter)
    # Prepare few-shot examples for the system prompt. Every example is accumulated;
    # the previous code re-assigned the prompt inside the loop and so kept only the last.
    few_shot_parts = []
    for example, score in examples:
        logger.info(f"Example: {example}")
        logger.info(f"Score: {score}")
        few_shot_parts.append(f"Example:\n{example}\n\n")
    few_shot_prompt = "".join(few_shot_parts)

    # Each entry of `results` is indexed below as result[0] -> document.
    results = vector_search.similarity_search_with_score(
        query=message, k=14, pre_filter=prefilter
    )
    results = reordering.transform_documents(results)

    contexts = []
    if title:
        contexts.append(f"Project Title: {title}\n")
    if description:
        contexts.append(f"Project Description: {description}\n\n\n")

    paper_titles = []
    for result in results:
        try:
            document = result[0]
            result_title = document.metadata.get("title")
            # Emit each paper title only once, ahead of its first page.
            if result_title and result_title not in paper_titles:
                paper_titles.append(result_title)
                contexts.append(f"Paper Title: {result_title}\n")
            contexts.append(f"Page No: {document.metadata.get('page')}\n")
            contexts.append(f"Page Content: {document.page_content}\n")
            contexts.append("------------------------------\n")
        except Exception as e:
            logger.error(f"Error in adding context from vector search results: {e}")
            continue

    # NOTE(review): `previous_messages` is a module-level list, so this history is shared
    # by every caller in the process — confirm that is intended.
    previous_messages.append(message)
    if len(previous_messages) > 4:
        previous_messages.pop(0)

    context_with_prev_msgs = "Previous User Messages:\n"
    for prev_msg in previous_messages[:-1]:
        context_with_prev_msgs += f"User: {prev_msg}\n"

    contexts.append(context_with_prev_msgs)
    # Join the pieces into plain text; interpolating the list itself would put its Python
    # repr (brackets, quotes, escaped newlines) into the prompt.
    context_text = "".join(contexts)
    human_query = (
        f"""Context: {context_text}\n\n\nUser's Current Message: {message}\nAnswer: """
    )

    messages = [
        {
            "role": "system",
            "content": """Never forget your name is Delineate AI, a helpful
            AI assistant of Delineate. Website of delineate is www.delineate.pro
            Delineate is committed to accelerating model based meta analysis, empowering
            scientists to achieve results within hours/weeks.

            Think step by step as experienced Clinical Pharmacologist
            researcher. Your focus is helping researcher with high quality and thought
            out answers using accurate information from research paper context.
            Answer the question based on context and do not make things up
            but it's okay to answer things roughly just make sure to indicate if you are
            estimating.
            """,
        },
        {"role": "user", "content": few_shot_prompt + human_query},
    ]

    encoding = tiktoken.encoding_for_model(settings.GPT_4_TEXT_MODEL)
    generated_tokens = 0
    input_tokens = sum(len(encoding.encode(msg["content"])) for msg in messages)

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = client.chat.completions.create(
        model=settings.GPT_4_TEXT_MODEL,
        messages=messages,
        temperature=0.1,
        max_tokens=1400,
        stream=True,
    )

    for chunk in response:
        # Chunks without choices or without content are skipped quietly: they carry no
        # text to forward and were previously reported as errors.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content:
            generated_tokens += len(encoding.encode(content))
            yield content

    source_info = {"context": []}

    for result in results:
        try:
            document = result[0]
            # A document without a "source" entry raises KeyError here and is skipped
            # (logged below).
            source = (
                "https://delineatespaces.nyc3.cdn.digitaloceanspaces.com/delineateAI/"
                + document.metadata["source"]
            )
            # Distinct local names so the function's own flag_id/title arguments are
            # not rebound.
            source_flag_id = document.metadata.get("flag_id")
            source_title = document.metadata.get("title")
            source_info["context"].append(
                {
                    "page_content": document.page_content,
                    "source": source,
                    "page": document.metadata.get("page"),
                    "flag_id": f"{source_flag_id}" if source_flag_id else "",
                    "title": f"{source_title}" if source_title else "",
                }
            )
        except Exception as e:
            logger.error(f"Error in generating source info: {e}")

    # Marker emitted between the answer text and the source-info JSON.
    yield "😊"

    per_input_token_cost = settings.GPT_4O_INPUT_TOKEN
    per_output_token_cost = settings.GPT_4O_OUTPUT_TOKEN
    total_input_token_cost = per_input_token_cost * input_tokens
    total_output_token_cost = per_output_token_cost * generated_tokens

    yield json.dumps({"source_info": source_info})
    # Marker emitted between the source-info JSON and the usage/cost JSON.
    yield "🔥"
    yield json.dumps(
        {
            "input_tokens": input_tokens,
            "usd_input_cost": total_input_token_cost,
            "total_generated_tokens": generated_tokens,
            "usd_output_cost": total_output_token_cost,
        }
    )


@improved_rag.post("/improved-rag/")
async def improved_rag_endpoint(
    message: Annotated[str, Form()],
    project_id: Annotated[Optional[str], Form()] = None,
    flag_id: Annotated[Optional[str], Form()] = None,
    user_id: Annotated[Optional[str], Form()] = None,
    title: Annotated[Optional[str], Form()] = None,
    description: Annotated[Optional[str], Form()] = None,
    fewshot_prefilter: Annotated[Optional[str], Form()] = None,
):
    """
    FastAPI endpoint that streams the chunks produced by ``generate_response_chunks``
    for the submitted form fields.

    Args:
        message (str): The user's query or message.
        project_id (str | None, optional): The ID of the project. Defaults to None.
        flag_id (str | None, optional): The ID of the flag. Defaults to None.
        user_id (str | None, optional): The ID of the user. Defaults to None.
        title (str | None, optional): The title of the project. Defaults to None.
        description (str | None, optional): The description of the project. Defaults
        to None.
        fewshot_prefilter (str | None, optional): JSON string representing the
        pre-filter criteria for the few-shot examples search. It must decode to a
        JSON object (or null). Defaults to None.

    Returns:
        StreamingResponse: A streaming response of generated response chunks.

    Raises:
        HTTPException: 400 when ``fewshot_prefilter`` is not valid JSON or does not
        decode to an object.
    """
    # Imported locally so this block is self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    fewshot_prefilter_dict = None
    if fewshot_prefilter:
        # Validate the client-supplied JSON before streaming starts: once the response
        # has begun, a failure can no longer be turned into a clean error status.
        try:
            fewshot_prefilter_dict = json.loads(fewshot_prefilter)
        except json.JSONDecodeError as exc:
            raise HTTPException(
                status_code=400, detail="fewshot_prefilter must be valid JSON"
            ) from exc
        if fewshot_prefilter_dict is not None and not isinstance(
            fewshot_prefilter_dict, dict
        ):
            raise HTTPException(
                status_code=400, detail="fewshot_prefilter must be a JSON object"
            )

    return StreamingResponse(
        generate_response_chunks(
            message,
            project_id,
            flag_id,
            user_id,
            title,
            description,
            fewshot_prefilter_dict,
        )
    )
