In [1]:
# What is document
from langchain_core.documents import Document

#from langchain.docstore.document import Document

documents = [
    # Example 1: Simple text document
    Document(
        page_content="Hello world",
        metadata={"source": "world.txt"}
    ),
    
    # Example 2: Article with author and date metadata
    Document(
        page_content="Climate change is impacting global weather patterns...",
        metadata={"source": "news_article.txt", "author": "Jane Doe", "date": "2023-08-15"}
    ),
    
    # Example 3: Research paper with DOI and journal information
    Document(
        page_content="In this study, we explore the effects of machine learning on healthcare...",
        metadata={"source": "research_paper.pdf", "DOI": "10.1234/example.doi", "journal": "Journal of AI Research"}
    ),
    
    # Example 4: Blog post with tags
    Document(
        page_content="Top 10 Python libraries for data science...",
        metadata={"source": "blog_post.html", "author": "Data Guru"}
    ),
    
    # Example 5: Product review with rating
    Document(
        page_content="This laptop has excellent battery life but the screen resolution is lacking...",
        metadata={"source": "product_review.txt", "product_name": "XYZ Laptop", "rating": 4.5}
    ),
    
    # Example 6: Transcript of a conversation
    Document(
        page_content="Person A: How are you doing today?\nPerson B: I'm doing great, thank you!",
        metadata={"source": "conversation.txt", "date": "2023-08-18"}
    ),
    
    # Example 7: Technical documentation with version info
    Document(
        page_content="This document explains the API for version 2.0 of the software...",
        metadata={"source": "api_docs.pdf", "version": "2.0", "product": "MySoftware"}
    ),
    
    # Example 8: Code snippet with programming language info
    Document(
        page_content="def add(a, b):\n    return a + b",
        metadata={"source": "code_snippet.py", "language": "Python"}
    ),
    
    # Example 9: Email content with sender and recipient information
    Document(
        page_content="Hi Team,\nPlease find the attached report for Q3...",
        metadata={"source": "email.txt", "sender": "boss@example.com", "recipient": "team@example.com", "date": "2023-08-10"}
    ),
    
    # Example 10: Book chapter with title and chapter number
    Document(
        page_content="In this chapter, we explore the fundamentals of quantum mechanics...",
        metadata={"source": "book.pdf", "title": "Introduction to Quantum Mechanics", "chapter": 3}
    )
]


In [2]:
documents

[Document(metadata={'source': 'world.txt'}, page_content='Hello world'),
 Document(metadata={'source': 'news_article.txt', 'author': 'Jane Doe', 'date': '2023-08-15'}, page_content='Climate change is impacting global weather patterns...'),
 Document(metadata={'source': 'research_paper.pdf', 'DOI': '10.1234/example.doi', 'journal': 'Journal of AI Research'}, page_content='In this study, we explore the effects of machine learning on healthcare...'),
 Document(metadata={'source': 'blog_post.html', 'author': 'Data Guru'}, page_content='Top 10 Python libraries for data science...'),
 Document(metadata={'source': 'product_review.txt', 'product_name': 'XYZ Laptop', 'rating': 4.5}, page_content='This laptop has excellent battery life but the screen resolution is lacking...'),
 Document(metadata={'source': 'conversation.txt', 'date': '2023-08-18'}, page_content="Person A: How are you doing today?\nPerson B: I'm doing great, thank you!"),
 Document(metadata={'source': 'api_docs.pdf', 'version': 

In [5]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["HUGGING_FACE_TOKEN"] = os.getenv("HUGGING_FACE_TOKEN")

### Vector Store
Use OLLAMA embeddings -> as of now

In [6]:
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain_community.embeddings import OllamaEmbeddings

embeddings =OllamaEmbeddings(model = "gemma2:2b")
llm = ChatGroq(model="llama3-8b-8192")

In [6]:
#from langchain_huggingface import HuggingFaceEmbeddings
#HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")

In [7]:
#vectorStore = Chroma.from_documents(documents=documents,embedding=embeddings)

In [9]:
from langchain_community.vectorstores import FAISS

vectorStore = FAISS.from_documents(documents, embeddings)

In [12]:
vectorStore.similarity_search("Python language for programming")

[Document(metadata={'source': 'world.txt'}, page_content='Hello world'),
 Document(metadata={'source': 'api_docs.pdf', 'version': '2.0', 'product': 'MySoftware'}, page_content='This document explains the API for version 2.0 of the software...'),
 Document(metadata={'source': 'book.pdf', 'title': 'Introduction to Quantum Mechanics', 'chapter': 3}, page_content='In this chapter, we explore the fundamentals of quantum mechanics...'),
 Document(metadata={'source': 'research_paper.pdf', 'DOI': '10.1234/example.doi', 'journal': 'Journal of AI Research'}, page_content='In this study, we explore the effects of machine learning on healthcare...')]

In [14]:
## Async query 
await vectorStore.asimilarity_search("Python language for programming")


[Document(metadata={'source': 'world.txt'}, page_content='Hello world'),
 Document(metadata={'source': 'api_docs.pdf', 'version': '2.0', 'product': 'MySoftware'}, page_content='This document explains the API for version 2.0 of the software...'),
 Document(metadata={'source': 'book.pdf', 'title': 'Introduction to Quantum Mechanics', 'chapter': 3}, page_content='In this chapter, we explore the fundamentals of quantum mechanics...'),
 Document(metadata={'source': 'research_paper.pdf', 'DOI': '10.1234/example.doi', 'journal': 'Journal of AI Research'}, page_content='In this study, we explore the effects of machine learning on healthcare...')]

In [22]:
# Retriver => Runnable : can be combine and chain together to perform task on LLM.

from typing import List
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

#1] method 1 to create retriver: RunnableLambda

retriever = RunnableLambda(vectorStore.similarity_search).bind(k=1)

retriever.batch(["python programming", "add two number"])

[[Document(metadata={'source': 'world.txt'}, page_content='Hello world')],
 [Document(metadata={'source': 'world.txt'}, page_content='Hello world')]]

In [23]:
# 2] Method 2 to create retriver 
vectorRetriever=vectorStore.as_retriever(
    search_type = "similarity",
    search_kwargs={"k":1}
)

In [31]:
vectorRetriever.batch(["code"])

[[Document(metadata={'source': 'world.txt'}, page_content='Hello world')],
 [Document(metadata={'source': 'world.txt'}, page_content='Hello world')]]

In [32]:
vectorRetriever.invoke("python ")

[Document(metadata={'source': 'code_snippet.py', 'language': 'Python'}, page_content='def add(a, b):\n    return a + b')]

In [8]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# RunnablePassThrough is like a conveyor belt: it takes input, does nothing to it, 
# and passes it along to the next step.
#It doesn’t process or transform the input in any way; it just moves the data forward.


message = """
Answer the question using the provided context only.
{question}. 

context: 
{context}. 

 """



prompt = ChatPromptTemplate.from_messages(
    [
        ("human",message)
    ]
)

# pass retriver and LLM

ragChain = {"context":vectorRetriever, "question":RunnablePassthrough()}

chain = ragChain|prompt|llm

response = chain.invoke("Tell me about fundamental")

NameError: name 'vectorRetriever' is not defined

In [37]:
response

AIMessage(content="Since the provided context is a document with metadata and page content, I'm assuming you're asking about the fundamental nature of this document. \n\nIn that case, the fundamental aspect of this document is its basic structure, which consists of metadata (source) and page content (Hello world). These two components provide the essential information about the document, making it a fundamental unit of communication.", response_metadata={'token_usage': {'completion_tokens': 77, 'prompt_tokens': 47, 'total_tokens': 124, 'completion_time': 0.064166667, 'prompt_time': 0.009983879, 'queue_time': 0.0036966210000000006, 'total_time': 0.074150546}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_6a6771ae9c', 'finish_reason': 'stop', 'logprobs': None}, id='run-05795e72-f753-4905-aa83-9ab8d566fe73-0', usage_metadata={'input_tokens': 47, 'output_tokens': 77, 'total_tokens': 124})