In [1]:
!pip install langchain-huggingface
!pip install huggingface_hub
!pip install langchain
!pip install chromadb
!pip install pypdf
!pip install pytest



In [2]:
!pip install langchain_core



In [3]:
#loading pdf data 
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
DATA_PATH = "Data"

def load_documents():
    """Load the PDF files found in the DATA_PATH directory.

    Returns:
        The result of ``PyPDFDirectoryLoader(DATA_PATH).load()``
        (presumably one Document per PDF page -- confirm against the loader docs).
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

In [4]:
# Load the PDF documents from DATA_PATH and print the first one as a sanity check.
documents=load_documents()
print(documents[0])

page_content='MONOPOLY 
Property Trading Game from Parker Brothers" 
AGES 8+ 
2 to 8 Players 
Contents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance 
and Community Chest cards, Title Deed cards, play money and a Banker's tray. 
Now there's a faster way to play MONOPOLY. Choose to play by 
the classic rules for buying, renting and selling properties or use the 
Speed Die to get into the action faster. If you've never played the classic 
MONOPOLY game, refer to the Classic Rules beginning on the next page. 
If you already know how to play and want to use the Speed Die, just 
read the section below for the additional Speed Die rules. 
SPEED DIE RULES 
Learnins how to Play with the S~eed Die IS as 
/ 
fast as playing with i't. 
1. When starting the game, hand out an extra $1,000 to each player 
(two $5005 should work). The game moves fast and you'll need 
the extra cash to buy and build. 
2. Do not use the Speed Die until you've landed on or passed over 
GO for the first time. O

In [5]:
#Split the documents 
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_documents(
    documents: list[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 80,
):
    """Split documents into overlapping text chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: Length shared between neighbouring chunks, so that
            text cut at a boundary still appears whole in one of them.
            Defaults to 80.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [6]:
# Split the loaded documents into chunks and print the third chunk as a spot check.
chunks= split_documents(documents)
print(chunks[2])


page_content='1. When starting the game, hand out an extra $1,000 to each player 
(two $5005 should work). The game moves fast and you'll need 
the extra cash to buy and build. 
2. Do not use the Speed Die until you've landed on or passed over 
GO for the first time. Once you collect that first $200 salary, you'll 
use the Speed Die for the rest of the game. This means that some 
players will start using the die before others. 
3. Once you start using the Speed Die, roll it along with the two' metadata={'source': 'Data\\monopoly.pdf', 'page': 0}


In [7]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from secret_key import huggingface_key

def get_embedding_function():
    """Build the embedding client used for both indexing and querying.

    Returns:
        A ``HuggingFaceInferenceAPIEmbeddings`` instance configured with
        ``huggingface_key`` and the all-MiniLM-l6-v2 sentence-transformers model.
    """
    return HuggingFaceInferenceAPIEmbeddings(
        api_key=huggingface_key,
        model_name="sentence-transformers/all-MiniLM-l6-v2",
    )

In [8]:
#setting chunk id

def calculate_chunk_ids(chunks):
    """Assign a ``source:page:index`` id to every chunk, in place.

    The id is stored under ``chunk.metadata["id"]``. ``index`` counts the
    chunks already seen for the same ``source:page`` pair, starting at 0.

    A per-page counter is kept (rather than comparing only with the previous
    chunk) so ids stay unique even when chunks of one page are not adjacent
    in ``chunks``; for page-contiguous input the ids are unchanged.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict; the
            "source" and "page" keys are read (missing keys read as None).

    Returns:
        The same ``chunks`` object, with each element's metadata updated.
    """
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        # Take the next free index for this page, then reserve the one after it.
        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks




In [9]:
from get_embedding_function import get_embedding_function
from langchain.vectorstores.chroma import Chroma 
CHROMA_PATH= "chroma"
def add_to_chroma(chunks: list[Document]):
    """Store the chunks whose ids are not yet in the Chroma database.

    Opens the store persisted at CHROMA_PATH, tags each chunk with an id via
    ``calculate_chunk_ids``, and adds only the chunks whose id is absent from
    the store. Progress is reported with ``print``.

    Args:
        chunks: Chunks to index; their metadata gains an "id" entry.
    """
    store = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    identified_chunks = calculate_chunk_ids(chunks)

    # Only the stored ids are read from the lookup result.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    pending = [
        chunk for chunk in identified_chunks
        if chunk.metadata["id"] not in known_ids
    ]

    # Nothing new to index: report and stop.
    if not pending:
        print("✅no new documents to add")
        return

    print(f"👉Adding new documents: {len(pending)}")
    pending_ids = [chunk.metadata["id"] for chunk in pending]
    store.add_documents(pending, ids=pending_ids)
    store.persist()

   
    
        

In [10]:
# Index the chunks in the Chroma store; chunks whose id is already stored are skipped.
add_to_chroma(chunks)


[REDACTED: Hugging Face API token was printed here -- revoke and rotate this key]
Number of existing documents in DB: 95
✅no new documents to add


In [15]:
import os 
from langchain import PromptTemplate, LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEndpoint

# Directory of the persisted Chroma database.
CHROMA_PATH= "chroma"

# Prompt sent to the LLM: {Question} receives the user query and {Context}
# receives the retrieved chunks.
PROMPT_TEMPLATE= """
Query:
{Question}

----

Answer based on the chunks that best match the query:
{Context}
"""

def rag_query(query_text: str, k: int = 5):
    """Answer ``query_text`` using retrieval-augmented generation.

    The ``k`` chunks most similar to the query are fetched from the Chroma
    store persisted at CHROMA_PATH, joined into one context block, inserted
    into PROMPT_TEMPLATE together with the query, and sent to the
    Mistral-7B-Instruct model through the Hugging Face endpoint.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve as context. Defaults to 5.

    Returns:
        The value returned by ``llm.invoke`` for the assembled prompt.
    """
    # The query must be embedded with the same function used to build the database.
    embedding_function = get_embedding_function()
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=embedding_function
    )

    # Chroma exposes no `cosine_similarity` method; `similarity_search_with_score`
    # yields the (document, score) pairs that are unpacked below.
    results = db.similarity_search_with_score(query_text, k=k)

    # Build the final prompt from the retrieved context and the query.
    context_text = "\n\n-------------\n\n".join(
        doc.page_content for doc, _score in results
    )
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(Question=query_text, Context=context_text)

    # `max_length` and `token` are not HuggingFaceEndpoint fields; the generation
    # cap is `max_new_tokens` and the credential is `huggingfacehub_api_token`.
    llm = HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.3",
        max_new_tokens=128,
        temperature=0.6,
        huggingfacehub_api_token=huggingface_key,
    )

    return llm.invoke(prompt)
    
    

In [81]:
# Never print the raw token: cell outputs are saved with the notebook file.
# Only report whether a key was loaded.
print("Hugging Face API key loaded:", bool(huggingface_key))

[REDACTED: Hugging Face API token was printed here -- revoke and rotate this key]
