### RAG Document


### 01 Setup

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os 
# Connection and generation settings are read from the environment.
baseUrl = os.getenv('BASE_URL')
model = os.getenv('MODEL')
# Fall back to defaults when a variable is unset: os.getenv would return None
# and float(None) / int(None) raise TypeError. The defaults match the ones
# used in ask_llmFormater further down in this notebook.
Temperature = float(os.getenv('Temperature', '0.7'))
Num_predict = int(os.getenv('Num_predict', '100'))

In [3]:

from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import SystemMessagePromptTemplate,HumanMessagePromptTemplate,ChatPromptTemplate

# Chat model used by ask_llm below; every setting comes from the env-derived config.
llm = ChatOllama(
    base_url=baseUrl,
    model=model,
    temperature=Temperature,
    num_predict=Num_predict,
)


### 02 Set up the LLM call helper


In [4]:

def ask_llm(query, context, words=50):
    """Answer ``query`` with the module-level ``llm`` using only ``context``.

    Args:
        query: The user's question; fills the ``{question}`` slot of the
            human prompt.
        context: Text the model is told to answer from; fills ``{context}``.
        words: Word limit stated in the system prompt. Defaults to 50, the
            value that was previously hard-coded.

    Returns:
        The model's reply as a plain string (parsed by ``StrOutputParser``).
    """
    system = SystemMessagePromptTemplate.from_template(
        "You are a helpful AI assistant who answers the user's question "
        "based on the provided context.\n"
        "Do not answer in more than {words} words."
    )

    # The trailing "### Answer:" header cues the model to start its answer.
    prompt = (
        "Answer the user question based on the provided context ONLY! "
        "If you do not know the answer, just say \"I don't know\".\n"
        "### Context:\n"
        "{context}\n"
        "\n"
        "### Question:\n"
        "{question}\n"
        "\n"
        "### Answer:\n"
    )

    template = ChatPromptTemplate.from_messages(
        [system, HumanMessagePromptTemplate.from_template(prompt)]
    )
    chain = template | llm | StrOutputParser()

    return chain.invoke({'context': context, 'question': query, 'words': words})
    


In [5]:


### PDF Parser 


### 03 PDF Parser 


In [6]:
from langchain_community.document_loaders import PyMuPDFLoader

# Collect every PDF under the dataset folder.
# os.walk yields entries in an arbitrary, filesystem-dependent order, so the
# paths are sorted to keep the document order stable between runs. The
# extension is matched case-insensitively so files named "*.PDF" are included.
pdfs = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk("rag-dataset")
    for file in files
    if file.lower().endswith(".pdf")
)

# Load each PDF and gather all resulting documents into one flat list.
docs = []
for pdf in pdfs:
    docs.extend(PyMuPDFLoader(pdf).load())



In [7]:
def format_docs(textDocs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    page_texts = (doc.page_content for doc in textDocs)
    return "\n\n".join(page_texts)

# Single string holding the text of every loaded document.
context = format_docs(textDocs=docs)



    

In [8]:
# response = ask_llm(query='You are the pdf Summarize . ## Summarize the pdf in 10 points', context=context)
# print(response)

In [9]:
import tiktoken
# Count how many tokens the combined context occupies under tiktoken's gpt-4o encoding.
# NOTE(review): the chat model above is a ChatOllama model, so this count is
# presumably only an estimate for it — confirm.
enc = tiktoken.encoding_for_model("gpt-4o")
token_ids = enc.encode(context)
print(len(token_ids))

228229


In [11]:
# Ask with the entire concatenated document text as context (no retrieval step).
output = ask_llm(
    context=context,
    query="how to gain Muscle? give me in mark down format",
)


## Vector Store and Retrieval

In [18]:
from langchain_ollama import OllamaEmbeddings
# Embedding model, reached through the same base URL as the chat model.
embeddings = OllamaEmbeddings(
    base_url=baseUrl,
    model='nomic-embed-text',
)


In [19]:
import faiss 
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS



In [20]:
# Embed a throwaway query once to learn the embedding length, which is the
# dimension the flat L2 index is created with.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Empty store: nothing is indexed until documents are added.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [21]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks for embedding.
text_spliter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = text_spliter.split_documents(docs)



In [23]:
# NOTE(review): this cell was executed as In [23], after the In [22] cell below
# that calls vector_store.add_documents. On a top-to-bottom run the store has
# no documents yet at this point — confirm the intended cell order.
vector_store.search(
    search_type="similarity",
    k=5,
    query='HOw to gain msucle mass ?',
)

[Document(id='8f918217-1642-4f09-b0c3-893f12ad87de', metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'rag-dataset\\health supplements\\3.health_supplements_side_effects.pdf', 'file_path': 'rag-dataset\\health supplements\\3.health_supplements_side_effects.pdf', 'total_pages': 11, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-10-21T14:37:42+00:00', 'trapped': '', 'modDate': 'D:20241021143742Z', 'creationDate': '', 'page': 7}, page_content='component of Geranium plants, e.g. as geranium extract (71). However, the presence of \nDMAA in plants has not been verified, leading to the conclusion that DMAA in supplements \nis generated by chemical synthesis (72). DMAA has further been banned as a performance \nenhancing drug by the World Anti-Doping Agency (73). One version of the weight-loss \nsupplement OxyELITE Pro from USPlabs, LLC contained the compound 1,3-\ndimethylamylamine (DMAA) in addition to ingredients

In [22]:
# Add every chunk to the vector store, then save the store locally under db_name.
db_name = "health_supplements"
ids = vector_store.add_documents(documents=chunks)
vector_store.save_local(db_name)

In [1]:
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import Document
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import streamlit as st
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
import pymupdf
import time
import os

In [7]:
def ask_llmFormater(text):
    """Reformat ``text`` for readability using Gemini.

    Sends ``text`` through a formatting prompt (bold headings, italic key
    terms, bullet lists, nothing removed) to ``gemini-2.0-flash`` at
    temperature 0.0. The API key is read from ``GOOGLE_API_KEY``.

    Args:
        text: The text to format; it fills the prompt's ``{context}`` slot.

    Returns:
        The model's reply as a string. Empty or whitespace-only input is
        returned unchanged ("" for ``None``) without calling the model.
        On any failure the error is printed and a "Sorry, ..." message
        string is returned instead of raising.
    """
    try:
        # Nothing to format: skip the API round trip entirely.
        if text is None or not text.strip():
            return text or ""

        llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=os.getenv('GOOGLE_API_KEY'),
            temperature=0.0,
        )

        prompt = ChatPromptTemplate.from_template(
                    """
                    You are a text formatter that enhances readability while preserving 100% of the original content.

                    FORMAT RULES:
                    1. Convert main topics/section headings to **bold**
                    2. Convert key terms, concepts, and subtopics to *italics*
                    3. Use bullet points for lists and details
                    4. Maintain all original information - do not summarize or remove anything
                    5. Do not add any explanatory text, notes, or commentary
                    6. Organize content with proper spacing and hierarchy
                    7. Do not include phrases like "cannot be summarized" or explanations of format

                    INPUT TEXT:
                    '{context}'

                    OUTPUT (formatted version of the exact same content):
                    """
        )

        chain = prompt | llm | StrOutputParser()

        # Name the template variable explicitly instead of relying on the
        # implicit mapping of a bare value onto the single prompt variable.
        return chain.invoke({"context": text})

    except Exception as e:
        # Best-effort by design: report the problem and hand back a message
        # string so callers always receive text.
        print(f"Error in ask_llmFormater: {e}")
        return f"Sorry, there was an error ask_llmFormater: {e}"

In [None]:
# Sample input for the formatter: a four-bullet Markdown summary.
text= """ 
*   **Multimedia Systems:** The document starts by defining multimedia and its global structure, elements, uses, and the types of media involved. It also covers data streams, representation values, representation space, and the properties of multimedia in terms of media combination and independence.
*   **Sound and MIDI:** It discusses the need for the MIDI standard, types of MIDI messages, the relationship between MIDI and SMPTE timing, and MIDI software. It also explains sound digitization, storage, and playback in multimedia systems, as well as sound wave frequency, speech recognition, MIDI reception modes, sound generation steps, and MIDI data formats.
*   **Digital Images:** The document covers digital image representation, image recognition steps, image formats, and the fundamentals of image processing. It also discusses bitmap images, their advantages and disadvantages compared to vector images, and image enhancement techniques like contrast enhancement, linear and non-linear transformations, and histogram equalization.
*   **Image Enhancement and Transmission:** Finally, it explains spatial filtering techniques for image enhancement, different image transmission possibilities, and negative transformation used in image enhancement.
"""
# Run the sample through the formatter; the result is displayed in the next cell.
response = ask_llmFormater(text)

In [9]:
# NOTE(review): the cell that builds `text` is marked In [None] (not executed in
# the saved state), so the recorded output below may come from an earlier
# input — re-run both cells before relying on it.
response

'**Multimedia Systems**\n\n*   *What they are, their structure, elements, uses, and properties.*\n*   How multimedia combines different types of media.\n\n**Sound**\n\n*   How sound is digitized, stored, and played in multimedia systems.\n*   *Sound wave frequency*\n*   *Speech recognition*\n*   *MIDI (Musical Instrument Digital Interface)*\n*   *Sound generation*\n*   *MIDI data formats*\n\n**Digital Images**\n\n*   How images are represented digitally.\n*   *Image recognition steps*\n*   *Image formats*\n*   Fundamentals of image processing.\n\n**Image Enhancement**\n\n*   Techniques like *contrast enhancement, linear and non-linear transformations, and spatial filtering.*\n*   *Histogram equalization*\n*   *Negative transformation*\n\n**MIDI**\n\n*   The need for the *MIDI standard.*\n*   Types of *MIDI messages.*\n*   Its relationship with *SMPTE timing.*\n*   *MIDI software.*'