<a href="https://colab.research.google.com/github/VishalPrem1994/AIGenPlayGround/blob/main/RAG_Document_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install openai
%pip install langchain
%pip install pypdf
%pip install chromadb
%pip install tiktoken
%pip install sentence-transformers
%pip install pdfminer.six
%pip install kor==0.10.0
%pip install cv2
%pip install pytesseract
%pip install PIL


In [None]:
from google.colab import drive
from pdfminer.high_level import extract_text

# Mount Google Drive at /content/drive/ so the PDF paths under MyDrive used by later cells resolve.
drive.mount('/content/drive/')

In [None]:
# Pull the raw text layer out of the sample patient bill stored on the mounted Drive.
text = extract_text(
    '/content/drive/MyDrive/R&D/Deep Learning V2/TrainingData/PatientBills/P1.pdf'
)

# Turn every newline into a single space so the bill reads as one line,
# then shrink each run of three spaces down to one.
text = text.replace("\n", " ").replace("   ", " ")
print(text)

In [None]:
from typing import List, Optional
import itertools
import requests

import pandas as pd
from pydantic import BaseModel, Field, validator
from kor import extract_from_documents, from_pydantic, create_extraction_chain
from kor.documents.html import MarkdownifyHTMLProcessor
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI, VertexAI, HuggingFaceHub
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number


# Hugging Face model id; only referenced by the commented-out HuggingFaceHub alternative below.
repo_id = "tiiuae/falcon-7b"

# gpt-3.5-turbo is a chat-completions model, so it is wrapped with ChatOpenAI
# (already imported in this cell) instead of the completion-style OpenAI class.
# temperature=0 keeps the extraction output as repeatable as the API allows.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=2000)
# llm = HuggingFaceHub(huggingfacehub_api_token=huggingfacehub_api_token,repo_id=repo_id,model_kwargs={"temperature":0.6, "max_new_tokens":500})

In [216]:
# Kor schema describing the three amounts to pull out of a patient bill.
# Each attribute carries few-shot (input text, expected value) pairs that are
# embedded in the prompt, so every expected value must match the field it teaches.
# NOTE(review): the examples drop the fractional part (45,000.34 -> "45000");
# confirm whether decimals should be preserved instead.
invoice_schema = Object(
    id="amts_extraction",
    description="extraction of all payment info",
    attributes=[
        Number(id="total_bill_amt",
             description= "total bill amount",
             examples=[
            ("Gross bill amount :  Rs. 56,456.00", "56456"),
            ("Gross bill amount : 5,000.00", "5000"),
            # Distractor example: the gross amount is the answer, not the patient's net amount.
            ("Gross bill amount : Rs. 50,476.00 Net Amount to be paid by patient  :  45,000.34", "50476")
            ]),
        Number(id="insurance_approved_amt",
             description= "Net Amount Paid by Company",
             examples=[
            ("Net Amount to be Paid by Company : Rs. 56,456.00", "56456"),
            ("Net Amount to be Paid by Company : 5,000.00", "5000"),
            ("Net Amount to be Paid by Company  :  45,000.34", "45000")
            ]),
        Number(id="net_amt_extraction",
             description= "net payment to be paid by the patient",
             examples=[
            ("Net Amount to be paid by patient : Rs. 56,456.00", "56456"),
            ("Net Amount to be paid by patient : 5,000.00", "5000"),
            ("Net Bill Amount Rs. 61,641.00 Company Credit Limit : Rs. 50,476.00 Net Amount to be paid by patient  :  45,000.34", "45000")
            ]),
    ],
    many=False,
)
invoice_chain = create_extraction_chain(llm, invoice_schema)


from langchain.callbacks import get_openai_callback

# Run the extraction inside the OpenAI callback so token usage and cost of the
# single request can be reported alongside the parsed result.
with get_openai_callback() as cb:
    document_extraction_results = invoice_chain.predict_and_parse(text=text)['data']
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")
document_extraction_results



Total Tokens: 1631
Prompt Tokens: 1609
Completion Tokens: 22
Successful Requests: 1
Total Cost (USD): $0.0024575


{'amts_extraction': [{'total_bill_amt': '64297.13',
   'insurance_approved_amt': '50476',
   'net_amt_extraction': '11165'}]}

In [None]:
# Kor schema for pulling the patient's name out of the bill text.
# The id is kept as "amts_extraction" so the key of the parsed result stays
# what existing consumers of this chain's output already see.
details_schema = Object(
    id="amts_extraction",
    description="extraction of the patient's first and second name",
    examples=[
        # One dict per person: both name parts belong to the same record.
        # Splitting them into two dicts would present each example as two
        # separate, half-filled records.
        ("Patient Name : Mr. Paul Sams", [{"first_name": "Paul", "second_name": "Sams"}]),
        ("Patient Name : Mr. Joju george", [{"first_name": "Joju", "second_name": "george"}]),
        ("Patient Name : Mr. Nirmal Hafiz", [{"first_name": "Nirmal", "second_name": "Hafiz"}])
    ],
    attributes=[
        Text(
            id="first_name",
            description="The first name of a person.",
        ),
        Text(
            id="second_name",
            description="The second name of a person.",
        )
    ],
    many=False,
)
details_chain = create_extraction_chain(llm, details_schema)
details_chain.run(text)

In [None]:
# split_docs = RecursiveCharacterTextSplitter().split_documents([text])

In [None]:
# Display the flattened bill text as this cell's output for inspection.
text

In [None]:
def _collect_records(extraction_results):
    """Flatten extraction output into a single list of records.

    Accepts either shape:
    * a mapping of schema id -> record(s), which is what
      ``predict_and_parse(...)['data']`` produces (for example
      ``{'amts_extraction': [{...}]}``), or
    * an iterable of per-document results that each hold a
      ``"validated_data"`` list.

    Iterating the mapping directly would yield its string keys, and indexing a
    string with ``"validated_data"`` raises ``TypeError``; the mapping case is
    therefore handled explicitly.
    """
    if isinstance(extraction_results, dict):
        groups = (
            # A lone record (a dict) is wrapped so it is not flattened into its keys.
            [value] if isinstance(value, dict) else value
            for value in extraction_results.values()
        )
    else:
        groups = (extraction["validated_data"] for extraction in extraction_results)
    return list(itertools.chain.from_iterable(groups))


validated_data = _collect_records(document_extraction_results)

In [None]:
# Tabulate the extracted records. Records may be objects exposing .dict()
# or plain dicts (the chain output shown above is plain dicts), so only call
# .dict() when it exists.
pd.DataFrame(
    [record.dict() if hasattr(record, "dict") else dict(record) for record in validated_data]
)

In [None]:
# Run the patient-name extraction chain over the bill text and show only the parsed payload.
details_output = details_chain.predict_and_parse(text=text)
details_output['data']

In [None]:
# Run the payment-amount extraction chain over the bill text and show only the parsed payload.
invoice_output = invoice_chain.predict_and_parse(text=text)
invoice_output['data']

## RAG-Based Extraction

In [None]:
import os
from time import perf_counter
from langchain.document_loaders import PyPDFLoader
from langchain.llms import OpenAI, VertexAI, HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter

from langchain.embeddings import OpenAIEmbeddings, VertexAIEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

from google.colab import drive
# Mount Google Drive so the PDF path used by pdf_loader() resolves.
# An earlier cell performs the same mount; presumably repeated here so the RAG section can be run on its own.
drive.mount('/content/drive')

In [None]:




def pdf_loader(pdf_path: str = "/content/drive/MyDrive/R&D/Deep Learning V2/TrainingData/PatientBills/P1.pdf"):
    """Return the document loaders used to build the vector index.

    Args:
        pdf_path: Location of the PDF to index. Defaults to the sample patient
            bill on the mounted Drive, so calling ``pdf_loader()`` with no
            arguments behaves as before.

    Returns:
        A one-element list holding a ``PyPDFLoader`` for ``pdf_path``.
    """
    return [PyPDFLoader(pdf_path)]


def build_qa_chain(platform: str = 'falcon', chunk_size: int = 1000, chunk_overlap: int = 50) -> RetrievalQA:
    """Build a retrieval-augmented QA chain over the PDF(s) from ``pdf_loader()``.

    Args:
        platform: ``"openai"`` selects OpenAI embeddings and the OpenAI
            completion model. Any other value (including the default
            ``'falcon'``) selects Hugging Face embeddings and the
            ``tiiuae/falcon-7b`` hub model.
        chunk_size: Maximum characters per chunk handed to the text splitter.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        A ``RetrievalQA`` "stuff" chain that takes its query under the
        ``"question"`` key, retrieves the 2 most similar chunks and returns
        the source documents with the answer.

    Raises:
        ValueError: If the Hugging Face branch is selected and no API token is
            available.
    """
    # Honour the caller's chunking parameters; they were previously accepted
    # but overridden by a hard-coded 5000/0 splitter in both branches.
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    if platform == "openai":
        embedding = OpenAIEmbeddings()
        llm = OpenAI(model_name="text-davinci-003", temperature=0.9, max_tokens=256)
    else:
        embedding = HuggingFaceEmbeddings()
        repo_id = "tiiuae/falcon-7b"
        # Use a notebook-level `huggingfacehub_api_token` when one is defined,
        # otherwise fall back to the environment, instead of failing with a
        # NameError on an undefined global.
        api_token = globals().get("huggingfacehub_api_token") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        if not api_token:
            raise ValueError(
                "Hugging Face API token not found: define `huggingfacehub_api_token` "
                "or set the HUGGINGFACEHUB_API_TOKEN environment variable."
            )
        llm = HuggingFaceHub(huggingfacehub_api_token=api_token,
                             repo_id=repo_id,
                             model_kwargs={"temperature": 0.5, "max_new_tokens": 200})

    loaders = pdf_loader()
    index = VectorstoreIndexCreator(
        embedding=embedding,
        text_splitter=splitter).from_loaders(loaders)
    # get() returns a dict of parallel lists, so len() of the dict itself only
    # counts its keys; the number of stored chunks is the length of "ids".
    print(len(index.vectorstore.get()["ids"]))

    # Prepare the pipeline
    return RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=index.vectorstore.as_retriever(search_type="similarity",
                                                                                search_kwargs={"k": 2}),
                                       return_source_documents=True,
                                       input_key="question")




In [None]:
# Build the index + QA chain and time it.
# NOTE(review): 'open' does not equal "openai", so build_qa_chain takes its
# non-OpenAI (Falcon) branch here; confirm whether 'openai' was intended.
tick = perf_counter()
qa_chain = build_qa_chain('open', chunk_overlap=0)
print(f'Time span for building index: {perf_counter() - tick}')

# get reply to our questions
tick = perf_counter()
# include_run_info is an option of the chain call, not a chain input: inside
# the inputs dict it is treated as an extra input key and has no effect.
result = qa_chain({'question': 'What are the patient details'}, include_run_info=True)
print(f'Time span for query: {perf_counter() - tick}')

print('Q:', result['question'])
print('A:', result['result'])
print('\n')
print('Resources:', result['source_documents'])