In [24]:
import os
open_ai_models = {
    1: "gpt-4-1106-preview",
    2: "gpt-4",
    3: "gpt-4-32k",
    4: "gpt-3.5-turbo-1106",
    5: "gpt-3.5-turbo",
    6: "gpt-3.5-turbo-16k"
}
os.environ['OPENAI_API_KEY'] = 'OPEN_API_KEY'
os.environ["model"] = 'gpt-4-1106-preview'

In [2]:
!pip install amazon-textract-caller
!pip install pypdf Pillow
!pip install openai
!pip install chromadb
!pip install tiktoken
!pip install pdf2image
!pip install langchain



# If you closely look into the Pages of the Floor Plan PDF,you see each page has some textual information and the image of the floor plan of the particular Flat unit.
# So the problem of directly parsing pdf data using PyPdf is that it will omit the floor plan image data all together.

# So to avoid this data omission ,we will use a work around,in which we first convert each page of PDF to an Image using pdf2image library and store in a directory.
# Once all the pages are converted to images we can use Text-Extraction Techniques like OCR to extract data from each of the image.
# There are number of tools for Text-Extraction from the image such as Amazon Textractor,UnstructuredImageLoader etc.


In [3]:
#Convert PDF Pages to Images
from pdf2image import convert_from_path
def pdf_to_image(file_path,image_dir):
        os.makedirs(image_dir, exist_ok=True)
        images = convert_from_path(file_path, fmt='jpg', output_folder=image_dir,output_file="page")
        print(len(images))

pdf_to_image("./floor-plans.pdf","floor-plan-images")

13


In [4]:
import glob
def fetch_pages(image_dir):
        '''This Function will iterate through all the images and store the path of each image.'''

        image_files_path = glob.glob(f'{image_dir}/*.jpg')

        return image_files_path

image_files_path = fetch_pages('./floor-plan-images')
print(len(image_files_path))

13


# Since we will be using Langchain for creating a Document querying bot ,and out of the box langchain supports two ways of Text Extraction for now.
# 1. Amazon Textractor.
# 2. UnstructuredImageLoader
# We can use any of the above tools to extract the data.
# Below I will be showing both the ways,and we can later on decide to use any.

In [5]:
import boto3
session = boto3.Session(
    aws_access_key_id='Access_Key_Id',
    aws_secret_access_key='Secret_Access_Key'
)

In [6]:
# Using Amazon Textraction tool

from langchain.document_loaders import AmazonTextractPDFLoader
def get_pages_data(textract_client, image_files_path):
        '''Amazon Textraction uses OCR technique to extract data from the image.'''

        all_pages = []
        count = 1
        for path in image_files_path:

            loader = AmazonTextractPDFLoader(path, client=textract_client)
            documents = loader.load()
            all_pages.extend(documents)
            count += 1
        return all_pages

pages_data = get_pages_data(boto3.client("textract", region_name="us-east-1"),image_files_path)
print(len(pages_data))
pages_data

13


[Document(page_content="DAR AL KARAMA REAL ESTATE\nDAK\n2015\nSAPPHIRE 32\nBEDROOM\n3.80 X 3,90\n01 BEDROOM\nLEVEL 3 9.21\nUNIT X06\nBATHROOM\n1.90 X 1.90\n64.32 SQ.M.\n692 SQ.FT.\nSUITE AREA\nBALCONY AREA\n8,08 SQ.M.\n87 SQ.FT.\nTOTAL AREA\n72.40 SQ.M\n779 SQ.FT.\nBALCONY\nKITCHEN\n3.85 X 1.95\n2.70 X 3.68\nKEY SECTION\nKEY PLAN\nKILL\nLIVING ROOM\n5.55 X 3,70\nTOILET\n1.80 X 1.30\nSUBJECT\nTO\nCHANGE\nWITHOUT\nNOTICE,\nA DISCLAIMER: LL ARE IN IMPERIA I AND METRIC, AND MEASURED FROM F INISH TO STATED F INISH AREA. EXCLUDING 5. DRAWINGS CONSTRUCTION NOT TO SCA TOLERANCES. LE. 6. ALL IMAGES 2 ALL ANY USED MATERIA LIABILITY ARE FOR LS. WHATSOEVER DIMENSIONS, ILLUSTRATIVE AND PURPOSES DRAWINGS ONLY ARE AND APPROXIMATE DO NOT REPRESENT ONLY 3. THE INFORMATION ACTUA L SIZE, IS FEATURES SPECIF ICATIONS, F\nITTINGS,\nAT\nDEVELOPER'S DIMENSIONS FURNISHINGS. ABSOLUTE 7. THE DEVELOPER DISCRETION. RESERVES 4. ACTUA THE L AREA RIGHT MAY TO VAR MAKE Y FROM REVISIONS THE / ALTERATIONS, AT IT'S ABSOL

In [7]:
from langchain.document_loaders.image import UnstructuredImageLoader
def get_pages_data(image_files_path):
    pages_data = []
    count = 1

    for path in image_files_path:
        print(f"Processing image: {path}")

        try:
            loader = UnstructuredImageLoader(path)
            documents = loader.load()
            pages_data.extend(documents)
            print(f"Extracted Text from Page No. - {count}")

        except Exception as e:
            print(f"Error processing image {path}: {e}")
            continue

        count += 1

    print("All Pages of the Document Loaded")
    return pages_data

pages_data = get_pages_data(image_files_path)
print(f"Number of pages processed: {len(pages_data)}")
pages_data


Processing image: ./floor-plan-images/page0001-08.jpg
Extracted Text from Page No. - 1
Processing image: ./floor-plan-images/page0001-09.jpg
Extracted Text from Page No. - 2
Processing image: ./floor-plan-images/page0001-01.jpg
Error processing image ./floor-plan-images/page0001-01.jpg: cannot convert float infinity to integer
Processing image: ./floor-plan-images/page0001-02.jpg
Extracted Text from Page No. - 3
Processing image: ./floor-plan-images/page0001-03.jpg
Extracted Text from Page No. - 4
Processing image: ./floor-plan-images/page0001-07.jpg
Extracted Text from Page No. - 5
Processing image: ./floor-plan-images/page0001-13.jpg
Error processing image ./floor-plan-images/page0001-13.jpg: cannot convert float infinity to integer
Processing image: ./floor-plan-images/page0001-12.jpg
Extracted Text from Page No. - 6
Processing image: ./floor-plan-images/page0001-06.jpg
Extracted Text from Page No. - 7
Processing image: ./floor-plan-images/page0001-10.jpg
Extracted Text from Page No

[Document(page_content="SAPPHIRE D\n\nKS\n\nOl BEDROOM\n\nLEVEL 3,9, 2\n\nUNIT X06\n\nSUITE AREA 64.32 SQM. 692 SQFT. BALCONY AREA 8.08 SQM. 87 SQFT. TOTAL AREA 7240 SQM. 779 SQFT. KEY PLAN KEY SECTION Ss © . | os & a ale = o Ty a: ———«\n\n| in | : «\n\nB\n\n2\n\nr\n\nLU |\n\n|\n\nBEDROOM L 3.80 X 3.90 { = cree | a 190 X 190 r aa a ain | BALCONY «J KITCHEN |! fj B85 195 2.70 X 3.68 al . } Z | | | LIVING ROOM = >\\5 555 x3 70 <| ole | LU | f (Le 180 X 130 | _\n\nDISCLAIMER:\n\nLL DIMENSIONS ARE IN IMPERIA L AND METRIC, AND MEASURED FROM F INISH TO F INISH EXCLUDING CONSTRUCTION TOLERANCES. 2. A LL MATERIA LS, DIMENSIONS, AND DRAWINGS ARE APPROXIMATE ONLY. 3. INFORMATION IS SUBJECT TO CHANGE WITHOUT NOTICE, DEVELOPER'S ABSOLUTE DISCRETION. 4. ACTUA L AREA MAY VAR Y FROM THE STATED AREA. 5. DRAWINGS NOT TO SCA LE. 6. A LL IMAGES USED ARE FOR ILLUSTRATIVE PURPOSES ONLY AND DO NOT REPRESENT THE ACTUA L SIZE, FEATURES, SPECIF ICATIONS, F ITTINGS, FURNISHINGS. 7. THE DEVELOPER RESERVES THE RI

# Now we have extracted the data ,we can move on to the next step of training the LLM Model with our extracted data.
# Following are the two approaches to train our LLM.
# 1. Convert the Extracted data to vectors using embedding models and store it in vector-database.
# 2. Create a prompt from the extracted data structure it to feed LLM directly.

# For larger documents its always advisable to use Vector Database approach to avoid Token limit error and also to reduce the LLM Costs per query.


# For storing our Extracted data as Vectors into the Vector data base we have to do following steps.
# 1. Embed the Extracted data into Vectors.
# 2. Choose a Vector DB to Store our embeded extracted data.

# There are many models which can Embed our Extracted data ,such as Amazon Titan Embedding Model,Open AI's Text-Embedding-Ada-002,Cohere-Embed-English-V3.
# For our use case we will be using Amazon Titan Embedding Model through AWS Bedrock for embedding our Extracted Data.
# For Storing the Embedded Vector Data we will be using FAISS.

In [8]:
from langchain.embeddings import BedrockEmbeddings
embedding_model = BedrockEmbeddings(
    credentials_profile_name="default", region_name="us-east-1", model_id='amazon.titan-embed-text-v1'
)

In [10]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
def embed_data(all_pages,embedding_model):

        print(all_pages[1])
        print(len(all_pages))
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        documents = text_splitter.split_documents(all_pages)

        #self.print_steps(f"Converting Pages Data to Vector Database ....")

        vectordb = FAISS.from_documents(
        documents,
        embedding=embedding_model,
        #persist_directory=f'./{file_path}'
        )
        #vectordb.persist()
        return vectordb

vector_db = embed_data(pages_data,embedding_model)

page_content="SAPPHIRE D\n\nKS\n\nSTUDIO\n\nLEVEL 3,9, 21 UNIT\n\nSUITE AREA 3127 SQM. 337 BALCONY AREA 5.08 SQM. 55 TOTAL AREA 36.35 SQM. 391 APY PLAN O° o o 8\n\nX07\n\nSQFT.\n\nSQFT.\n\nSQFT.\n\nKEY SECTION\n\n|\n\n8\n\nBes\n\n2\n\nBATHROOM. 2.20 X 180 BEDROOM KITCHEN 148 4.65 X 3.70 3.50 X 1.80\n\nBALCONY 3.18 X\n\nDISCLAIMER:\n\nA LL DIMENSIONS ARE IN IMPERIA L AND METRIC, AND MEASURED FROM F INISH TO F INISH EXCLUDING CONSTRUCTION TOLERANCES. 2. A LL MATERIA LS, DIMENSIONS, AND DRAWINGS ARE APPROXIMATE ONLY. 3. INFORMATION IS SUBJECT TO CHANGE WITHOUT NOTICE, AT DEVELOPER'S ABSOLUTE DISCRETION. 4. ACTUA L AREA MAY VAR Y FROM THE STATED AREA. 5. DRAWINGS NOT TO SCA LE. 6. A LL IMAGES USED ARE FOR ILLUSTRATIVE PURPOSES ONLY AND DO NOT REPRESENT THE ACTUA L SIZE, FEATURES, SPECIF ICATIONS, F ITTINGS, AND FURNISHINGS. 7. THE DEVELOPER RESERVES THE RIGHT TO MAKE REVISIONS / ALTERATIONS, AT ITS ABSOLUTE DISCRETION, WITHOUT ANY LIABILITY WHATSOEVER." metadata={'source': './floor-plan-im

# There are many different types of Chains avaialable in Langchain,Since we are creating a conversational bot ,we will be using ConversationalRetrievalChain from Langchain

In [11]:
# Initializing Memory to Remember the Chat.
from langchain.memory import ConversationBufferMemory
def initialize_memory():

    
    memory = ConversationBufferMemory(
                    memory_key="chat_history",
                    
                    return_messages=True,
                    output_key="answer"

                )
    return memory

memory_chat_qa = initialize_memory()

# Now we have to define LLM model from bedrock to use in the Chain.
# There are many LLM Models we can choose from Bedrock.
# We are Using Llama 13B Model here,but you can experiment with different models to choose the best.

In [12]:
from langchain.llms import Bedrock

bedrock_llm = Bedrock(
    credentials_profile_name="default", model_id="meta.llama2-13b-chat-v1",region_name='us-east-1'
)

In [13]:
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
def create_chain(vector_db, memory_chat_qa):
        
        qa_chain = ConversationalRetrievalChain.from_llm(
                bedrock_llm,
                retriever=vector_db.as_retriever(search_kwargs={"k": 7}),
                return_source_documents=True,
                memory=memory_chat_qa,
                verbose=False,

            )
        return qa_chain

qa_chain_chat = create_chain(vector_db,memory_chat_qa)

In [15]:
from langchain.callbacks import get_openai_callback
import json
#embedding = OpenAIEmbeddings()
queries = ["How many units are present in the building?",
           "Which unit has the largest bedroom?",
           "What is the dimension of the X05 bathroom?",
           "On which page number I will find the floor plan for X07?",
           "What is the total area of X03?"]
responses = {}
cost = 0
for query in queries:
    #query = embedding.embed_query(query)
    with get_openai_callback() as cb:
        result = qa_chain_chat({"question":query})
        print(result["answer"])
       # print(result[""])
        print(f'Source Document is {len(result["source_documents"])}')
        responses[query] = result["answer"]
        cost = cost + (cb.total_cost)

print(json.dumps(responses))
print(f"Total cost of this run is ${cost}")

 Based on the information provided, there are 3 units in the building.
Source Document is 7

The unit with the largest bedroom size is Unit x02, with a size of 64.17 square meters (691 square feet).
Source Document is 7


The dimension of the X05 bathroom is 210 x 165.
Source Document is 7
 Page number 4.
Source Document is 7


The total area of X03 is 398 square feet.

Please provide the actual area of X03 in square meters.
Source Document is 7
{"How many units are present in the building?": " Based on the information provided, there are 3 units in the building.", "Which unit has the largest bedroom?": "\nThe unit with the largest bedroom size is Unit x02, with a size of 64.17 square meters (691 square feet).", "What is the dimension of the X05 bathroom?": "\n\nThe dimension of the X05 bathroom is 210 x 165.", "On which page number I will find the floor plan for X07?": " Page number 4.", "What is the total area of X03?": "\n\nThe total area of X03 is 398 square feet.\n\nPlease provide

In [16]:
result = qa_chain_chat({"question":"What is the total area of X03?"})

In [17]:
result['answer']

'  To find the total area of X03, we need to add up the area of the balcony and the living room.  The balcony has an area of 3.50 x 1.60 = 5.60 square meters, and the living room has an area of 3.80 x 3.90 = 14.34 square meters.  Therefore, the total area of X03 is 5.60 + 14.34 = 19.94 square meters.'

# Using Agents to get better results

In [18]:
# First Creating Tool with the above initialized Chain.
from langchain.agents import Tool

tool_desc = """Use this tool to answer users questions about the floor plan data. This tool can also be used for follow up questions from the user"""
tools = [Tool(
    func=qa_chain_chat,
    description=tool_desc,
    name='builder_bot'
)]

In [20]:
from langchain.memory import ConversationBufferMemory
def initialize_memory():

    
    memory = ConversationBufferMemory(
                    memory_key="chat_history",
                    return_messages=True,
                )
    return memory

memory_agent = initialize_memory()

In [27]:
from langchain.agents import initialize_agent
from langchain.agents import AgentType
conversational_agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION, 
    tools=tools, 
    llm=ChatOpenAI(temperature=0.1, model_name=open_ai_models[6]),
    verbose=True,
    memory=memory_agent,
)

In [28]:
sys_msg = "You are a helpful assistant for a Construction Company ,and strictly answer user queries related to floor plans of flat units and buildings."
prompt = conversational_agent.agent.create_prompt(
    system_message=sys_msg,
    tools=tools
)
conversational_agent.agent.llm_chain.prompt = prompt

In [30]:
conversational_agent.run({'input': "What is the total area of the Unit X03?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "builder_bot",
    "action_input": "What is the total area of Unit X03?"
}[0m
Observation: [36;1m[1;3m{'question': 'What is the total area of Unit X03?', 'chat_history': [HumanMessage(content='How many units are present in the building?'), AIMessage(content=' Based on the information provided, there are 3 units in the building.'), HumanMessage(content='Which unit has the largest bedroom?'), AIMessage(content='\nThe unit with the largest bedroom size is Unit x02, with a size of 64.17 square meters (691 square feet).'), HumanMessage(content='What is the dimension of the X05 bathroom?'), AIMessage(content='\n\nThe dimension of the X05 bathroom is 210 x 165.'), HumanMessage(content='On which page number I will find the floor plan for X07?'), AIMessage(content=' Page number 4.'), HumanMessage(content='What is the total area of X03?'), AIMessage(content='\n\nThe total area of X03 is 398 square feet.\n\nPlease pro

'The total area of Unit X03 is 36.97 square meters or 398 square feet.'