### The module reads the pdf text from the local device and convert them into vectors.
I have used two methods.
1. TFIDF
2. Word2Vec

In [3]:
import os
import glob
import PyPDF2
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import TFLongformerForSequenceClassification, LongformerTokenizer, LongformerConfig
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-03-16 13:53:40.088281: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Read single pdf file and convert it to text.

In [4]:
# Function to read PDF files and extract text
def read_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader (file)
        text = ""
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
        return text

In [5]:
# Provide the path to your PDF file
pdf_path = 'data/rental-increase-decree-43-in-dubai.pdf'

txt = read_pdf(pdf_path)

In [6]:
txt

'1 \n  \nDecree No (43) for 2013 regarding the  \nDetermination of the Increase in R ent for Properties in the Emirate of Dubai  \n \n \n \nWe, Mohammed Bin Rashied Al Maktoum, Ruler of Dubai  \n \nAfter perusal of  Law No (9) for 2004 regarding Dubai International Financial Center and \namendments;  \nLaw No (16) for 2 007 regarding establishment of the  Real Estate Regulatory Agency;  \nLaw No (26) for 2007 re garding Regulation the R elationship between Landlords and \nTenants of Prop erties in the Emirate of Dubai and its amendments;  \nDecree No (22) for 2009 regarding Private Development Zones in the Emirate of Dubai;  \nDecree No (2) for 2011 regarding Rents in the Emirate of Dubai;  \nDecree No (26) for 2013 regarding Rental Disputes settle ment Center in the Emirate of \nDubai, and  \nLegislation regulating free zones in the Emirate of Dubai,   \n \nDecided the following:  \n \nPercentage of Increase  \nArticle (1)  \n \nThe maximum rent increase percentage of properties  in t

Below function tolenize the text after remove non-alphabetic character and converting them to lowercase.

In [7]:
# Function to preprocess text
def preprocess_text(text):
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Lowercase and remove non-alphabetic characters
    tokens = [token.lower() for token in tokens if token.isalpha()]
    return tokens

In [8]:
tkns = preprocess_text(txt)

In [9]:
tkns

['decree',
 'no',
 'for',
 'regarding',
 'the',
 'determination',
 'of',
 'the',
 'increase',
 'in',
 'r',
 'ent',
 'for',
 'properties',
 'in',
 'the',
 'emirate',
 'of',
 'dubai',
 'we',
 'mohammed',
 'bin',
 'rashied',
 'al',
 'maktoum',
 'ruler',
 'of',
 'dubai',
 'after',
 'perusal',
 'of',
 'law',
 'no',
 'for',
 'regarding',
 'dubai',
 'international',
 'financial',
 'center',
 'and',
 'amendments',
 'law',
 'no',
 'for',
 'regarding',
 'establishment',
 'of',
 'the',
 'real',
 'estate',
 'regulatory',
 'agency',
 'law',
 'no',
 'for',
 're',
 'garding',
 'regulation',
 'the',
 'r',
 'elationship',
 'between',
 'landlords',
 'and',
 'tenants',
 'of',
 'prop',
 'erties',
 'in',
 'the',
 'emirate',
 'of',
 'dubai',
 'and',
 'its',
 'amendments',
 'decree',
 'no',
 'for',
 'regarding',
 'private',
 'development',
 'zones',
 'in',
 'the',
 'emirate',
 'of',
 'dubai',
 'decree',
 'no',
 'for',
 'regarding',
 'rents',
 'in',
 'the',
 'emirate',
 'of',
 'dubai',
 'decree',
 'no',
 'for

tkns

In [10]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/tarique/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/tarique/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/tarique/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/tarique/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/tarique/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading pa

True

In [11]:
tkns

['decree',
 'no',
 'for',
 'regarding',
 'the',
 'determination',
 'of',
 'the',
 'increase',
 'in',
 'r',
 'ent',
 'for',
 'properties',
 'in',
 'the',
 'emirate',
 'of',
 'dubai',
 'we',
 'mohammed',
 'bin',
 'rashied',
 'al',
 'maktoum',
 'ruler',
 'of',
 'dubai',
 'after',
 'perusal',
 'of',
 'law',
 'no',
 'for',
 'regarding',
 'dubai',
 'international',
 'financial',
 'center',
 'and',
 'amendments',
 'law',
 'no',
 'for',
 'regarding',
 'establishment',
 'of',
 'the',
 'real',
 'estate',
 'regulatory',
 'agency',
 'law',
 'no',
 'for',
 're',
 'garding',
 'regulation',
 'the',
 'r',
 'elationship',
 'between',
 'landlords',
 'and',
 'tenants',
 'of',
 'prop',
 'erties',
 'in',
 'the',
 'emirate',
 'of',
 'dubai',
 'and',
 'its',
 'amendments',
 'decree',
 'no',
 'for',
 'regarding',
 'private',
 'development',
 'zones',
 'in',
 'the',
 'emirate',
 'of',
 'dubai',
 'decree',
 'no',
 'for',
 'regarding',
 'rents',
 'in',
 'the',
 'emirate',
 'of',
 'dubai',
 'decree',
 'no',
 'for

### TFIDF to convert text to vectors

In [12]:
# Function to convert text to TF-IDF vectors
def text_to_tfidf_vectors(texts):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    return tfidf_matrix

In [13]:
texts = [read_pdf(pdf_path)]

In [14]:
texts

['1 \n  \nDecree No (43) for 2013 regarding the  \nDetermination of the Increase in R ent for Properties in the Emirate of Dubai  \n \n \n \nWe, Mohammed Bin Rashied Al Maktoum, Ruler of Dubai  \n \nAfter perusal of  Law No (9) for 2004 regarding Dubai International Financial Center and \namendments;  \nLaw No (16) for 2 007 regarding establishment of the  Real Estate Regulatory Agency;  \nLaw No (26) for 2007 re garding Regulation the R elationship between Landlords and \nTenants of Prop erties in the Emirate of Dubai and its amendments;  \nDecree No (22) for 2009 regarding Private Development Zones in the Emirate of Dubai;  \nDecree No (2) for 2011 regarding Rents in the Emirate of Dubai;  \nDecree No (26) for 2013 regarding Rental Disputes settle ment Center in the Emirate of \nDubai, and  \nLegislation regulating free zones in the Emirate of Dubai,   \n \nDecided the following:  \n \nPercentage of Increase  \nArticle (1)  \n \nThe maximum rent increase percentage of properties  in 

In [15]:
tfidf_matrix = text_to_tfidf_vectors(texts)



In [17]:
tfidf_matrix.toarray()

array([[0.01527831, 0.01527831, 0.03055662, 0.03055662, 0.03055662,
        0.12222647, 0.01527831, 0.01527831, 0.03055662, 0.01527831,
        0.01527831, 0.07639154, 0.01527831, 0.01527831, 0.10694816,
        0.01527831, 0.07639154, 0.06111323, 0.03055662, 0.01527831,
        0.04583492, 0.01527831, 0.01527831, 0.01527831, 0.01527831,
        0.01527831, 0.10694816, 0.01527831, 0.01527831, 0.01527831,
        0.03055662, 0.01527831, 0.21389632, 0.01527831, 0.13750477,
        0.01527831, 0.01527831, 0.01527831, 0.01527831, 0.03055662,
        0.03055662, 0.01527831, 0.01527831, 0.15278308, 0.03055662,
        0.01527831, 0.01527831, 0.01527831, 0.01527831, 0.07639154,
        0.21389632, 0.03055662, 0.06111323, 0.01527831, 0.03055662,
        0.01527831, 0.01527831, 0.03055662, 0.03055662, 0.04583492,
        0.01527831, 0.01527831, 0.07639154, 0.01527831, 0.03055662,
        0.01527831, 0.01527831, 0.03055662, 0.01527831, 0.10694816,
        0.53474079, 0.03055662, 0.03055662, 0.01

#### Word2Vec to convert text to vectors

In [18]:
# Provide the path to your PDF file
pdf_directory = 'data'
# Read PDF files and extract text
pdf_files = glob.glob(os.path.join(pdf_directory, '*.pdf'))

In [19]:
pdf_files

['data/book.pdf',
 'data/tenancyguideen.pdf',
 'data/tr01-survey-manual-sign_2022-04-26.pdf',
 'data/rental-increase-decree-43-in-dubai.pdf']

In [20]:
texts = [read_pdf(file) for file in pdf_files]

In [21]:
texts

 'Our Vision: \nTo be a global real estate leader in attracting investments.  \nOur Mission: \nTo create a real estate environment that applies best international \npractices to guarantee the rights of all stakeholders, and to contribute to the development of society by: •Developing and applying clear and transparent real estateregulations\n•Providing distinctive and effcient real estate services that helpattract investment\n•Increasing real estate knowledgeChapter I \nEJARI Program 6 \nTenancy GuideAbout EJARI \nEJARI is an online program developed by RERA for recording tenancy \ncontracts for all types of property in the Emirate of Dubai, pursuant to Law No. 26 of 2007, as amended by Law No. 33 of 2008. EJARI is the frst step to regulate the process of real estate lease and management. It helps upgrade this important sector and provides distinctive real estate services in terms of registration and regulation. The program aims to preserve rights, regulate the relations between the par

In [22]:
# Function to convert text to Word Embeddings using Word2Vec
def text_to_word_embeddings(texts):
    tokenized_texts = [preprocess_text(text) for text in texts]
    word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
    return word2vec_model

In [23]:
word2vec_model = text_to_word_embeddings(texts)

In [24]:
word2vec_model

<gensim.models.word2vec.Word2Vec at 0x15cf62750>

In [25]:
import numpy as np

# Assuming you have already trained your Word2Vec model and stored it in the variable `word2vec_model`

# Get the vocabulary from the model
vocabulary = word2vec_model.wv.index_to_key

# Initialize an empty matrix to store the word vectors
word_vectors_matrix = np.zeros((len(vocabulary), word2vec_model.vector_size))

# Fill the matrix with word vectors
for i, word in enumerate(vocabulary):
    word_vectors_matrix[i] = word2vec_model.wv[word]

# Now `word_vectors_matrix` contains the word vectors in matrix form
print(word_vectors_matrix)


[[-0.18438952  0.83760953  0.17891631 ... -0.88129956  0.18032031
   0.06003939]
 [-0.17798465  0.76311445  0.16873004 ... -0.7892167   0.14853454
   0.06509221]
 [-0.21578877  0.99234998  0.19672678 ... -1.01583087  0.22753797
   0.07208512]
 ...
 [ 0.00237486 -0.00653813 -0.00426267 ... -0.00658685  0.00890439
   0.00444756]
 [-0.00399155  0.00690683  0.00108764 ...  0.00322931  0.00971037
  -0.00634383]
 [-0.00710464  0.02577048  0.00415421 ... -0.01029505 -0.00358413
   0.0070907 ]]


In [26]:
word_vectors_matrix.shape

(2633, 100)

### Below code utilizes Langchain's FAISS technique to convert all the pdf data into vectors. Then OpenAI api is called for QnA.

In [27]:
from dotenv import load_dotenv
import os
from PyPDF2 import PdfReader
import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback

# Load environment variables
load_dotenv()

True

In [28]:
def process_text(text):
    # Split the text into chunks using Langchain's CharacterTextSplitter
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split(text)
    
    
    # Convert the chunks of text into embeddings to form a knowledge base
    embeddings = OpenAIEmbeddings()
    knowledgeBase = FAISS.from_texts(chunks, embeddings)
    
    return knowledgeBase

In [31]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("data/book.pdf")
pages = loader.load_and_split()

In [32]:
pages[0]

Document(page_content='2019 Issue Year', metadata={'source': 'data/book.pdf', 'page': 1})

In [33]:
from dotenv import load_dotenv
import os
# Load environment variables
load_dotenv()

True

In [34]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())

  warn_deprecated(


In [35]:
import openai
import langchain

from langchain.chat_models import ChatOpenAI


In [36]:
llm = ChatOpenAI(temperature=0, openai_api_key="OPEN_AI_API_KEY")

  warn_deprecated(


In [37]:
question = "what is Article 18 all about?"
docs = faiss_index.similarity_search(question)
from langchain.chains.question_answering import load_qa_chain

chain = load_qa_chain(llm, chain_type="stuff")
chain.run(input_documents=docs, question=question)

  warn_deprecated(


'Article 18 mentioned in the context provided is related to Real Property Brokers. It states that a Broker may not broker any deal that violates the laws or regulations in force in the Emirate.'