In [16]:
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import fetch_archive_from_http,clean_wiki_text,convert_files_to_docs,print_answers
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import BM25Retriever, FARMReader
from haystack.nodes import RAGenerator, DensePassageRetriever
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack import Document
import logging
import os


# Keep third-party libraries quiet (WARNING and above) while still surfacing
# Haystack's own INFO-level progress messages.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
logging.getLogger("haystack").setLevel(logging.INFO)

In [17]:
import pandas as pd

# Read the cleaned PSSCOC spreadsheet (columns "title" and "text") and
# replace missing cells with empty strings in one step.
df = pd.read_excel("PSSCOC_Cleaned.xlsx").fillna(value="")

# Quick look at the first rows, then keep a copy of the cleaned frame on disk.
print(df.head())
df.to_excel("df.xlsx")

# Define the directory where the text files are located
#text_files_dir = os.getcwd() + "\\TextFile"

        title  \
0  PSSCOC.pdf   
1  PSSCOC.pdf   
2  PSSCOC.pdf   
3  PSSCOC.pdf   
4  PSSCOC.pdf   

                                                                              text  
0                                        FOR CONSTRUCTION WORKS:  2020              
1                      STANDARD CONDITIONS OF CONTRACT:  for Construction Works     
2  1 DEFINITIONS AND INTERPRETATION:    1.1 Definitions  In the Contract (as he...  
3  OFFICER'S REPRESENTATIVE:    2.1 Superintending Officer's Authority  (1) The...  
4  3 CONTRACT DOCUMENTS:    3.1 Contract Documents to be Taken as Mutually Expl...  


In [18]:
# Turn every spreadsheet row into a Haystack Document: the "text" cell is the
# content and the "title" cell is stored under meta["name"] (empty string when
# the title is falsy).
titles = list(df["title"].values)
texts = list(df["text"].values)
documents = [
    Document(content=body, meta={"name": heading or ""})
    for heading, body in zip(titles, texts)
]

In [19]:
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import RAGenerator, DensePassageRetriever


# FAISS-backed document store with a flat index.
# `return_embedding=True` makes the store hand back stored embeddings with the
# documents, so the generator doesn't have to perform re-embedding.
document_store = FAISSDocumentStore(
    sql_url="sqlite://",
    faiss_index_factory_str="Flat",
    return_embedding=True,
)
# File-backed alternative for the SQL side of the store:
#document_store = FAISSDocumentStore( sql_url="sqlite:///my_doc_store.db")


In [20]:
# Convert the text files to Document objects and write them to the document store
#file_to_index = [os.path.join(text_files_dir, f) for f in os.listdir(text_files_dir)]
#file_to_index

In [21]:
# Preview only: printing the whole list dumps the full text of every document
# into the cell output, which buries the rest of the notebook.
print(f"{len(documents)} documents; first 3:")
print(documents[:3])

[<Document: {'content': 'FOR CONSTRUCTION WORKS:  2020            ', 'content_type': 'text', 'score': None, 'meta': {'name': 'PSSCOC.pdf'}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'f29206186abb37fd00773ce967b33f76'}>, <Document: {'content': 'STANDARD CONDITIONS OF CONTRACT:  for Construction Works   ', 'content_type': 'text', 'score': None, 'meta': {'name': 'PSSCOC.pdf'}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '570b12af32649afa72d2bf43fc858c'}>, <Document: {'content': '1 DEFINITIONS AND INTERPRETATION:    1.1 Definitions  In the Contract (as hereinafter defined) the following words and expressions shall have the meanings hereby assigned to them except where the context otherwise requires:  (a) "Appendix" means the appendix to these Conditions.  (b)         "Claimed Amount" means the whole or part of any payment claimed by the Contractor in a Payment Claim pursuant to Clause 32.1(1).  (c) "Conditions" means the Standard Conditions and Particular Conditions (

In [22]:
from haystack.nodes import EmbeddingRetriever

# Dense retriever backed by a sentence-transformers model. The same model is
# used to embed the stored documents and the incoming questions.
retriever = EmbeddingRetriever(
    embedding_model="flax-sentence-embeddings/all_datasets_v3_mpnet-base",
    model_format="sentence_transformers",
    document_store=document_store,
)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.nodes.retriever.dense:Init retriever using embeddings of model flax-sentence-embeddings/all_datasets_v3_mpnet-base
  return self.fget.__get__(instance, owner)()


In [23]:
from haystack.nodes import Seq2SeqGenerator

# Sequence-to-sequence generator that writes the answer text.
generator = Seq2SeqGenerator(
    model_name_or_path="vblagoje/bart_lfqa",
)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [27]:
# Empty the store first so this cell always starts from a known state when it
# is re-run.
document_store.delete_documents()

# Write documents to document store
document_store.write_documents(documents)

# Compute an embedding for every stored document and add it to the FAISS index
document_store.update_embeddings(retriever=retriever)

# Read everything back (with embeddings) as a sanity check. Only a count and a
# single sample are printed: dumping every document together with its
# embedding vector floods the cell output.
documentZ = document_store.get_all_documents(return_embedding=True)
print(f"{len(documentZ)} documents in the store; sample:")
print(documentZ[:1])

Writing Documents:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:haystack.document_stores.faiss:Updating embeddings for 53 docs...


Updating Embedding:   0%|          | 0/53 [00:00<?, ? docs/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[<Document: {'content': '20 VALUATION OF VARIATIONS:    20.1 Valuation Methods  Subject to Clauses 19.3, 20.4 and 20.5, all variations shall be valued as follows:  (a) Where the varied work is of a similar character to, is executed under similar conditions as and does not significantly change the quantity of work described in the Contract, the Rates for the Works shall determine the valuation; or  (b) Where the varied work is of similar character to work described in the Contract  but is not executed under similar conditions of such work described in the Contract or involves significant changes in the quantity of such work described in the Contract, the Rates for the Works shall be the basis for determining the valuation but with a fair allowance for any differences in such conditions and/or quantity; or  (c) Where (a) and (b) above do not apply, then by measurement and valuation at fair market rates and prices;  (d) Where none of the above methods is applicable or appropriate in the c

In [26]:
# Sanity check: show which document store object `document_store` is bound to
# (prints the object's default repr).
print(document_store)

<haystack.document_stores.faiss.FAISSDocumentStore object at 0x000001C672089670>


In [25]:
# Remember the notebook's working directory; later cells build output paths
# from it.
current_directory = os.getcwd()
print(current_directory)

C:\Users\Admin\OneDrive\Data Science\PPA No Github\QuestionAnswerGeneration\OpenDomainQA\LandingQA


In [14]:
import json

# Persist the existing `document_store`. Constructing a new FAISSDocumentStore
# here would rebind the name to an empty store and save an empty index instead
# of the one that was populated and embedded.
# NOTE(review): if the store was created with an in-memory SQL URL
# ("sqlite://"), the saved FAISS index cannot be paired with its documents in a
# new session - confirm whether a file-backed sql_url is wanted.
document_store.save(
    index_path=os.path.join(current_directory, "PSSCOC1_index.faiss"),
    config_path=os.path.join(current_directory, "PSSCOC1_config.json"),
)

# Export documents to a JSON file.
# Document objects are not JSON-serialisable, so their dict form is dumped.
# Embeddings are left out because numpy arrays cannot be written by `json`.
all_docs = document_store.get_all_documents(return_embedding=False)
with open("PSSCOC_documents.json", "w", encoding="utf-8") as f:
    json.dump([doc.to_dict() for doc in all_docs], f)

In [17]:
# Prompts sent through the generative QA pipeline: four questions about the
# PSSCOC conditions of contract plus one free-form drafting request.
QUESTIONS = [
    "What must happen for the Employer to terminate the contract in PSSCOC?",
    "What happens when a Contractor is terminated for default in PSSCOC?",
    "What are the liquidated damages after termination in PSSCOC?",
    "Is the contractor responsible for damages caused by the subcontractor in PSSCOC?",
    "Write me a contract provision on a Cafe within an underground rail system",
]

In [18]:
from haystack.document_stores import FAISSDocumentStore

# Save the existing `document_store` instead of constructing a new
# FAISSDocumentStore first: a freshly initialised store is empty, so rebinding
# the name here would discard the working store and persist an empty index.
document_store.save(index_path="my_faiss_index.faiss")
# Saving the document store creates two files: my_faiss_index.faiss and my_faiss_index.json



In [15]:
# Generative QA with the ready-made pipeline class: for each prompt the
# retriever fetches the 5 best-matching documents and the generator returns a
# single answer, printed in its most compact form.
from haystack.pipelines import GenerativeQAPipeline
from haystack.utils import print_answers

pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
for question in QUESTIONS:
    res = pipe.run(
        query=question,
        params={
            "Retriever": {"top_k": 5},
            "Generator": {"top_k": 1},
        },
    )
    print_answers(res, details="minimum")

NameError: name 'QUESTIONS' is not defined

In [16]:
# Write the output of `pip freeze` to requirements.txt via a shell escape.
# NOTE(review): `!pip` runs whichever pip is first on PATH, which may not be
# the kernel's environment - confirm before relying on this file.
!pip freeze > requirements.txt


In [1]:
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import fetch_archive_from_http,clean_wiki_text,convert_files_to_docs,print_answers
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import BM25Retriever, FARMReader
from haystack.nodes import RAGenerator, DensePassageRetriever
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack import Document
import logging
import os
from haystack.document_stores.faiss import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever, Seq2SeqGenerator
import pandas as pd

# Create dataframe with columns "title" and "text"
df = pd.read_excel("PSSCOC_Cleaned.xlsx")
# Minimal cleaning: replace missing cells with empty strings
df.fillna(value="", inplace=True)

print(df.head())
df.to_excel("df.xlsx")

# Use data to initialize Document objects: "text" becomes the content and
# "title" is kept under meta["name"] (empty string when the title is falsy).
titles = list(df["title"].values)
texts = list(df["text"].values)
documents = []
for title, text in zip(titles, texts):
    documents.append(Document(content=text, meta={"name": title or ""}))

# Location of the saved FAISS index. Haystack stores the matching config next
# to it with a .json suffix.
YOUR_PATH = os.getcwd()
FAISS_INDEX_PATH = os.path.join(YOUR_PATH, "faiss2_document_store")
index_already_saved = os.path.exists(FAISS_INDEX_PATH)

# Re-running this cell must not construct a fresh, empty FAISS index on top of
# an already populated faiss.db: newer Haystack 1.x releases raise a ValueError
# because the SQL document count no longer matches the FAISS embedding count.
# When an index was saved by an earlier run, load it together with its config
# instead of rebuilding. To force a rebuild, delete the saved index, its .json
# config and faiss.db.
if index_already_saved:
    document_store = FAISSDocumentStore.load(FAISS_INDEX_PATH)
else:
    document_store = FAISSDocumentStore(sql_url="sqlite:///faiss.db")

# initialize EmbeddingRetriever Retriever to encode documents, encode question and query documents
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="flax-sentence-embeddings/all_datasets_v3_mpnet-base",
    model_format="sentence_transformers"
)

if not index_already_saved:
    # First run: write documents to the index, embed them, and persist the
    # result so later runs can take the load branch above.
    document_store.write_documents(documents)
    document_store.update_embeddings(retriever=retriever)
    document_store.save(FAISS_INDEX_PATH)

# create Seq2SeqGenerator object
generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")

        title  \
0  PSSCOC.pdf   
1  PSSCOC.pdf   
2  PSSCOC.pdf   
3  PSSCOC.pdf   
4  PSSCOC.pdf   

                                                                              text  
0                                        FOR CONSTRUCTION WORKS:  2020              
1                      STANDARD CONDITIONS OF CONTRACT:  for Construction Works     
2  1 DEFINITIONS AND INTERPRETATION:    1.1 Definitions  In the Contract (as he...  
3  OFFICER'S REPRESENTATIVE:    2.1 Superintending Officer's Authority  (1) The...  
4  3 CONTRACT DOCUMENTS:    3.1 Contract Documents to be Taken as Mutually Expl...  


Writing Documents:   0%|          | 0/53 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


Updating Embedding:   0%|          | 0/53 [00:00<?, ? docs/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]