In [1]:

import glob
import pprint
import haystack
import pandas as pd
from haystack.nodes import FARMReader
from haystack.nodes import BM25Retriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes.file_converter import CsvTextConverter
from haystack.pipelines.standard_pipelines import TextIndexingPipeline

In [2]:
documentStore = InMemoryDocumentStore(use_bm25=True)

In [3]:
help(haystack.pipelines.standard_pipelines)

Help on module haystack.pipelines.standard_pipelines in haystack.pipelines:

NAME
    haystack.pipelines.standard_pipelines

CLASSES
    abc.ABC(builtins.object)
        BaseStandardPipeline
            DocumentSearchPipeline
            ExtractiveQAPipeline
            FAQPipeline
            MostSimilarDocumentsPipeline
            QuestionAnswerGenerationPipeline
            QuestionGenerationPipeline
            RetrieverQuestionGenerationPipeline
            SearchSummarizationPipeline
            TextIndexingPipeline
            TranslationWrapperPipeline
            WebQAPipeline
    
    class BaseStandardPipeline(abc.ABC)
     |  Base class for pre-made standard Haystack pipelines.
     |  This class does not inherit from Pipeline.
     |  
     |  Method resolution order:
     |      BaseStandardPipeline
     |      abc.ABC
     |      builtins.object
     |  
     |  Methods defined here:
     |  
     |  add_node(self, component, name: str, inputs: List[str])
     |      Ad

In [4]:
files2index = ['../../dataStuff/dataFetch/cityData/cityDataStuff.txt']
files2index

['../../dataStuff/dataFetch/cityData/cityDataStuff.txt']

In [8]:
indexing_pipeline = TextIndexingPipeline(documentStore)
indexing_pipeline.run_batch(file_paths=files2index)

Converting files: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]
Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Document 6bef9d747eda9b64a5bfa0f7dcca8d00 is 41119 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document 2acdc08cecd2d4fc22a74cfb74710c68 is 31119 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document 617c8b449cc4fa73399c5d2be5da2ba9 is 21119 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affe

{'documents': [<Document: {'content': 'The Population of Chümoukedima District/City is 125400\nThe Population of Dimapur District/City is 379769\nThe Population of Kiphire District/City is 74033\nThe Population of Kohima District/City is 270063\nThe Population of Longleng District/City is 50593\nThe Population of Mokokchung District/City is 193171\nThe Population of Mon District/City is 259604\nThe Population of Niuland District/City is 42287\nThe Population of Noklak District/City is 59300\nThe Population of Peren District/City is 163294\nThe Population of Phek District/City is 163294\nThe Population of Shamator District/City is 34223\nThe Population of Tseminyü District/City is 63629\nThe Population of Tuensang District/City is 414801\nThe Population of Wokha District/City is 166239\nThe Population of Zunheboto District/City is 141014\nThe Population of Amritsar District/City is 2490891\nThe Population of Barnala District/City is 596294\nThe Population of Bathinda District/City is 13

In [9]:
retrieverStuff = BM25Retriever(document_store=documentStore)

In [10]:
readerStuff = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

In [11]:

pipe = ExtractiveQAPipeline(readerStuff, retrieverStuff)

In [17]:
prediction = pipe.run(
    query="What is the density of coimbatore?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)

Inferencing Samples:   0%|          | 0/5 [00:00<?, ? Batches/s]

In [None]:
pprint.pprint(prediction)

{'answers': [<Answer {'answer': '3472578', 'type': 'extractive', 'score': 0.05903422832489014, 'context': ' District/City is 7139882\nThe Population of Coimbatore District/City is 3472578\nThe Population of Cuddalore District/City is 2600880\nThe Population of', 'offsets_in_document': [{'start': 3310, 'end': 3317}], 'offsets_in_context': [{'start': 72, 'end': 79}], 'document_ids': ['6bef9d747eda9b64a5bfa0f7dcca8d00'], 'meta': {'_split_id': 1}}>,
             <Answer {'answer': '900', 'type': 'extractive', 'score': 0.03907295688986778, 'context': 'ad District/City is 1124176\nThe Population of Kaimur District/City is 1626900\nThe Population of Katihar District/City is 3068149\nThe Population of Kha', 'offsets_in_document': [{'start': 6105, 'end': 6108}], 'offsets_in_context': [{'start': 74, 'end': 77}], 'document_ids': ['cd037a7a6ce194856391cf9488b83e20'], 'meta': {'_split_id': 1}}>,
             <Answer {'answer': '6916', 'type': 'extractive', 'score': 0.032632190734148026, 'context': 