In [18]:

import glob
import pprint
import haystack
import pandas as pd
from haystack.nodes import FARMReader
from haystack.nodes import BM25Retriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines.standard_pipelines import TextIndexingPipeline


In [7]:
documentStore = InMemoryDocumentStore(use_bm25=True)

In [8]:
help(haystack.pipelines.standard_pipelines)

Help on module haystack.pipelines.standard_pipelines in haystack.pipelines:

NAME
    haystack.pipelines.standard_pipelines

CLASSES
    abc.ABC(builtins.object)
        BaseStandardPipeline
            DocumentSearchPipeline
            ExtractiveQAPipeline
            FAQPipeline
            MostSimilarDocumentsPipeline
            QuestionAnswerGenerationPipeline
            QuestionGenerationPipeline
            RetrieverQuestionGenerationPipeline
            SearchSummarizationPipeline
            TextIndexingPipeline
            TranslationWrapperPipeline
            WebQAPipeline
    
    class BaseStandardPipeline(abc.ABC)
     |  Base class for pre-made standard Haystack pipelines.
     |  This class does not inherit from Pipeline.
     |  
     |  Method resolution order:
     |      BaseStandardPipeline
     |      abc.ABC
     |      builtins.object
     |  
     |  Methods defined here:
     |  
     |  add_node(self, component, name: str, inputs: List[str])
     |      Ad

In [9]:
files2index = ['../../dataStuff/dataFetch/cityData/cityDataStuff.csv']
files2index

['../../dataStuff/dataFetch/cityData/cityDataStuff.csv']

In [10]:
df = pd.read_csv(files2index[0])
df

Unnamed: 0,Code,District,Headquarters,Population,Area (km2),Density (/km2),State/UnionTerritory,City Tier
0,–,Chümoukedima,Chümoukedima,125400.0,570.0,220,Nagaland,Tier III
1,DI,Dimapur,Dimapur,379769.0,926.0,410,Nagaland,Tier III
2,KI,Kiphire,Kiphire,74033.0,1255.0,66,Nagaland,Tier III
3,KO,Kohima,Kohima,270063.0,1041.0,213,Nagaland,Tier III
4,LO,Longleng,Longleng,50593.0,885.0,89,Nagaland,Tier III
...,...,...,...,...,...,...,...,...
783,ST,Sitamarhi,Sitamarhi,3419622.0,2199,1491,Bihar,Tier III
784,SW,Siwan,Siwan,3318176.0,2219,1495,Bihar,Tier III
785,SU,Supaul,Supaul,2228397.0,2410,919,Bihar,Tier III
786,VA,Vaishali,Hajipur,3495021.0,2036,1717,Bihar,Tier III


In [11]:
indexing_pipeline = TextIndexingPipeline(documentStore)
indexing_pipeline.run_batch(file_paths=files2index)

Converting files: 100%|██████████| 1/1 [00:00<00:00, 31.24it/s]
Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Document 99337c8e427a7714299029739f418586 is 41844 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document 64375e228ebbffd61edad66deca714eb is 31844 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document 40bf51d90b2360cdbaa5e43f6f0f1e1 is 21844 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affec

{'documents': [<Document: {'content': 'Code,District,Headquarters,Population,Area (km2),Density (/km2),State/UnionTerritory,City Tier\n–,Chümoukedima,Chümoukedima,125400,570.0,220,Nagaland,Tier III\nDI,Dimapur,Dimapur,379769,926.0,410,Nagaland,Tier III\nKI,Kiphire,Kiphire,74033,1255.0,66,Nagaland,Tier III\nKO,Kohima,Kohima,270063,1041.0,213,Nagaland,Tier III\nLO,Longleng,Longleng,50593,885.0,89,Nagaland,Tier III\nMK,Mokokchung,Mokokchung,193171,1615.0,120,Nagaland,Tier III\nMN,Mon,Mon,259604,1786.0,145,Nagaland,Tier III\n–,Niuland,Niuland,42287,483.63,87,Nagaland,Tier III\n–,Noklak,Noklak,59300,1152.0,51,Nagaland,Tier III\nPE,Peren,Peren,163294,2300.0,55,Nagaland,Tier III\nPH,Phek,Phek,163294,2026.0,81,Nagaland,Tier III\n–,Shamator,Shamator,34223,469.0,73,Nagaland,Tier III\n–,Tseminyü,Tseminyü,63629,256.0,249,Nagaland,Tier III\nTU,Tuensang,Tuensang,414801,4228.0,98,Nagaland,Tier III\nWO,Wokha,Wokha,166239,1628.0,120,Nagaland,Tier III\nZU,Zunheboto,Zunheboto,141014,1255.0,112,Nagaland,T

In [12]:
retrieverStuff = BM25Retriever(document_store=documentStore)

In [13]:
readerStuff = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [15]:

pipe = ExtractiveQAPipeline(readerStuff, retrieverStuff)

In [16]:
prediction = pipe.run(
    query="What is the density of coimbatore?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)

Inferencing Samples: 100%|██████████| 4/4 [02:22<00:00, 35.59s/ Batches]


In [19]:
pprint.pprint(prediction)

{'answers': [<Answer {'answer': '3472578,4723,748', 'type': 'extractive', 'score': 0.7080003619194031, 'context': 'ennai,7139882,426,17000,Tamil Nadu,Tier I\nCO,Coimbatore,Coimbatore,3472578,4723,748,Tamil Nadu,Tier II\nCU,Cuddalore,Cuddalore,2600880,3999,702,Tamil N', 'offsets_in_document': [{'start': 3804, 'end': 3820}], 'offsets_in_context': [{'start': 67, 'end': 83}], 'document_ids': ['99337c8e427a7714299029739f418586'], 'meta': {'_split_id': 2}}>,
             <Answer {'answer': '2317419,3634.0,609', 'type': 'extractive', 'score': 0.2864604592323303, 'context': 'h,Bargarh,1478833,5837.0,253,Odisha,Tier III\nBW,Balasore,Balasore,2317419,3634.0,609,Odisha,Tier III\nCU,Cuttack,Cuttack,2618708,3932.0,666,Odisha,Tier', 'offsets_in_document': [{'start': 7052, 'end': 7070}], 'offsets_in_context': [{'start': 66, 'end': 84}], 'document_ids': ['40bf51d90b2360cdbaa5e43f6f0f1e1'], 'meta': {'_split_id': 2}}>,
             <Answer {'answer': '(/km2', 'type': 'extractive', 'score': 0.27642101049

ImportError: cannot import name 'component' from 'haystack' (/usr/local/anaconda3/lib/python3.9/site-packages/haystack/__init__.py)