<a href="https://colab.research.google.com/github/TanJiaTing/AIP/blob/master/Haystack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install git+https://github.com/deepset-ai/haystack.git

Collecting git+https://github.com/deepset-ai/haystack.git
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-req-build-oriymsxj
  Running command git clone -q https://github.com/deepset-ai/haystack.git /tmp/pip-req-build-oriymsxj
Collecting farm==0.4.6
[?25l  Downloading https://files.pythonhosted.org/packages/e2/93/1beb613753a9845b689eee4571ba4a7f3210b60b4bd90f024fc324c96785/farm-0.4.6-py3-none-any.whl (184kB)
[K     |████████████████████████████████| 194kB 5.7MB/s 
[?25hCollecting fastapi
[?25l  Downloading https://files.pythonhosted.org/packages/31/24/8586a2a6ec815df9a8ac1296a4c7910314307ad3144e620af29bb77b7180/fastapi-0.59.0-py3-none-any.whl (49kB)
[K     |████████████████████████████████| 51kB 6.3MB/s 
[?25hCollecting uvicorn
[?25l  Downloading https://files.pythonhosted.org/packages/ac/3e/c4170b6c04e897fc3ef24d98626e437de9a1ed3bb334ce172285f6be0a57/uvicorn-0.11.5-py3-none-any.whl (43kB)
[K     |████████████████████████████████| 51kB 6.4MB/s 
[?25hCollecting

In [None]:

from haystack import Finder
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

In [None]:

# In-Memory Document Store: the single store instance of this notebook.
# Later cells write both document sets into it and build the retriever on top of it.
from haystack.database.memory import InMemoryDocumentStore
document_store = InMemoryDocumentStore()

In [None]:

# Download the demo corpus, convert it to dicts and write it to the document store.
# Here: 517 Wikipedia articles for Game of Thrones
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert files to dicts
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# We now have a list of dictionaries that we can write to our document store.
# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself.
# As the sample printed by this cell shows, each entry has the form:
#   {"text": "<the-actual-text>", "meta": {"name": "<some-document-name>"}}
# NOTE(review): an earlier version of this comment said extra key-value pairs are indexed as
# fields in Elasticsearch; this notebook uses the in-memory store, so confirm where
# additional metadata should go (presumably under "meta") and how it is exposed.

# Let's have a look at the first 3 entries:
print(dicts[:3])

# Now, let's write the dicts containing documents to our DB.
# NOTE(review): re-running this cell calls write_documents() again on the same store;
# whether that duplicates entries is not visible from here - confirm before re-running.
document_store.write_documents(dicts)

07/17/2020 09:23:15 - INFO - haystack.indexing.utils -   Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip to `data/article_txt_got`
100%|██████████| 1167348/1167348 [00:00<00:00, 1324867.93B/s]


[{'text': "'''Gilly''' is a fictional character in the ''A Song of Ice and Fire'' series of fantasy novels by American author George R. R. Martin, and its television adaptation ''Game of Thrones''.\nIntroduced in 1998's ''A Clash of Kings'', she is a wildling from the wild lands north of the Wall who is befriended by Samwell Tarly and Jon Snow. She subsequently appeared in Martin's ''A Storm of Swords'' (2000), ''A Feast for Crows'' (2005), ''A Dance with Dragons'' (2011), and will appear in the upcoming novel ''The Winds of Winter''.\nGilly is portrayed by Hannah Murray in the HBO television adaptation.", 'meta': {'name': '262_Gilly__character_.txt'}}, {'text': '\n==Character description==\nGilly is a wildling girl, daughter and wife of Craster. She is in her late teens, has brown eyes and is estimated to be around 15 or 16 years old when she first appears in the novel.\nGilly is not a point of view character in the novels, so her actions are witnessed and interpreted through the eyes

In [None]:
# Load the user-supplied documents from data/custom, using the same conversion
# settings as for the demo corpus.
# NOTE(review): clean_wiki_text is reused here; confirm it is appropriate if the
# custom files are not Wikipedia dumps.
temp = convert_files_to_dicts(dir_path="data/custom", clean_func=clean_wiki_text, split_paragraphs=True)

# An empty result means nothing from data/custom will be added to the document
# store, and the notebook would carry on without any hint. Make that visible
# instead of failing silently (no exception is raised, so later cells still run).
if not temp:
    print("WARNING: no documents were loaded from 'data/custom' - check that the directory exists and contains files.")

In [None]:
# Add the custom documents held in `temp` to the same store the demo corpus was written to.
document_store.write_documents(temp)

In [None]:

# An in-memory TfidfRetriever based on Pandas dataframes
# NOTE(review): the log line recorded for this cell reports how many paragraphs/docs
# were found in the DB, which suggests the store is read when the retriever is
# constructed - presumably documents written afterwards require re-running this cell; confirm.

from haystack.retriever.sparse import TfidfRetriever
retriever = TfidfRetriever(document_store=document_store)

07/17/2020 09:26:07 - INFO - haystack.retriever.sparse -   Found 1 candidate paragraphs from 1 docs in DB


In [None]:

# Reader: backed by a QA model that is given either as a local path or as the
# name of a model on Hugging Face's model hub (https://huggingface.co/models).
# NOTE(review): use_gpu=True is requested, yet the log recorded for this cell
# shows "device: cpu" - presumably the runtime had no GPU attached; confirm.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)

07/17/2020 09:24:07 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
07/17/2020 09:24:07 - INFO - farm.infer -   Could not find `deepset/roberta-base-squad2` locally. Try to download from model hub ...
07/17/2020 09:24:07 - INFO - filelock -   Lock 140651990528744 acquired on /root/.cache/torch/transformers/f7d4b9379a9c487fa03ccf3d8e00058faa9d664cf01fc03409138246f48760da.c6288e0f84ec797ba5c525c923a5bbc479b47c761aded9734a5f6a473b044c8d.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=559.0, style=ProgressStyle(description_…

07/17/2020 09:24:07 - INFO - filelock -   Lock 140651990528744 released on /root/.cache/torch/transformers/f7d4b9379a9c487fa03ccf3d8e00058faa9d664cf01fc03409138246f48760da.c6288e0f84ec797ba5c525c923a5bbc479b47c761aded9734a5f6a473b044c8d.lock





07/17/2020 09:24:07 - INFO - filelock -   Lock 140651990570432 acquired on /root/.cache/torch/transformers/8c0c8b6371111ac5fbc176aefcf9dbe129db7be654c569b8375dd3712fc4dc67.d045adc91e17ecdf7dc3eeff4c875df94bdf2eb749d72b3ae47ae93f8e85213c.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=498637366.0, style=ProgressStyle(descri…

07/17/2020 09:24:22 - INFO - filelock -   Lock 140651990570432 released on /root/.cache/torch/transformers/8c0c8b6371111ac5fbc176aefcf9dbe129db7be654c569b8375dd3712fc4dc67.d045adc91e17ecdf7dc3eeff4c875df94bdf2eb749d72b3ae47ae93f8e85213c.lock





	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
07/17/2020 09:24:35 - INFO - filelock -   Lock 140651990638544 acquired on /root/.cache/torch/transformers/1e3af82648d7190d959a9d76d727ef629b1ca51b3da6ad04039122453cb56307.6a4061e8fc00057d21d80413635a86fdcf55b6e7594ad9e25257d2f99a02f4be.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…

07/17/2020 09:24:36 - INFO - filelock -   Lock 140651990638544 released on /root/.cache/torch/transformers/1e3af82648d7190d959a9d76d727ef629b1ca51b3da6ad04039122453cb56307.6a4061e8fc00057d21d80413635a86fdcf55b6e7594ad9e25257d2f99a02f4be.lock





07/17/2020 09:24:36 - INFO - filelock -   Lock 140651990638544 acquired on /root/.cache/torch/transformers/b901c69e8e7da4a24c635ad81d016d274f174261f4f5c144e43f4b00e242c3b0.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

07/17/2020 09:24:37 - INFO - filelock -   Lock 140651990638544 released on /root/.cache/torch/transformers/b901c69e8e7da4a24c635ad81d016d274f174261f4f5c144e43f4b00e242c3b0.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock





07/17/2020 09:24:37 - INFO - filelock -   Lock 140651990638544 acquired on /root/.cache/torch/transformers/2d9b03b59a8af464bf4238025a3cf0e5a340b9d0ba77400011e23c130b452510.16f949018cf247a2ea7465a74ca9a292212875e5fd72f969e0807011e7f192e4.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=150.0, style=ProgressStyle(description_…

07/17/2020 09:24:38 - INFO - filelock -   Lock 140651990638544 released on /root/.cache/torch/transformers/2d9b03b59a8af464bf4238025a3cf0e5a340b9d0ba77400011e23c130b452510.16f949018cf247a2ea7465a74ca9a292212875e5fd72f969e0807011e7f192e4.lock





07/17/2020 09:24:38 - INFO - filelock -   Lock 140651988962832 acquired on /root/.cache/torch/transformers/507984f2e28c7dfed5db9a20acd68beb969c7f2833abc9e582e967fa0291f3dc.100c88dbe27dbd73822c575274ade4eb2427596ac56e96769249b7512341654d.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=189.0, style=ProgressStyle(description_…

07/17/2020 09:24:39 - INFO - filelock -   Lock 140651988962832 released on /root/.cache/torch/transformers/507984f2e28c7dfed5db9a20acd68beb969c7f2833abc9e582e967fa0291f3dc.100c88dbe27dbd73822c575274ade4eb2427596ac56e96769249b7512341654d.lock





07/17/2020 09:24:39 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
07/17/2020 09:24:39 - INFO - farm.infer -   Got ya 1 parallel workers to do inference ...
07/17/2020 09:24:39 - INFO - farm.infer -    0 
07/17/2020 09:24:39 - INFO - farm.infer -   /w\
07/17/2020 09:24:39 - INFO - farm.infer -   /'\
07/17/2020 09:24:39 - INFO - farm.infer -   


In [None]:

# Combine the reader and the retriever into the Finder that the following cells query.
finder = Finder(reader, retriever)

In [None]:
# Ask a single question through the Finder and keep the result for printing.
# NOTE(review): top_k_retriever / top_k_reader presumably cap the number of
# retrieved documents and returned answers at one each - confirm against the API.
prediction = finder.get_answers(
    question="When is the deadline for agents of non-scheduled flights to submit their slot requests?",
    top_k_retriever=1,
    top_k_reader=1,
)


07/17/2020 09:26:15 - INFO - haystack.finder -   Reader is looking for detailed answer in 397397 chars ...
Inferencing Samples: 100%|██████████| 17/17 [11:12<00:00, 39.55s/ Batches]


In [None]:
# Print the prediction; with details="minimal" the recorded output lists only
# the 'answer' and its 'context' for each result.
print_answers(prediction, details="minimal")

[   {   'answer': '7 calendar days',
        'context': ' their slot requests to the Changi Slot Coordinator no '
                   'earlier than 7 calendar days and but no later than 24 '
                   'hours prior to the operation of the fligh'}]


In [None]:
# Inspect the custom documents loaded from data/custom; the recorded output of
# this cell is an empty list.
temp

[]