## Prerequisites

In [None]:
 !pip install transformers
 !pip install trafilatura
 !pip install wikipedia

## Imports

In [13]:
from transformers import pipeline
# The same Hugging Face checkpoint supplies both the model weights and the
# tokenizer, so its identifier is written once and reused.
QA_MODEL_NAME = "henryk/bert-base-multilingual-cased-finetuned-polish-squad2"

# Extractive question-answering pipeline used by find_answer().
qa_pipeline = pipeline("question-answering", model=QA_MODEL_NAME, tokenizer=QA_MODEL_NAME)

In [29]:
from typing import List
from urllib.parse import urlsplit

from googlesearch import search
import requests
import trafilatura
import wikipedia

## Source

In [35]:
def get_urls(query: str, max_results: int = 3) -> List[str]:
    """Search Google for ``query`` and keep at most one result URL per site.

    Args:
        query: Search phrase passed to ``googlesearch.search``.
        max_results: Value passed as ``stop`` to the search call, i.e. how
            many hits to request before filtering. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        Result URLs in search order. URLs containing a ``#`` are skipped, and
        only the first URL seen for each host is kept.
    """
    hits = search(query, tld='com', lang="pl",
                  start=0, stop=max_results, pause=1.0)
    seen_hosts = set()
    res_urls = []
    for url in hits:
        # Fragment links point into a page rather than at a page of their own.
        if "#" in url:
            continue
        # Deduplicate on the host rather than on a fixed-length URL prefix:
        # a prefix misses short hosts and conflates long, distinct ones.
        # Fall back to the whole URL when no host can be parsed.
        host = urlsplit(url).netloc.lower() or url
        if host not in seen_hosts:
            seen_hosts.add(host)
            res_urls.append(url)
    return res_urls


def get_context(url: str, max_chars: int = 500) -> str:
    """Fetch the text found at ``url`` and return its beginning.

    Wikipedia URLs (but not Wikimedia ones) are resolved through the
    ``wikipedia`` package: the page is downloaded, the article title is read
    from its ``<title>`` tag and the article content is requested by that
    title. Every other URL, and any Wikipedia page whose title cannot be
    read, goes through ``trafilatura``.

    The context is truncated because answering one question took too long
    on full-length texts.

    Args:
        url: Address of the page to read.
        max_chars: Maximum number of characters to return (default 500).

    Returns:
        Up to ``max_chars`` characters of text, or an empty string when
        nothing could be downloaded or extracted.
    """
    wikipedia.set_lang("pl")
    context = None
    if "wikipedia" in url and "wikimedia" not in url:
        # A timeout keeps one unresponsive server from blocking the notebook.
        html = requests.get(url, timeout=10).text
        # partition() yields empty strings when the tag is absent, so a page
        # without <title> gives an empty title instead of a garbage slice.
        raw_title = html.partition('<title>')[2].partition('</title>')[0]
        # Polish Wikipedia titles look like
        # "<article> – Wikipedia, wolna encyklopedia"; cut at the last
        # " – " (en dash) instead of assuming a fixed suffix length.
        title = raw_title.rsplit(" – ", 1)[0].strip()
        if title:
            context = wikipedia.page(title).content
    if context is None:
        downloaded = trafilatura.fetch_url(url)
        # Skip extraction when the download produced nothing.
        context = trafilatura.extract(downloaded) if downloaded else None
    # Extraction may yield None; callers test the result for truthiness.
    return (context or "")[:max_chars]


def find_answer(question: str) -> str:
    """Answer ``question`` using text gathered from web search results.

    For every URL returned by ``get_urls`` the page context is fetched with
    ``get_context`` and passed, together with the question, to
    ``qa_pipeline``. Collection stops early as soon as a page yields an
    answer text that an earlier page already produced. The collected
    candidates are then ranked by ``determine_best_answer``.

    Args:
        question: The question to answer.

    Returns:
        The answer text of the best-scoring candidate, or an empty string
        when no page produced a usable context.
    """
    answers = []
    seen_answers = set()
    for url in get_urls(question):
        context = get_context(url)
        if not context:
            continue
        ans = qa_pipeline({"context": context, "question": question})
        # A repeated answer ends the search; the repeat itself is not added
        # to the candidates.
        if ans['answer'] in seen_answers:
            break
        seen_answers.add(ans['answer'])
        answers.append(ans)
    # With no candidates there is nothing to rank.
    if not answers:
        return ""
    return determine_best_answer(answers)


def determine_best_answer(answers: List[dict]) -> str:
    """Return the answer text of the candidate with the largest score.

    Args:
        answers: Candidate results, each a mapping with a ``'score'`` and an
            ``'answer'`` entry.

    Returns:
        The ``'answer'`` value of the highest-scoring candidate (the first
        one on ties), or an empty string when ``answers`` is empty.
    """
    best = max(answers, key=lambda candidate: candidate['score'], default=None)
    return best['answer'] if best is not None else ""


In [None]:
# Example question (Polish): "What does Robert Makłowicz do?"
find_answer("Co robi Robert Makłowicz?")

In [None]:
# Example question (Polish): "What is the capital of Poland?"
find_answer("Co jest stolicą Polski?")