In [1]:
import os
import sys

In [2]:
# Silence every Python warning for the rest of the session
# (presumably to keep the demo output readable; note that this also hides deprecation notices).
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Root of the local DocQA checkout. Set the DOCQA_PATH environment variable to
# point at your own copy; the original hard-coded location is kept as the fallback.
docqa_path = os.environ.get('DOCQA_PATH', 'C:\\Users\\Вова\\PycharmProjects\\DocQA')

# Make the `docQA` package importable. The guard keeps a re-run of this cell
# from appending the same entry to sys.path a second time.
if docqa_path not in sys.path:
    sys.path.append(docqa_path)

# Switch the working directory so that relative paths used later in the
# notebook (e.g. 'docs/152.txt') are looked up under the checkout.
os.chdir(docqa_path)

## Description

DocQA lets you fit the Retriever, Ranker and CatBoost pipelines, as well as a general Pipeline.

### Train Data

The training dataset has the following fields:
* translated_question 
* native_question
* translated_context
* native_context (it must be present, but it is not used in the fitting process)

Other fields can be generated during fitting and are then used when fitting the CatBoost pipeline.

Training data is added via `Storage`.

In [4]:
from docQA.nodes.storage import Storage

# Build a storage on top of the text of Russia's federal law 152,
# then register a dataset in it under the name 'test_dataset'.
law_documents = ['docs/152.txt']
storage = Storage(storage_name='base_storage', docs_links=law_documents)
storage.add_dataset('docs/test_dataset.csv', 'test_dataset')

In [5]:
# Dataset names must be unique: registering 'test_dataset' a second time raises
# an AssertionError. Catch it and print the message so that this demonstration
# does not abort a "Restart Kernel -> Run All".
try:
    storage.add_dataset('docs/test_dataset.csv', 'test_dataset')
except AssertionError as err:
    print(f'AssertionError: {err}')

AssertionError: Dataset name already exists

In [6]:
# Remove the dataset registered as 'test_dataset', which frees the name for reuse.
storage.del_dataset('test_dataset')

In [7]:
# With the old entry deleted, the same name can be registered again.
storage.add_dataset('docs/test_dataset.csv', 'test_dataset')

In [8]:
# Register the same CSV under a second name and mark it as a benchmark dataset.
storage.add_dataset('docs/test_dataset.csv', 'benchmark_dataset', is_benchmark=True)

### General Pipeline fitting

In [9]:
from docQA.pipelines import (
    CatboostPipeline,
    Pipeline,
    RankerPipeline,
    RetrieverPipeline,
    TranslatorPipeline,
)

# General pipeline built on the storage; its nodes are attached in the following cell.
pipe = Pipeline(storage)

In [10]:
# The translator node is the only one configured with extra options.
pipe.add_node(
    TranslatorPipeline,
    name='translator',
    is_technical=True,
    demo_only=True,
    num_beams=15,
)

# The remaining nodes take only a name and are attached in this order.
for node_cls, node_name in (
    (RetrieverPipeline, 'retriever'),
    (RankerPipeline, 'ranker'),
    (CatboostPipeline, 'catboost'),
):
    pipe.add_node(node_cls, name=node_name)

In [11]:
# Fit the pipeline; the progress bars below report fine-tuning of the
# retriever, ranker and catboost nodes.
pipe.fit()

Fine tuning retriever:   0%|          | 0/20 [00:00<?, ?it/s]

Fine tuning ranker:   0%|          | 0/20 [00:00<?, ?it/s]

Fine tuning catboost:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
# Sample query in Russian: "What is the definition of personal data?"
text_input = 'Какое определение персональных данных?'

# Calling the pipeline returns candidate answers with a total score and
# per-node scores, as shown in the output below.
pipe(text_input)

[{'input': 'Какое определение персональных данных?',
  'output': {'answers': [{'answer': '1) персональные данные - любая информация, относящаяся к прямо или косвенно определенному или определяемому физическому лицу (субъекту персональных данных);',
     'total_score': 0.7079729821886714,
     'scores': {'retriever_cos_sim': 0.8394578099250793,
      'ranker_cos_sim': 0.9055840373039246,
      'catboost_proba': 0.3788770993370103}},
    {'answer': '3) предполагаемые пользователи персональных данных;',
     'total_score': 0.5114594604118877,
     'scores': {'retriever_cos_sim': 0.6962785720825195,
      'ranker_cos_sim': 0.7411357164382935,
      'catboost_proba': 0.09696409271485039}},
    {'answer': '6) предоставление персональных данных - действия, направленные на раскрытие персональных данных определенному лицу или определенному кругу лиц;',
     'total_score': 0.5099472115635625,
     'scores': {'retriever_cos_sim': 0.5751959085464478,
      'ranker_cos_sim': 0.7452040314674377,
   

In [13]:
# Clean up: drop both datasets that this notebook registered in the storage.
for dataset_name in ('test_dataset', 'benchmark_dataset'):
    storage.del_dataset(dataset_name)