In [11]:
# Dependencies for a fresh Colab runtime -- uncomment the lines that are needed.
# NOTE(review): `logging` ships with the Python standard library, so the first
# line should not be necessary -- confirm before re-enabling it.
# !pip install logging
# !pip install farm-haystack
# !pip install sentence-transformers
# !pip install seqeval

In [12]:
# Attach Google Drive at /content/drive; the data and model paths used by the
# later cells all live underneath this mount point.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import logging

# Notebook kernels (Colab included) usually attach a handler to the root logger
# before any user code runs, and logging.basicConfig() silently does nothing
# when the root logger already has handlers. force=True (Python 3.8+) removes
# the existing root handlers first, so the format and level requested here are
# the ones that actually get used.
logging.basicConfig(
    format="%(levelname)s - %(name)s -  %(message)s",
    level=logging.WARNING,
    force=True,
)

# Everything stays at WARNING except Haystack, whose INFO messages carry the
# data-loading and evaluation progress.
logging.getLogger("haystack").setLevel(logging.INFO)

# Here are some imports that we'll need

from haystack.nodes import DensePassageRetriever
#from haystack.utils import fetch_archive_from_http
from haystack.document_stores import InMemoryDocumentStore

In [14]:
# Training configuration: input files, starting checkpoints and output folder.

# Folder on the mounted Google Drive that holds the train/dev JSON files.
doc_dir = "/content/drive/MyDrive/AIR Project"

# File names are resolved relative to doc_dir. Note that the "dev" file is the
# converted GermanQuAD *test* split.
train_filename, dev_filename = (
    "GermanQuAD_train_converted.json",
    "GermanQuAD_test_converted.json",
)

# Pretrained DPR encoders used as the starting point for the query and passage towers.
query_model, passage_model = (
    "facebook/dpr-question_encoder-single-nq-base",
    "facebook/dpr-ctx_encoder-single-nq-base",
)

# The trained model is written to a sub-folder of doc_dir.
save_dir = f"{doc_dir}/GermanQuAD saved model"

In [15]:
# Build the retriever from the two pretrained encoders. The document store is a
# fresh, empty in-memory store created only to satisfy the constructor.
retriever = DensePassageRetriever(
    query_embedding_model=query_model,
    passage_embedding_model=passage_model,
    document_store=InMemoryDocumentStore(),
    max_seq_len_passage=256,
    max_seq_len_query=64,
)

# Train the retriever; the trained model is saved to save_dir.
retriever.train(
    # --- data -------------------------------------------------------------
    data_dir=doc_dir,
    train_filename=train_filename,
    dev_filename=dev_filename,
    # NOTE(review): the test set is the same file as the dev set, so the
    # reported test metrics are not from separate held-out data.
    test_filename=dev_filename,
    save_dir=save_dir,
    # --- schedule ---------------------------------------------------------
    n_epochs=1,
    batch_size=4,
    grad_acc_steps=8,
    # NOTE(review): the saved GermanQuAD output reports 2880 batches for the
    # run, which is below this value -- presumably no intermediate dev
    # evaluation fired; confirm.
    evaluate_every=3000,
    # --- sample construction ----------------------------------------------
    embed_title=True,
    num_positives=1,
    num_hard_negatives=1,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.data_handler.data_silo:
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|
 (o)(o)------'\ _ /     ( )
 
INFO:haystack.modeling.data_handler.data_silo:LOADING TRAIN DATA
INFO:haystack.modeling.data_handler.data_silo:Loading train set from: /content/drive/MyDrive/AIR Project/GermanQuAD_train_converted.json 
Preprocessing dataset: 100%|██████████| 23/23 [00:25<00:00,  1.09s/ Dicts]
INFO:haystack.modeling.data_handler.data_silo:
INFO:haystack.modeling.data_handler.data_silo:LOADING DEV DATA
INFO:haystack.modeling.data_handler.data_silo:Loading dev set from: /content/drive/MyDrive/A

**Output GermanQuAD:**

\\|//       \\|//      \\|//       \\|//     \\|//
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
***************************************************
***** EVALUATION | TEST SET | AFTER 2880 BATCHES *****
***************************************************
\\|//       \\|//      \\|//       \\|//     \\|//
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

INFO:haystack.modeling.evaluation.eval:
 _________ text_similarity _________
INFO:haystack.modeling.evaluation.eval:loss: 0.21550037061268748
INFO:haystack.modeling.evaluation.eval:task_name: text_similarity
INFO:haystack.modeling.evaluation.eval:acc: 0.9765199637023594
INFO:haystack.modeling.evaluation.eval:f1: 0.9060798548094374
INFO:haystack.modeling.evaluation.eval:acc_and_f1: 0.9412999092558985
INFO:haystack.modeling.evaluation.eval:average_rank: 0.13520871143375682
INFO:haystack.modeling.evaluation.eval:report:
                precision    recall  f1-score   support

hard_negative     0.9866    0.9866    0.9866     15428
     positive     0.9061    0.9061    0.9061      2204

     accuracy                         0.9765     17632
    macro avg     0.9463    0.9463    0.9463     17632
 weighted avg     0.9765    0.9765    0.9765     17632

INFO:haystack.modeling.model.biadaptive_model:prediction_head saving

**Output GermanDPR**

\\|//       \\|//      \\|//       \\|//     \\|//
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
***************************************************
***** EVALUATION | TEST SET | AFTER 2319 BATCHES *****
***************************************************
\\|//       \\|//      \\|//       \\|//     \\|//
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

INFO:haystack.modeling.evaluation.eval:
 _________ text_similarity _________
INFO:haystack.modeling.evaluation.eval:loss: 0.3636443769713743
INFO:haystack.modeling.evaluation.eval:task_name: text_similarity
INFO:haystack.modeling.evaluation.eval:acc: 0.9433731999023676
INFO:haystack.modeling.evaluation.eval:f1: 0.7736585365853659
INFO:haystack.modeling.evaluation.eval:acc_and_f1: 0.8585158682438667
INFO:haystack.modeling.evaluation.eval:average_rank: 0.36
INFO:haystack.modeling.evaluation.eval:report:
                precision    recall  f1-score   support

hard_negative     0.9676    0.9676    0.9676      7169
     positive     0.7737    0.7737    0.7737      1025

     accuracy                         0.9434      8194
    macro avg     0.8706    0.8706    0.8706      8194
 weighted avg     0.9434    0.9434    0.9434      8194

INFO:haystack.modeling.model.biadaptive_model:prediction_head saving