In [2]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [9]:
from google.colab import drive
import os

drive._mount("/content/drive", force_remount=True)

os.chdir('/content/drive/MyDrive/QAsubsystem')
!pwd

Mounted at /content/drive
/content/drive/MyDrive/QAsubsystem


In [4]:
#!pip3 install virtualenv

#!virtualenv QAenv

In [2]:
!pip install --upgrade pip; pip install git+https://github.com/deepset-ai/haystack.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/deepset-ai/haystack.git
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-req-build-eown62qg
  Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-req-build-eown62qg
  Resolved https://github.com/deepset-ai/haystack.git to commit a26c0429941b2c528406bac130483937f0dba50d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m

In [3]:
# haystack class module

from pathlib import Path
from typing import List
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import ElasticsearchDocumentStore, InMemoryDocumentStore
from haystack.nodes import FARMReader, BM25Retriever
from haystack.pipelines import ExtractiveQAPipeline

class QuestionAnsweringSystem():

  def __init__(self, data_dir, train_filename):
      
      self.data_dir = data_dir
      self.train_filename = train_filename

  def convert_to_haystack_format(self) -> List:
    
    # convert txt files to dicts
    all_docs = []
    converter = TextConverter(valid_languages=["el"])
    for file in Path(self.data_dir).iterdir():
        all_docs.append(converter.convert(file_path=file, meta=None)[0])
    # clean and split
    preprocessor = PreProcessor(
        language='el',
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=False,
        split_by="word",
        split_length=200,
        split_respect_sentence_boundary=True,
    )
    docs = preprocessor.process(all_docs)
    return docs
  
  def create_document_store(self, docs, similarity_metric='cosine_similarity'):
    document_store = InMemoryDocumentStore(similarity_metric)
    try:
      document_store.write_documents(docs)
      return document_store
    except Exception as e:
      print(e)

  def fine_tune_reader_model (self, reader_model_path, n_epochs, model="deepset/xlm-roberta-large-squad2", use_gpu=True):

      reader = FARMReader(model_name_or_path=model, use_gpu=use_gpu)
      try:

        reader.train(
            data_dir = self.data_dir,
            train_filename=self.train_filename,
            use_gpu=use_gpu,
            n_epochs=n_epochs,
            save_dir = reader_model_path)
        print ('fine-tuning done')
      except Exception as e:
        print (e)

  def get_retriever(self, document_store):
    return BM25Retriever(document_store)
  def get_reader(self, reader_model_path):
    return FARMReader(model_name_or_path=reader_model_path)
  def get_pipeline (self, reader, retriever):
    return ExtractiveQAPipeline(reader, retriever)
  def get_answers (seld, query, pipeline, top_k_retriever=10, top_k_reader = 3):
    predictions = pipeline.run(
      query=query, params={"Retriever": {"top_k": top_k_retriever}, "Reader": {"top_k": top_k_reader}}
      )
    return [{'answer': result['answer'], 'context': result['context'], 'startLoc': result['offset_start_in_doc'], 'endLoc': result['offset_end_in_doc'], 'docText':  self.get_haystack_doc_text_by_id(document_store, result['document_id']), 'probability': result['probability']} for
                result in predictions['answers']]



In [None]:
# main module
import os

#if name == _main_:
DATA_DIR = 'data/first_model_data'
TRAIN_FILE = 'answers.json'
MODEL_PATH = './models'

if os.path.isdir(MODEL_PATH) == False:
    os.mkdir(MODEL_PATH)

# initialize class
qa_system = QuestionAnsweringSystem(DATA_DIR, TRAIN_FILE)

# get documents
docs =  qa_system.convert_to_haystack_format()
print ("pre-processing done")

# document store
ds = qa_system.create_document_store(docs)
print ('created document store')

# fine-tune model on data
qa_system.fine_tune_reader_model (MODEL_PATH,n_epochs=1)
print ('fine-tuning done')







100%|██████████| 17/17 [00:00<00:00, 269.30docs/s]
INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '990b8e07a4738a0a23c3ed50e967f99f' already exists in index 'cosine_similarity'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'ec49a131d59306ca7d0c7bf19cc22477' already exists in index 'cosine_similarity'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'c54aaee15abcc34f1adf035dcafcccca' already exists in index 'cosine_similarity'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '4307f9bb7e4d905eb712a8c277c606bf' already exists in index 'cosine_similarity'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '263c4f3445a66fe256ace27bf44b1e5' already exists in index 'cosine_similarity'
INFO - haystack.document_stores.base -  Duplicate Documents:

In [None]:
# retriever & reader
retriever = qa_system.get_retriever(ds)
reader = qa_system.get_reader(MODEL_PATH)

# pipeline 
pipe = qa_system.get_pipeline(reader, retriever)

# get answers dictionary

answers  = qa_system.get_answers("Μπορώ να κάνω δεύτερη δόση με διαφορετικό εμβόλιο;",pipe)
print(answers)
