<a href="https://colab.research.google.com/github/TJT96/AIP/blob/master/cdQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Closed-Domain Question Answering using BERT**


In [None]:
# References: https://github.com/cdqa-suite/cdQA
!pip install cdqa
import os
import pandas as pd
from ast import literal_eval
import tensorflow as tf
from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

### Download pre-trained reader model and PDF files

In [None]:
# Download the pre-trained reader model 'bert-squad_1.1' into ./models.
# The recorded output below names the downloaded file bert_qa.joblib, which is the
# path the later QAPipeline(...) cells load.
download_model(model='bert-squad_1.1', dir='./models')


Downloading trained model...
bert_qa.joblib already downloaded


In [None]:
# Download AIP pdf
def download_pdf(url='https://www.caas.gov.sg/docs/default-source/pdf/aip-singapore---21-may-20.pdf',
                 directory='./data/pdf/'):
    """Download a PDF into ``directory`` unless a copy is already there.

    The expected file name is the last path segment of ``url``. If a file with
    that name already exists in ``directory`` the download is skipped, which
    makes the cell safe to re-run: the recorded converter output in this
    notebook lists a second document titled 'aip-singapore---21-may-20 (1)',
    i.e. an extra copy of the same PDF left behind by a repeated download.

    Args:
        url: Address of the PDF to fetch. Defaults to the AIP Singapore PDF
            used throughout this notebook.
        directory: Folder that receives the file; created when missing.

    Returns:
        None.
    """
    from urllib.parse import urlsplit

    # exist_ok=True replaces the manual "check, then create" sequence.
    os.makedirs(directory, exist_ok=True)

    # Use only the path component so a query string or fragment never ends up
    # in the file name that is checked.
    filename = os.path.basename(urlsplit(url).path)
    if filename and os.path.exists(os.path.join(directory, filename)):
        print('\nPDF file already downloaded')
        return

    # Imported only when a download is really needed, so a cached run does not
    # depend on the wget package being importable.
    import wget

    print('\nDownloading PDF file...')
    wget.download(url=url, out=directory)

# Fetch the AIP PDF into ./data/pdf/, the folder the converter cell below reads from.
download_pdf()


Downloading PDF file...


### Convert the PDF files into a DataFrame for cdQA pipeline

In [None]:
# Convert the PDFs found in ./data/pdf/ into a DataFrame; the recorded output shows
# one row per file with 'title' and 'paragraphs' columns.
df = pdf_converter(directory_path='./data/pdf/')
# NOTE(review): the recorded output lists the same document twice (the second title ends
# in ' (1)'), which suggests a duplicate copy of the PDF was in the folder — confirm the
# folder holds a single copy before fitting the retriever.
df.head()

Unnamed: 0,title,paragraphs
0,aip-singapore---21-may-20,"[AMDTeAIPContactPost:, AERONAUTICALINFORMATION..."
1,aip-singapore---21-may-20 (1),"[AMDTeAIPContactPost:, AERONAUTICALINFORMATION..."


### Instantiate the cdQA pipeline from a pre-trained reader model

In [None]:
# Build the pipeline from the downloaded reader file. The recorded repr below shows
# max_df=1.0 ending up on the BM25Retriever.
# NOTE(review): the fine-tuning cell further down assigns a new QAPipeline to
# cdqa_pipeline, so this instance (and the retriever fitted here) is replaced there.
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit the retriever on the DataFrame produced by the PDF converter
cdqa_pipeline.fit_retriever(df=df)

QAPipeline(reader=BertQA(adam_epsilon=1e-08, bert_model='bert-base-uncased',
                         do_lower_case=True, fp16=False,
                         gradient_accumulation_steps=1, learning_rate=5e-05,
                         local_rank=-1, loss_scale=0, max_answer_length=30,
                         n_best_size=20, no_cuda=False,
                         null_score_diff_threshold=0.0, num_train_epochs=3.0,
                         output_dir=None, predict_batch_size=8, seed=42,
                         server_ip='', server_po..._size=8,
                         verbose_logging=False, version_2_with_negative=False,
                         warmup_proportion=0.1, warmup_steps=0),
           retrieve_by_doc=False,
           retriever=BM25Retriever(b=0.75, floor=None, k1=2.0, lowercase=True,
                                   max_df=1.0, min_df=2, ngram_range=(1, 2),
                                   preprocessor=None, stop_words='english',
                                   t

In [None]:
# Fine-tuning based on SQUAD-like corpus
# The pipeline is re-created from the reader file before fine-tuning. max_df=1.0 is
# passed again so the retriever keeps the setting chosen when the pipeline was first
# instantiated; without it the recorded repr shows the retriever at max_df=0.85.
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0) # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
cdqa_pipeline.fit_reader('/content/data/trainqa.json')

HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=2, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='Iteration', max=2, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='Iteration', max=2, style=ProgressStyle(description_width='ini…




QAPipeline(reader=BertQA(adam_epsilon=1e-08, bert_model='bert-base-uncased',
                         do_lower_case=True, fp16=False,
                         gradient_accumulation_steps=1, learning_rate=5e-05,
                         local_rank=-1, loss_scale=0, max_answer_length=30,
                         n_best_size=20, no_cuda=False,
                         null_score_diff_threshold=0.0, num_train_epochs=3.0,
                         output_dir=None, predict_batch_size=8, seed=42,
                         server_ip='', server_po...size=8,
                         verbose_logging=False, version_2_with_negative=False,
                         warmup_proportion=0.1, warmup_steps=0),
           retrieve_by_doc=False,
           retriever=BM25Retriever(b=0.75, floor=None, k1=2.0, lowercase=True,
                                   max_df=0.85, min_df=2, ngram_range=(1, 2),
                                   preprocessor=None, stop_words='english',
                                   t

In [None]:
# Save the fine-tuned reader to 'saved.joblib' (a path relative to the working directory)
cdqa_pipeline.dump_reader('saved.joblib')

 ### Execute a query

In [None]:
# cdqa_pipeline was re-created in the fine-tuning cell, so the retriever is fitted on
# df again before the pipeline is queried.
cdqa_pipeline.fit_retriever(df=df)
query = 'Which agent should be engaged by business aviation flights at Changi Airport?'
# The next cell reads prediction[0] as the answer, [1] as the document title and
# [2] as the paragraph the answer was found in.
prediction = cdqa_pipeline.predict(query)

### Explore predictions

In [None]:
# Echo the question, then walk the first three prediction fields in order:
# answer, source document title, and the paragraph the answer came from.
print('query: {}'.format(query))
for template, position in (('answer: {}', 0), ('title: {}', 1), ('paragraph: {}', 2)):
    print(template.format(prediction[position]))

query: Which agent should be engaged by business aviation flights at Changi Airport?
answer: aground handling agent
title: aip-singapore---21-may-20
paragraph: 5.1.3.9 All business aviation flights must engage aground handling agent at Singapore Changi Airport.


In [None]:
# Evaluate the pipeline on the test question/answer file. The recorded run returned a
# dict with 'exact_match' and 'f1' scores.
# NOTE(review): the recorded run also printed "Evaluation expects v-1.1, but got dataset
# with v-v2.0" — confirm the version field in testqa.json is what the evaluation expects.
from cdqa.utils.evaluation import evaluate_reader

evaluate_reader(cdqa_pipeline, '/content/data/testqa.json')

Evaluation expects v-1.1, but got dataset with v-v2.0


{'exact_match': 37.5, 'f1': 57.17347411618204}

In [None]:
# Check GPU details (uncomment the two lines below to list the devices visible to TensorFlow)
#from tensorflow.python.client import device_lib
#device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 16913744362609144849, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 14837562409059974218
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 16725817648116870895
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 11150664704
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 18372024897206872982
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"]