<a href="https://colab.research.google.com/github/TanJiaTing/AIP/blob/master/19Julycdqa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Closed-Domain Question Answering using BERT**


In [2]:
# References: https://github.com/cdqa-suite/cdQA
!pip install cdqa

Collecting cdqa
[?25l  Downloading https://files.pythonhosted.org/packages/39/f5/af831b7ee653aa6bace99e39ec6b2754b1adb10bb60a1296f5e16f1f24ee/cdqa-1.3.9.tar.gz (45kB)
[K     |████████████████████████████████| 51kB 2.4MB/s 
[?25hCollecting Flask==1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/9b/93/628509b8d5dc749656a9641f4caf13540e2cdec85276964ff8f43bbb1d3b/Flask-1.1.1-py2.py3-none-any.whl (94kB)
[K     |████████████████████████████████| 102kB 7.3MB/s 
[?25hCollecting flask_cors==3.0.8
  Downloading https://files.pythonhosted.org/packages/78/38/e68b11daa5d613e3a91e4bf3da76c94ac9ee0d9cd515af9c1ab80d36f709/Flask_Cors-3.0.8-py2.py3-none-any.whl
Collecting joblib==0.13.2
[?25l  Downloading https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl (278kB)
[K     |████████████████████████████████| 286kB 49.2MB/s 
[?25hCollecting pandas==0.25.0
[?25l  Downloading https://files.pytho



In [1]:
import os
import pandas as pd
from ast import literal_eval
import tensorflow as tf
from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model
import torch
import joblib
from cdqa.reader import BertProcessor, BertQA
from cdqa.utils.download import download_squad



### Download pre-trained reader model and PDF files

In [2]:
# Download model
# Fetches the pre-trained 'bert-squad_1.1' reader with cdQA's download helper,
# targeting ./models. A later cell loads ./models/bert_qa.joblib, which is
# presumably the file this call produces — TODO confirm.
download_model(model='bert-squad_1.1', dir='./models')



Downloading trained model...


In [3]:
import json
import os
import re
import sys
from tqdm import tqdm
from tika import parser
import pandas as pd
import uuid
import markdown
from pathlib import Path
from html.parser import HTMLParser
def pdf_converter(directory_path, min_length=200, include_line_breaks=False):
    """
    Function to convert PDFs to Dataframe with columns as title & paragraphs.

    Parameters
    ----------
    directory_path : str
        Directory scanned (non-recursively) for files whose name ends with
        ".pdf", compared case-insensitively.
    min_length : integer
        Not used by this implementation; kept so that existing calls which
        pass it keep working.
    include_line_breaks: bool
        Not used by this implementation; kept for the same reason.

    Returns
    -------------
    df : Dataframe
        One row per PDF, in sorted file-name order. "title" is the file name
        without its extension; "paragraphs" is a list of strings, or None
        when processing the file raised an exception.

    Description
    -----------------
    The text extracted by tika is split into chunks at blank lines that are
    followed by an upper-case letter, a digit, "-" or a U+2028 line separator.
    A chunk whose last character is "." is treated as a finished paragraph.
    Every other chunk is treated as a line and buffered; buffered lines are
    joined with single spaces and emitted as one paragraph right before the
    next finished paragraph, at a whitespace-only chunk, or at the end of the
    document.
    A file that raises during processing is reported on stdout and keeps
    paragraphs=None; a file with no extracted text gets an empty list.
    """
    # Sorted so row order does not depend on the order os.listdir happens to
    # return; the suffix test includes the dot so names like "notapdf" are skipped.
    pdf_names = sorted(
        name for name in os.listdir(directory_path)
        if name.lower().endswith(".pdf")
    )

    titles = []
    paragraph_lists = []
    for pdf in pdf_names:
        # splitext drops only the trailing extension, whatever its case.
        titles.append(os.path.splitext(pdf)[0])
        try:
            raw = parser.from_file(os.path.join(directory_path, pdf))
            # "content" may be missing or None (presumably when nothing could
            # be extracted — confirm against tika); treat that as empty text.
            text = (raw.get("content") or "").strip()
            paragraph_lists.append(_group_paragraphs(text))
        except Exception:
            # Best effort: report and keep going with the remaining files.
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            print("Unexpected error:", sys.exc_info()[0])
            print("Unable to process file {}".format(pdf))
            paragraph_lists.append(None)

    # Build the frame once; dtype=object keeps each list (or None) as a
    # single cell value.
    return pd.DataFrame(
        {
            "title": pd.Series(titles, dtype=object),
            "paragraphs": pd.Series(paragraph_lists, dtype=object),
        }
    )


def _group_paragraphs(text):
    """Split extracted text into paragraphs, merging loose lines together."""
    if not text:
        return []

    grouped = []
    pending = []  # lines waiting to be merged into one paragraph
    for chunk in re.split("\n\n(?=\u2028|[A-Z-0-9])", text):
        is_blank = not chunk.strip()
        # NOTE(review): the check looks at the raw last character, so a chunk
        # ending in "." followed by whitespace counts as a line, not a paragraph.
        is_finished = chunk.endswith(".")

        if is_blank or is_finished:
            # Emit the buffered lines exactly once, then clear the buffer so
            # they cannot be emitted again later.
            if pending:
                grouped.append(" ".join(pending))
                pending = []
            if is_finished:
                grouped.append(chunk.replace("\n", " "))
        else:
            pending.append(chunk.replace("\n", " ").strip())

    # Lines after the last finished paragraph would otherwise be dropped.
    if pending:
        grouped.append(" ".join(pending))
    return grouped

In [4]:
# Download AIP pdf
def download_pdf(url='https://www.caas.gov.sg/docs/default-source/pdf/aip-singapore---21-may-20.pdf',
                 directory='./data/pdf/'):
    """
    Download a PDF into a local directory, skipping the download if the file
    is already there.

    Parameters
    ----------
    url : str
        Address of the PDF. Defaults to the AIP Singapore document used by
        this notebook.
    directory : str
        Destination directory; created if it does not exist.

    Returns
    -------
    None

    Description
    -----------
    Re-running the cell used to fetch the file again; a second copy in the
    directory would then be converted as a duplicate document. The function
    now returns early when a file named after the last path segment of the
    URL already exists in `directory`.
    """
    import os
    import posixpath
    from urllib.parse import urlparse

    os.makedirs(directory, exist_ok=True)

    # Assumes wget stores the file under the URL's last path segment; a
    # server-supplied file name would defeat this check — TODO confirm.
    filename = posixpath.basename(urlparse(url).path)
    if filename and os.path.exists(os.path.join(directory, filename)):
        print('\nPDF file already present, skipping download: {}'.format(filename))
        return

    # Imported only when a download is actually needed.
    import wget
    print('\nDownloading PDF file...')
    wget.download(url=url, out=directory)

# Fetch the AIP PDF into ./data/pdf/ so the conversion cell has input to read.
download_pdf()


Downloading PDF file...


### Convert the PDF files into a DataFrame for cdQA pipeline

In [5]:
# Convert every PDF under ./data/pdf/ into a DataFrame with "title" and
# "paragraphs" columns. When cells run top to bottom this calls the
# pdf_converter defined in this notebook, which shadows the one imported
# from cdqa.utils.converters.
df = pdf_converter(directory_path='./data/pdf/')
# Bare last expression so the notebook renders the frame.
df

2020-07-20 05:16:54,090 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar to /tmp/tika-server.jar.
2020-07-20 05:16:55,002 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar.md5 to /tmp/tika-server.jar.md5.
2020-07-20 05:16:55,403 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


Unnamed: 0,title,paragraphs
0,aip-singapore---21-may-20,[AMDTeAIPContact Post: AERONAUTICAL INFORMATIO...


### Instantiate the cdQA pipeline from a pre-trained reader model

In [7]:
# Build the QA pipeline around the reader stored at ./models/bert_qa.joblib.
# max_df=1.0 ends up on the retriever (the recorded repr shows
# BM25Retriever(..., max_df=1.0, ...)).
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
# (last expression of the cell, so the pipeline repr is displayed)
cdqa_pipeline.fit_retriever(df=df)

QAPipeline(reader=BertQA(adam_epsilon=1e-08, bert_model='bert-base-uncased',
                         do_lower_case=True, fp16=False,
                         gradient_accumulation_steps=1, learning_rate=5e-05,
                         local_rank=-1, loss_scale=0, max_answer_length=30,
                         n_best_size=20, no_cuda=False,
                         null_score_diff_threshold=0.0, num_train_epochs=3.0,
                         output_dir=None, predict_batch_size=8, seed=42,
                         server_ip='', server_po..._size=8,
                         verbose_logging=False, version_2_with_negative=False,
                         warmup_proportion=0.1, warmup_steps=0),
           retrieve_by_doc=False,
           retriever=BM25Retriever(b=0.75, floor=None, k1=2.0, lowercase=True,
                                   max_df=1.0, min_df=2, ngram_range=(1, 2),
                                   preprocessor=None, stop_words='english',
                                   t

In [8]:
# Fine-tune the reader on a SQuAD-like JSON corpus.
# NOTE(review): this is an absolute Colab path, unlike the relative ./data and
# ./models paths used elsewhere, and no visible cell creates the file — it
# presumably has to be uploaded before running this cell; confirm.
cdqa_pipeline.fit_reader('/content/data/trainqa.json')

HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=2, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='Iteration', max=2, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='Iteration', max=2, style=ProgressStyle(description_width='ini…




QAPipeline(reader=BertQA(adam_epsilon=1e-08, bert_model='bert-base-uncased',
                         do_lower_case=True, fp16=False,
                         gradient_accumulation_steps=1, learning_rate=5e-05,
                         local_rank=-1, loss_scale=0, max_answer_length=30,
                         n_best_size=20, no_cuda=False,
                         null_score_diff_threshold=0.0, num_train_epochs=3.0,
                         output_dir=None, predict_batch_size=8, seed=42,
                         server_ip='', server_po..._size=8,
                         verbose_logging=False, version_2_with_negative=False,
                         warmup_proportion=0.1, warmup_steps=0),
           retrieve_by_doc=False,
           retriever=BM25Retriever(b=0.75, floor=None, k1=2.0, lowercase=True,
                                   max_df=1.0, min_df=2, ngram_range=(1, 2),
                                   preprocessor=None, stop_words='english',
                                   t

In [9]:
# Save the reader after fine-tuning. The relative name puts saved.joblib in
# the working directory; the query cell reloads it from ./saved.joblib.
cdqa_pipeline.dump_reader('saved.joblib')

### Execute a query

In [16]:
# Reload the reader that was saved after fine-tuning and fit the retriever on
# the same document frame before asking questions.
new_pipeline = QAPipeline(max_df=1.0, reader='./saved.joblib')
new_pipeline.fit_retriever(df=df)

# Questions put to the pipeline, one prediction each.
queries = [
    "Which flights can operate to Changi Airport without obtaining slots?",
    "What kind of aircraft is permitted to remain on the ground or layover at Changi Airport?",
    "Which agent should be engaged by business aviation flights at Changi Airport?",
    "What are the penalties for contravening the legislation dealing with non-scheduled flights?",
    "How should business aviation aircraft park in Changi Airport?",
    "What is the normal permit fee for 2 one-way flights?",
    "When is the application deadline for a normal permit?",
    "Whose duty is it to open and repack baggage during customs checks?",
    "What are the different types of dutiable goods?",
    "What are the conditions for duty-free liquor concession?",
    "Which nationalities require visas for the purpose of social visits in Singapore?",
    "What documents are required for unvaccinated individuals who wish to enter Singapore?",
    "How long is an International Certificate of Vaccination valid for?",
    "Which travellers are required to obtain a valid International Certificate of Vaccination?",
    "Which aircrafts does paragraph 50D of the Air Navigation Order apply to?",
    "What documents are required to carry dangerous goods in an aircraft?",
]

# Element 0 of each prediction is reported as the answer and element 2 as the
# paragraph; a blank line separates consecutive results.
for query in queries:
    prediction = new_pipeline.predict(query)
    print(
        'query: {}'.format(query),
        'answer: {}'.format(prediction[0]),
        'paragraph: {}\n'.format(prediction[2]),
        sep='\n',
    )

query: Which flights can operate to Changi Airport without obtaining slots?
answer: non-scheduled, commercial and non-commercial flights
paragraph: 2.2 To apply for slots for access to Singapore Changi Airport, all operators or agents of non-scheduled, commercial and non-commercial flights shall submit applications for slots via either a Slot Clearance Request (SCR) to the Changi Slot Coordinator, or for operators without a 2-letter IATA airline code, a General (Aviation) Clearance Request (GCR) through the Online Coordination System (OCS) (at www.online-coordination.com). Changi Slot Coordinator c/o Changi Airport Group (Singapore) Pte Ltd Singapore Changi Airport P.O. Box 168 Singapore 918146 Email: csc@changiairport.com Tel: +65 6541 2378 or +65 6541 3064

query: What kind of aircraft is permitted to remain on the ground or layover at Changi Airport?
answer: business aviation aircraft operating as executive jet charter
paragraph: 5.1.3.5 All business aviation aircraft operating as e

In [12]:
# Evaluate model
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top.
from cdqa.utils.evaluation import evaluate_reader

# Scores new_pipeline (created in the query cell) on the test QA file; the
# recorded result is a dict with 'exact_match' and 'f1'.
# NOTE(review): this cell's recorded execution count (12) is lower than the
# query cell's (16), so confirm the notebook still runs top to bottom on a
# fresh kernel. The recorded output also warns "Evaluation expects v-1.1, but
# got dataset with v-v2.0" — check the version field of the test file.
evaluate_reader(new_pipeline, '/content/data/testqa.json')

Evaluation expects v-1.1, but got dataset with v-v2.0


{'exact_match': 37.5, 'f1': 57.76871221142014}

In [None]:
#Check gpu details
#from tensorflow.python.client import device_lib
#device_lib.list_local_devices()
#df['paragraphs'][0]
# import os
# import torch
# !pip install transformers
#!pip install pytorch-transformers
# from transformers import BertTokenizer
#!pip install git+https://github.com/huggingface/transformers.git

# import torch
# from pytorch_transformers import *
# from transformers.modeling_tf_bert import TFBertForSequenceClassification
# output_dir = './model_save'
# model1 = model_class.from_pretrained(output_dir)
# tokenizer1 = tokenizer_class.from_pretrained(output_dir)
# model1

NameError: ignored

In [None]:
# Exploring data: display the list of paragraphs extracted for the document
# at row label 0 of the converted frame.
df['paragraphs'][0]


['AMDTeAIPContact Post: AERONAUTICAL INFORMATION SERVICES Civil Aviation Authority of Singapore, Singapore Changi Airport, P. O. Box 1 Singapore 918141 Tel: (65) 64227036 Fax: (65) 64410221 Email: caas_singaporeais@caas.gov.sg 03/2020 Effective date 21 MAY 2020 Publication date 21 MAY 2020  wp-AMDT-2020-03',
 '1. Significant information and changes 1.1 Singapore Changi Airport  a. Safegate Aircraft Docking Guidance system (ADGS) - Safedock Type 2 removed.',
 '2. This amendment incorporates information contained in the listed NOTAMs and AIP Supplements which are hereby superseded: N I L Amended Pages',
 'GEN 0.2-1/2: : replace. GEN 0.3-1/2: : replace. GEN 0.3-3/4: : replace. GEN 0.3-5: : replace. GEN 0.4-1/2: : replace. GEN 0.4-3: : replace. GEN 1.2-3/4: : replace. GEN 3.2-3/4: : replace. ENR 5.6-1/2: : replace. AD 0.6-1/2: : replace. AD 2.WSSS-3/4: : replace. AD 2.WSSS-5/6: : replace. AD 2.WSSS-7/8: : replace. AD 2.WSSS-37/38: : replace. AD-2-WSSS-ADC-2: : replace. AD 2.WSSL-5/6: : rep