## Regulatory Information Retrieval and Answer Generation

This notebook runs the original approach from "A Hybrid Approach To Information Retrieval And Answer Generation For Regulatory Texts", with minimal adjustments so that it runs on the Irish S.I. dataset.

In [29]:
# Copy trec_eval repo to validate our approach - ONLY RUN ONCE
#!git clone https://github.com/usnistgov/trec_eval.git && cd trec_eval && make

In [30]:
import os
import json
import numpy as np
import pandas as pd
from typing import Dict
from tqdm import tqdm
from re import compile
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from contractions import fix as fix_contractions
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from functools import partial

from sklearn.feature_extraction import text as sk_text

## Dataset preparation

Kept as in the original notebook from "A Hybrid Approach To Information Retrieval And Answer Generation For Regulatory Texts".

In [31]:
def load_qrels(docs_dir: str, fqrels: str) -> Dict[str, Dict[str, int]]:
    """Build relevance judgments from a question/answer JSON file.

    Args:
        docs_dir: Not used by this function.
        fqrels: Path to a UTF-8 JSON file holding a list of entries, each with
            a "QuestionID" and a "Passages" list whose items carry
            "DocumentID" and "PassageID".

    Returns:
        A mapping ``question id -> {"<DocumentID>-<PassageID>": 1}``.
        A question whose "Passages" list is empty gets no entry.
    """
    with open(fqrels, encoding='utf-8') as handle:
        entries = json.load(handle)

    judgments = {}

    for entry in entries:
        question_id = entry["QuestionID"]
        for passage in entry["Passages"]:
            passage_key = f"{passage['DocumentID']}-{passage['PassageID']}"
            # Every listed passage counts as relevant (grade 1) for its question.
            judgments.setdefault(question_id, {})[passage_key] = 1

    return judgments

file_type = 'test'
qrels = load_qrels("", "./QnA_complete_fixed.json")

# Create the output folder first: open(..., "w") does not create missing
# directories, so on a fresh checkout this cell would otherwise fail.
os.makedirs("./data", exist_ok=True)

# One judgment per line: "<question id> Q0 <passage id> <relevance>".
with open("./data/qrels", "w") as f:
    for qid, rels in qrels.items():
        for pid, rel in rels.items():
            line = f"{qid} Q0 {pid} {rel}"
            f.write(line + "\n")

# Read the full corpus: a list of documents, each carrying a 'Passages' list.
with open('../../all_data.json', 'r', encoding='utf-8') as f:
    all_data = json.load(f)

collection = []
seen = set()

for doc in all_data:
    for psg in doc['Passages']:
        psg_id = f"{psg['DocumentID']}-{psg['PassageID']}"
        if psg_id in seen:
            # This passage id has already been added to the collection.
            continue

        # The passage id is prepended to the passage body to form the indexed text.
        passage_text = psg['PassageID'] + " " + psg['Passage']
        if len(passage_text) <= 100:
            # Only texts longer than 100 characters are kept.
            continue

        collection.append({
            'text': passage_text,
            'ID': psg_id,
            'DocumentId': psg['DocumentID'],
            'PassageId': psg['PassageID'],
        })
        seen.add(psg_id)

print(f"Loaded {len(collection)} passages into collection")

Loaded 32810 passages into collection


In [32]:
# Stop-word list: scikit-learn's English list merged with NLTK's English list.
stop_words = sk_text.ENGLISH_STOP_WORDS.union(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Characters that simple_cleaning/clean_text turn into a single space.
pattern_newline = compile(r'[\n\t\u200e]')
# Runs of spaces, collapsed to one.
pattern_multiple_spaces = compile(r' +')
# Anything that is not a lowercase ASCII letter or a digit.
pattern_non_alphanumeric = compile(r'[^a-z0-9]')

def clean_text(text: str) -> str:
    """Normalize text for the lexical (BM25) index.

    Steps, in order: pass the text through ``contractions.fix``, lowercase it,
    replace newline/tab/U+200E and every character outside ``[a-z0-9]`` with a
    space, tokenize with NLTK, drop stop words, stem the remaining tokens and
    join them with single spaces.

    Args:
        text: Raw passage or query text.

    Returns:
        The space-separated stemmed tokens as one string.
    """
    lowered = fix_contractions(text).lower()

    without_breaks = pattern_newline.sub(' ', lowered)
    alnum_only = pattern_non_alphanumeric.sub(' ', without_breaks)

    stems = []
    for word in word_tokenize(alnum_only):
        # Stop words are filtered before stemming.
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))

    joined = ' '.join(stems)
    return pattern_multiple_spaces.sub(' ', joined).strip()

def simple_cleaning(query: str) -> str:
    """Light whitespace normalization used before semantic encoding.

    Replaces newline, tab and U+200E characters with a space, collapses runs
    of spaces to one and strips the ends. Case and punctuation are untouched.

    Args:
        query: Raw query or passage text.

    Returns:
        The whitespace-normalized text.
    """
    without_breaks = pattern_newline.sub(' ', query)
    return pattern_multiple_spaces.sub(' ', without_breaks).strip()

def tokenizer(text:str)-> list:
    """Split text on whitespace and return its unigrams followed by its bigrams.

    Args:
        text: Text whose tokens are separated by whitespace.

    Returns:
        A new list: every token in order, then every pair of adjacent tokens
        joined by a single space.
    """
    words = text.split()

    # Pair each word with its successor; fewer than two words yields no pairs.
    pairs = [f"{left} {right}" for left, right in zip(words, words[1:])]

    return words + pairs

In [33]:
# Clean every passage and expand it into unigram + bigram tokens for BM25.
tokenized_corpus = []
for doc in collection:
    tokenized_corpus.append(tokenizer(clean_text(doc['text'])))

In [34]:
# NumPy array of the passage dicts, so the retrieval loops can select several
# passages at once with an array of indices.
collection_array = np.array(collection)

len(tokenized_corpus) # number of tokenized passages; the earlier note "10592 (originally 13732)" does not match this cell's output

32810

## Lexical Retriever: BM25

Let us evaluate the lexical retriever using BM25 (baseline)

In [35]:
# Okapi BM25 index over the unigram + bigram tokenized corpus.
bm25 = BM25Okapi(
    tokenized_corpus,
    k1=1.5,
    b=0.75,
)

In [36]:
def sintactic_query_bm5(query: str, bm5_instance: BM25Okapi) -> np.array:
    """Score every indexed passage against a query with BM25.

    The query goes through the same ``clean_text`` + ``tokenizer`` pipeline
    that was used to build the tokenized corpus.

    Args:
        query: Raw question text.
        bm5_instance: BM25 index to query.

    Returns:
        Whatever ``bm5_instance.get_scores`` returns for the query tokens.
    """
    query_terms = tokenizer(clean_text(query))
    return bm5_instance.get_scores(query_terms)

In [37]:
# Bind the BM25 index so the retriever can be called with the query text alone.
sintactic_bm25_retriever = partial(
    sintactic_query_bm5,
    bm5_instance=bm25,
)

In [38]:
retrieved = {}
top_n = 20

# Load the questions first so the file is closed before the retrieval loop runs.
with open("./QnA_complete_fixed.json", encoding='utf-8') as f:
    data = json.load(f)

for e in tqdm(data):
    query = e['Question']

    scores = sintactic_bm25_retriever(query)

    if len(scores) > top_n:
        # Unordered selection of the top_n highest-scoring passages.
        top_k = np.argpartition(-scores, top_n)[:top_n]
    else:
        # np.argpartition rejects kth >= len(scores); with this few passages
        # every one of them is a candidate.
        top_k = np.arange(len(scores))

    # Order the selected passages from best to worst score.
    top_k = top_k[np.argsort(-scores[top_k])]

    top_docs = collection_array[top_k]

    top_scores = scores[top_k]

    top_results = [{**doc, 'score': score} for doc, score in zip(top_docs, top_scores)]

    retrieved[e["QuestionID"]] = top_results

100%|██████████| 240/240 [01:06<00:00,  3.59it/s]


In [39]:
# Persist the BM25 run, one line per hit:
# "<question id> 0 <passage id> <rank> <score> bm25", ranks starting at 1.
with open("./data/rankings_sintactic.trec", "w") as f:
    for qid, hits in retrieved.items():
        f.writelines(
            f"{qid} 0 {hit['ID']} {i+1} {hit['score']} bm25" + "\n"
            for i, hit in enumerate(hits)
        )

In [40]:
# Score the BM25 run file against the judgments stored in ./data/qrels.
from trectools import TrecRun, TrecQrel, TrecEval # type: ignore

# NOTE: `qrels` is re-bound here; earlier it held the dict returned by load_qrels.
qrels = TrecQrel("./data/qrels")
run = TrecRun("./data/rankings_sintactic.trec")
te = TrecEval(run, qrels)

# Recall and MAP, both requested at depth 10.
recall_10 = te.get_recall(depth=10)
map_10 = te.get_map(depth=10)

print(f"recall_10             \tall\t{recall_10:.4f}")
print(f"map_cut_10            \tall\t{map_10:.4f}")

recall_10             	all	0.4875
map_cut_10            	all	0.2991


In [41]:
# Score the BM25 run file against the judgments stored in ./data/qrels.
from trectools import TrecRun, TrecQrel, TrecEval # type: ignore

qrels = TrecQrel("./data/qrels")
run = TrecRun("./data/rankings_sintactic.trec")
te = TrecEval(run, qrels)

# Recall and MAP, both requested at depth 20 (the run holds top_n = 20 hits per question).
recall_20 = te.get_recall(depth=20)
map_20 = te.get_map(depth=20)

print(f"recall_20             \tall\t{recall_20:.4f}")
print(f"map_cut_20            \tall\t{map_20:.4f}")

recall_20             	all	0.5500
map_cut_20            	all	0.3034


## Semantic Retriever: Fine-Tuned BAAI/bge-small-en-v1.5

Semantic retriever using a fine-tuned model based on `BAAI/bge-small-en-v1.5`

In [42]:
# No explicit device: sentence-transformers then uses a GPU when one is
# available and falls back otherwise, so this cell no longer requires CUDA
# just to load the model.
sentence_transformer_model = SentenceTransformer(
    'raul-delarosa99/bge-small-en-v1.5-RIRAG_ObliQA'
)

In [43]:
def semantic_query(query: str, corpus_embeddings_matrix: np.array, 
                   sentence_transformer_model: SentenceTransformer) -> np.array:
    """Score every passage against a query by embedding similarity.

    The query is whitespace-normalized with ``simple_cleaning``, encoded with
    ``normalize_embeddings=True`` and multiplied against the transposed corpus
    matrix.

    Args:
        query: Raw question text.
        corpus_embeddings_matrix: Passage embeddings, one row per passage.
            The scores are cosine similarities only if these rows are
            normalized as well.
        sentence_transformer_model: Model used to encode the query.

    Returns:
        One score per row of ``corpus_embeddings_matrix``.
    """
    # No `device` argument: encode() then runs on the device the model is
    # already on, instead of forcing "cuda" regardless of where it was loaded.
    query_emb = sentence_transformer_model.encode([simple_cleaning(query)],
                                                  normalize_embeddings=True)
    # query_emb holds a single row (one query), so take row 0 of the product.
    scores = (query_emb @ corpus_embeddings_matrix.T)[0]

    return scores

In [44]:
# Embed every passage once (whitespace-normalized text, normalized embeddings)
# so each query only needs one matrix product against this matrix.
# NOTE(review): `max_length` does not appear to be a documented argument of
# SentenceTransformer.encode; confirm it has any effect. The documented way to
# limit input length is the model's `max_seq_length` attribute.
corpus_embeddings_matrix = sentence_transformer_model.encode([simple_cleaning(doc['text']) for doc in collection_array],
                          normalize_embeddings=True,
                          show_progress_bar=True,
                          max_length=512,
                          )

Batches:   0%|          | 0/1026 [00:00<?, ?it/s]

In [45]:
# Bind the corpus embeddings and the model so the retriever can be called
# with the query text alone.
semantic_retriever = partial(
    semantic_query,
    corpus_embeddings_matrix=corpus_embeddings_matrix,
    sentence_transformer_model=sentence_transformer_model,
)

In [46]:
retrieved = {}
top_n = 20

# Load the questions first so the file is closed before the retrieval loop runs.
with open("./QnA_complete_fixed.json", encoding='utf-8') as f:
    data = json.load(f)

for e in tqdm(data):
    query = e['Question']

    scores = semantic_retriever(query)

    if len(scores) > top_n:
        # Unordered selection of the top_n highest-scoring passages.
        top_k = np.argpartition(-scores, top_n)[:top_n]
    else:
        # np.argpartition rejects kth >= len(scores); with this few passages
        # every one of them is a candidate.
        top_k = np.arange(len(scores))

    # Order the selected passages from best to worst score.
    top_k = top_k[np.argsort(-scores[top_k])]

    top_docs = collection_array[top_k]

    top_scores = scores[top_k]

    top_results = [{**doc, 'score': score} for doc, score in zip(top_docs, top_scores)]

    retrieved[e["QuestionID"]] = top_results

100%|██████████| 240/240 [00:04<00:00, 56.88it/s]


In [47]:
# Persist the semantic run, one line per hit:
# "<question id> 0 <passage id> <rank> <score> dense", ranks starting at 1.
with open("./data/rankings_semantic.trec", "w") as f:
    for qid, hits in retrieved.items():
        f.writelines(
            f"{qid} 0 {hit['ID']} {i+1} {hit['score']} dense" + "\n"
            for i, hit in enumerate(hits)
        )

In [48]:
# Score the semantic run file against the judgments stored in ./data/qrels.
from trectools import TrecRun, TrecQrel, TrecEval

qrels = TrecQrel("./data/qrels")
run = TrecRun("./data/rankings_semantic.trec")
te = TrecEval(run, qrels)

# Recall and MAP, both requested at depth 10.
recall_10 = te.get_recall(depth=10)
map_10 = te.get_map(depth=10)

print(f"recall_10             \tall\t{recall_10:.4f}")
print(f"map_cut_10            \tall\t{map_10:.4f}")

recall_10             	all	0.4458
map_cut_10            	all	0.2529


In [49]:
# Score the semantic run file against the judgments stored in ./data/qrels.
from trectools import TrecRun, TrecQrel, TrecEval

qrels = TrecQrel("./data/qrels")
run = TrecRun("./data/rankings_semantic.trec")
te = TrecEval(run, qrels)

# Recall and MAP, both requested at depth 20 (the run holds top_n = 20 hits per question).
recall_20 = te.get_recall(depth=20)
map_20 = te.get_map(depth=20)

print(f"recall_20             \tall\t{recall_20:.4f}")
print(f"map_cut_20            \tall\t{map_20:.4f}")

recall_20             	all	0.5000
map_cut_20            	all	0.2564


## Hybrid Retriever (BM25 + Fine-Tuned BAAI/bge-small-en-v1.5)

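Hybrid retriever that combines min-max-normalized BM25 scores with the min-max-normalized scores of the fine-tuned `BAAI/bge-small-en-v1.5` semantic retriever, as a weighted average controlled by `alpha`.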

In [50]:
def _min_max_normalize(scores: np.ndarray) -> np.ndarray:
    """Rescale scores linearly to [0, 1]; a constant vector maps to all zeros."""
    lowest = scores.min()
    spread = scores.max() - lowest
    if spread == 0:
        # All scores are equal (e.g. no query term matched any passage), so
        # dividing by the range would yield NaN for every passage.
        return np.zeros_like(scores, dtype=float)
    return (scores - lowest) / spread


def hybrid_query_avg(query: str, sintactic_retriever: partial, semantic_retriever: partial, 
                     alpha: float = 0.5) -> np.ndarray:
    """Fuse lexical and semantic scores with a weighted average.

    Each retriever is called with the query, its scores are min-max
    normalized, and the two vectors are combined as
    ``alpha * semantic + (1 - alpha) * lexical``.

    Args:
        query: Raw question text.
        sintactic_retriever: Callable returning one lexical score per passage.
        semantic_retriever: Callable returning one semantic score per passage,
            aligned with the lexical scores.
        alpha: Weight of the semantic scores; the lexical scores get
            ``1 - alpha``.

    Returns:
        One fused score per passage. A retriever whose scores are all equal
        contributes zeros instead of NaN.
    """
    sintactic_scores = _min_max_normalize(sintactic_retriever(query))
    semantic_scores = _min_max_normalize(semantic_retriever(query))

    scores = alpha * semantic_scores + (1 - alpha) * sintactic_scores

    return scores

In [51]:
retrieved = {}
top_n = 20

# Load the questions first so the file is closed before the retrieval loop runs.
with open("./QnA_complete_fixed.json", encoding='utf-8') as f:
    data = json.load(f)

for e in tqdm(data):
    query = e['Question']

    # Weighted fusion: 0.65 on the semantic scores, 0.35 on the BM25 scores.
    scores = hybrid_query_avg(
        query,
        sintactic_retriever=sintactic_bm25_retriever,
        semantic_retriever=semantic_retriever,
        alpha=0.65,
    )

    if len(scores) > top_n:
        # Unordered selection of the top_n highest-scoring passages.
        top_k = np.argpartition(-scores, top_n)[:top_n]
    else:
        # np.argpartition rejects kth >= len(scores); with this few passages
        # every one of them is a candidate.
        top_k = np.arange(len(scores))

    # Order the selected passages from best to worst score.
    top_k = top_k[np.argsort(-scores[top_k])]

    top_docs = collection_array[top_k]

    top_scores = scores[top_k]

    top_results = [{**doc, 'score': score} for doc, score in zip(top_docs, top_scores)]

    retrieved[e["QuestionID"]] = top_results

100%|██████████| 240/240 [01:15<00:00,  3.20it/s]


In [52]:
# Persist the hybrid run, one line per hit:
# "<question id> 0 <passage id> <rank> <score> hybrid", ranks starting at 1.
with open("./data/rankings_hybrid.trec", "w") as f:
    for qid, hits in retrieved.items():
        f.writelines(
            f"{qid} 0 {hit['ID']} {i+1} {hit['score']} hybrid" + "\n"
            for i, hit in enumerate(hits)
        )

In [53]:
# Score the hybrid run file against the judgments stored in ./data/qrels.
from trectools import TrecRun, TrecQrel, TrecEval # type: ignore

qrels = TrecQrel("./data/qrels")
run = TrecRun("./data/rankings_hybrid.trec")
te = TrecEval(run, qrels)

# Recall and MAP, both requested at depth 10.
recall_10 = te.get_recall(depth=10)
map_10 = te.get_map(depth=10)

print(f"recall_10             \tall\t{recall_10:.4f}")
print(f"map_cut_10            \tall\t{map_10:.4f}")

recall_10             	all	0.4958
map_cut_10            	all	0.2989


In [54]:
# Score the hybrid run file against the judgments stored in ./data/qrels.
from trectools import TrecRun, TrecQrel, TrecEval # type: ignore

qrels = TrecQrel("./data/qrels")
run = TrecRun("./data/rankings_hybrid.trec")
te = TrecEval(run, qrels)

# Recall and MAP, both requested at depth 20 (the run holds top_n = 20 hits per question).
recall_20 = te.get_recall(depth=20)
map_20 = te.get_map(depth=20)

print(f"recall_20             \tall\t{recall_20:.4f}")
print(f"map_cut_20            \tall\t{map_20:.4f}")

recall_20             	all	0.5625
map_cut_20            	all	0.3038
