In [None]:
# Install the third-party dependencies into the environment of the running
# kernel. `%pip` targets the kernel's interpreter, whereas `!pip` uses whichever
# pip happens to be first on PATH.
%pip install -q sentence_transformers transformers whoosh pytrec_eval wget

In [None]:
import os

import wget

# Fetch the course dataset only once. Re-running the cell reuses the existing
# archive instead of downloading another copy next to it.
DATASET_URL = "https://github.com/MIE451-1513-2023/course-datasets/raw/main/government.zip"
DATASET_ARCHIVE = "government.zip"

if not os.path.exists(DATASET_ARCHIVE):
    wget.download(DATASET_URL, DATASET_ARCHIVE)

In [None]:
# -n: never overwrite files that were already extracted, so re-running the cell
#     does not stop at an interactive "replace?" prompt.
# -q: do not list every extracted file in the cell output.
!unzip -n -q government.zip

In [1]:
# imports
# Put all your imports here
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import abc
from abc import abstractmethod
from whoosh.analysis import Filter
from whoosh import qparser

from sentence_transformers import SentenceTransformer
import torch
import nltk
from nltk.stem import *
from whoosh.index import FileIndex
from whoosh.searching import Searcher
import pandas as pd
import json
import itertools
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [2]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    Abstract base class for the IR systems defined in this notebook.

    Subclasses implement index creation, document ingestion, parser/searcher
    creation and the search itself. This class supplies the dataset paths and
    the TREC-style evaluation / reporting helpers shared by all of them.
    """

    def __init__(self, data_dir):
        """
        Record the dataset paths found under `data_dir`, then call the
        subclass hooks create_index() and create_parser_searcher().

        add_files() is NOT called here; callers invoke it themselves.
        """
        # DON'T change the following definitions for topic_file, qrels_file, document_dir, file_list
        self.topic_file = os.path.join(data_dir, "gov.topics")
        self.qrels_file = os.path.join(data_dir, "gov.qrels")
        self.document_dir = os.path.join(data_dir, "documents")
        # Every regular file below document_dir (recursive glob), as strings.
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        self.create_index()
        self.create_parser_searcher()

    @abstractmethod
    def create_index(self):
        """Subclass hook: create the index (called from __init__)."""
        pass

    @abstractmethod
    def add_files(self):
        """Subclass hook: add the documents in self.file_list to the index."""
        pass

    @abstractmethod
    def create_parser_searcher(self):
        """Subclass hook: create the query parser and searcher (called from __init__ and py_trec_eval)."""
        pass

    @abstractmethod
    def perform_search(self, topic_phrase):
        """Subclass hook: run the search for `topic_phrase` and return an iterable results object."""
        pass

    @staticmethod
    def post_process_score(score):
        """Identity by default; only print_rel_name applies it (py_trec_eval does not)."""
        return score

    @staticmethod
    def print_trec_eval_result(results):
        """
        Print one line per (measure, query id) and then one aggregated 'all'
        line per measure. Prints 'empty results' and returns if `results` is
        falsy. The "runid" entry is skipped in both passes.
        """

        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        # `query_measures` here is the value left over from the LAST iteration
        # of the loop above; its keys are used as the list of measure names.
        for measure in query_measures.keys():
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure]
                    for query_measures in results.values()]))


    def score(self,docnum,topic_results, topic_phrase):
        """
        Return the score for position `docnum`, as reported by
        `topic_results.score`. `topic_phrase` is unused in this default
        implementation.
        """
        return topic_results.score(docnum)


    def print_rel_name(self, q_id):
        """
        Debug helper for a single topic: print the topic line whose id equals
        `q_id`, every document returned for it (TREC run format), and the qrels
        lines for that topic whose relevance field is "1".
        """
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()
        for topic in topics:
            # Each topic line is "<id> <phrase>"; split on the first space only.
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            if topic_id == q_id:
                print("---------------------------Topic_id and Topic_phrase----------------------------------")
                print(topic_id, topic_phrase)
                # get search result
                topic_results = self.perform_search(topic_phrase)
                print("---------------------------Return documents----------------------------------")
                # `docnum` is the position in the result list (from enumerate).
                for (docnum, result) in enumerate(topic_results):
                    score = self.score(docnum, topic_results, topic_phrase)
                    score = self.post_process_score(score)
                    print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                print("---------------------------Relevant documents----------------------------------")
                with open(self.qrels_file, 'r') as f_qrel:
                    qrels = f_qrel.readlines()
                    for i in qrels:
                        # Expects exactly four single-space separated fields per line.
                        qid, _, doc, rel = i.rstrip().split(" ")
                        if qid == q_id and rel == "1":
                            print(i.rstrip())

    def py_trec_eval(self):
        """
        Evaluate the system on every topic in self.topic_file against
        self.qrels_file with pytrec_eval, and print the resulting measures.

        The parser/searcher are recreated first, the run is written to a
        temporary file in TREC format and parsed back, and topics that returned
        no documents are filled in with synthetic measure values.
        """

        self.create_parser_searcher()
        # Load topic file - a list of topics(search phrases) used for evalutation
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()

        # create an output file to which we'll write our results
        # NOTE(review): mkstemp() also returns an open OS-level file descriptor,
        # which is discarded here; the temp file is not removed in this method.
        temp_output_file = tempfile.mkstemp()[1]
        with open(temp_output_file, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                topic_id, topic_phrase = tuple(topic.split(" ", 1))
                # get search result
                topic_results = self.perform_search(topic_phrase)
                # format the result
                for (docnum, result) in enumerate(topic_results):
                    score = self.score(docnum, topic_results, topic_phrase)
                    outputTRECFile.write(
                        "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                    # Remember a topic that has at least one result; it is used
                    # below as the template for the set of measure names.
                    topic_with_result = topic_id


        with open(self.qrels_file, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(temp_output_file, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        #fill results dictionary with queries that were returned 0 documents
        # NOTE(review): `topic_with_result` is only bound inside the result loop
        # above, so it is undefined here if no topic returned any document.
        topic_ids = {t.split()[0] for t in topics}
        for emptyresult_topicid in topic_ids.difference(set(results.keys())):
            num_rel = float(sum(qrel[emptyresult_topicid].values()))
            # Every measure is set to 0.0 when relevant documents exist but none
            # were returned, and to 1.0 when there was nothing relevant to find.
            if num_rel>0:
              topic_stats={measure:0.0 for measure in results[topic_with_result]}
            else:
              topic_stats={measure:1.0 for measure in results[topic_with_result]}
            # The counting measures are then overwritten with their real values.
            topic_stats["num_rel"]=num_rel
            topic_stats["num_ret"] = 0.0
            topic_stats["num_rel_ret"] = 0.0
            topic_stats["num_q"]=1.0

            results[emptyresult_topicid] = topic_stats


        self.print_trec_eval_result(results)

In [3]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
class CustomFilter(Filter):
    """
    Whoosh filter that rewrites each token's text with a user-supplied function.

    `filterFunc` is called as ``filterFunc(token_text, *args, **kwargs)`` and
    its return value replaces the token text. The same transformation is
    applied whether the analyzer runs for indexing or for query parsing.
    """
    is_morph = True
    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs
    def __eq__(self, other):
        # The original signature was `__eq__(self)`, so any `==` comparison
        # involving a CustomFilter raised TypeError. Equality is kept
        # class-based, as the original body intended: two CustomFilter
        # instances compare equal regardless of the wrapped function.
        return self.__class__ is other.__class__
    def __call__(self, tokens):
        for t in tokens:
            if t.mode == 'query': # if called by query parser
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t
            else: # == 'index' if called by indexer
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t

# Dont change this! Use it as-is in your code if you rerank your results using a non-Whoosh scoring function
class NeuralResults():
  '''
  Re-ranked search results exposed through the small part of the
  whoosh.searching.Results interface that the evaluation code relies on:
  iteration over hits (dicts with a 'file_path' key) and score(position).

  booleansearchdocs: candidate document indices into file_list
  scores:            one score per candidate, aligned with booleansearchdocs
  rankings:          candidate positions in the desired output order; must
                     expose `.shape` and be iterable. A value whose shape is
                     empty (a 0-dimensional ranking) produces no results.
  file_list:         maps a document index to its file path
  '''
  def __init__(self, booleansearchdocs,scores,rankings, file_list):
    if rankings.shape:
      self.results = [
        {'file_path': file_list[booleansearchdocs[position]], 'score': scores[position]}
        for position in rankings
      ]
    else:
      self.results = []

  def score(self,docnum):
    hit = self.results[docnum]
    return hit['score']

  def __iter__(self):
    return iter(self.results)

## Question 4

**1. The auto-grader will extract and use the following variables, DON'T change their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined functions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

**6. If you are doing neural IR you should precompute your corpus embeddings and save them in the corpus_embeddings.json file. If you do this, please keep the code used to generate the embeddings somewhere in this notebook**

In [4]:
class IRQ4(IRSystem):
    """
    Two-stage retrieval system.

    Stage 1: a Whoosh OR-query over the analyzed document text selects the
             candidate documents.
    Stage 2: the candidates are re-ranked by a weighted sum of the similarity
             scores of two sentence-transformer models, using pre-computed,
             8-bit quantized corpus embeddings loaded from
             corpus_embeddings.json.
    """

    # Ensemble weights for the all-mpnet and multi-qa-mpnet scores respectively.
    MPNET_WEIGHT = 0.59
    MULTIQA_WEIGHT = 0.41

    def dequantization(self, embedding, zero_point, scale):
        """Map quantized values back to floats: (q - zero_point) * scale."""
        dequantized_embedding = (embedding.float() - zero_point) * scale
        return dequantized_embedding

    def load(self):
        """
        Load and dequantize the corpus embeddings and load both query encoders.

        Sets self.mpnet_deq and self.multi_deq (one row per entry of
        self.file_list, in that order) and self.mpnet / self.multiqa.
        Raises KeyError if a file in self.file_list has no stored embedding.
        """
        with open('corpus_embeddings.json', 'r') as file:
            data = json.load(file)

        # The cell that writes corpus_embeddings.json stores keys with forward
        # slashes, so normalise the lookup keys the same way. Otherwise the
        # lookup fails wherever str(Path) yields backslashes.
        doc_keys = [doc.replace("\\", "/") for doc in self.file_list]
        embedding_1 = torch.Tensor([data["embedding_1"][key] for key in doc_keys]).cpu()
        embedding_2 = torch.Tensor([data["embedding_2"][key] for key in doc_keys]).cpu()

        zero_point_1, zero_point_2 = data['zero_point']
        scale_1, scale_2 = data['scale']
        self.mpnet_deq = self.dequantization(embedding_1, zero_point_1, scale_1)
        self.multi_deq = self.dequantization(embedding_2, zero_point_2, scale_2)

        self.mpnet = SentenceTransformer("sentence-transformers/all-mpnet-base-v2").cpu()
        self.multiqa = SentenceTransformer("sentence-transformers/multi-qa-mpnet-base-dot-v1").cpu()

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # Embeddings and models are loaded here because __init__ (which must
        # not be changed) calls create_index() before anything else.
        self.load()
        # DON't change the name of 'index_sys'
        myAnalyzer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | CustomFilter(LancasterStemmer().stem)
        schema = Schema(file_path = ID(stored=True), file_content = TEXT(analyzer = myAnalyzer))
        indexDir = tempfile.mkdtemp()
        self.index_sys = index.create_in(indexDir, schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys
        """
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        try:
            # Documents are added in self.file_list order; perform_search relies
            # on that order (see the note there).
            for filePath in self.file_list:
                with open(filePath, "r", encoding="utf-8") as f:
                    content = f.read()
                writer.add_document(file_path = filePath, file_content = content)
        finally:
            # close the index
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.searcher, which should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
        # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema, group=qparser.OrGroup)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results OR NeuralResults

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase

        The Whoosh document numbers of the boolean matches are used directly as
        row indices into the embedding matrices and into self.file_list.
        NOTE(review): this assumes Whoosh numbers documents in the order
        add_files() added them - confirm if indexing is ever changed.
        """
        topicResults = self.searcher.search(self.query_parser.parse(topic_phrase), limit=None)
        booleansearchdocs = list(topicResults.docs())

        # Nothing matched: skip the (comparatively expensive) query encoding.
        if not booleansearchdocs:
            return NeuralResults([], torch.empty(0), torch.empty(0, dtype=torch.long), self.file_list)

        query_embedding_mpnet = self.mpnet.encode(topic_phrase,convert_to_tensor=True, normalize_embeddings=True).cpu()
        query_embedding_multiqa = self.multiqa.encode(topic_phrase,convert_to_tensor=True, normalize_embeddings=True).cpu()

        # squeeze(0) removes only the query dimension. A bare squeeze() would
        # also collapse a single candidate to a 0-d tensor, and NeuralResults
        # then silently drops that one result.
        scores_mpnet = torch.mm(query_embedding_mpnet.unsqueeze(0), self.mpnet_deq[booleansearchdocs].T).squeeze(0)
        scores_multiqa = torch.mm(query_embedding_multiqa.unsqueeze(0), self.multi_deq[booleansearchdocs].T).squeeze(0)

        # Weighted ensemble (original note: "Weighted -> 0.4754").
        # Other combinations that were tried, with the value noted for each:
        #   (scores_mpnet + scores_multiqa) / 2               Averaging -> 0.4761
        #   torch.max(scores_mpnet, scores_multiqa)           Max -> 0.4858
        #   min-max normalise each model, then average        Normalization + Averaging -> 0.4726
        #   softmax each model (dim=-1), then average         Softmax + Averaging -> 0.4740
        #   (scores_mpnet ** 2 + scores_multiqa ** 2) / 2     Square the scores -> 0.4740
        combined_scores = self.MPNET_WEIGHT * scores_mpnet + self.MULTIQA_WEIGHT * scores_multiqa

        rankings = torch.argsort(combined_scores, descending=True)
        return NeuralResults(booleansearchdocs, combined_scores, rankings, self.file_list)

#### Final Result @ 0.5543

In [5]:
# Build the system (the constructor creates the index and loads the embeddings
# and models), index the documents, then print per-topic and aggregate metrics.
myQ4 = IRQ4("government")
myQ4.add_files()
myQ4.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       481.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.1415
gm_map                   1       -1.9552
Rprec                    1       0.2000
bpref                    1       0.0800
recip_rank               1       0.2500
iprec_at_recall_0.00     1       0.2500
iprec_at_recall_0.10     1       0.2500
iprec_at_recall_0.20     1       0.2500
iprec_at_recall_0.30     1       0.1818
iprec_at_recall_0.40     1       0.1818
iprec_at_recall_0.50     1       0.1111
iprec_at_recall_0.60     1       0.1111
iprec_at_recall_0.70     1       0.0893
iprec_at_recall_0.80     1       0.0893
iprec_at_recall_0.90     1       0.0893
iprec_at_recall_1.00     1       0.0893
P_5                      1       0.2000
P_10                     1       0.1000
P_15                     1       0.1333
P_20                     1       0.1000
P_30                     1       0.10

#### MPNET only @ 0.5131

In [None]:
# NOTE(review): same code as the final-result cell. The score in the heading
# presumably came from a run with IRQ4.perform_search temporarily using only
# the all-mpnet scores - confirm; that variant is not visible here.
myQ4 = IRQ4("government")
myQ4.add_files()
myQ4.py_trec_eval()

#### Multi QA MPNET @ 0.5107

In [None]:
# NOTE(review): same code as the final-result cell. The score in the heading
# presumably came from a run with IRQ4.perform_search temporarily using only
# the multi-qa-mpnet scores - confirm; that variant is not visible here.
myQ4 = IRQ4("government")
myQ4.add_files()
myQ4.py_trec_eval()

#### Ensemble

In [None]:
# Average @ 0.5380
# NOTE(review): identical code to the other evaluation cells; presumably run
# with perform_search combining the two score vectors by plain averaging.
myQ4 = IRQ4("government")
myQ4.add_files()
myQ4.py_trec_eval()

In [None]:
# Max @ 0.5190
# NOTE(review): identical code to the other evaluation cells; presumably run
# with perform_search combining the two score vectors by element-wise max.
myQ4 = IRQ4("government")
myQ4.add_files()
myQ4.py_trec_eval()

In [None]:
# Normalization @ 0.5354
# NOTE(review): identical code to the other evaluation cells; presumably run
# with perform_search min-max normalising each score vector before averaging.
myQ4 = IRQ4("government")
myQ4.add_files()
myQ4.py_trec_eval()

In [None]:
# Softmax @ 0.5380
# NOTE(review): identical code to the other evaluation cells; presumably run
# with perform_search applying softmax to each score vector before averaging.
myQ4 = IRQ4("government")
myQ4.add_files()
myQ4.py_trec_eval()

In [None]:
# Square @ 0.5371
# NOTE(review): identical code to the other evaluation cells; presumably run
# with perform_search averaging the squared score vectors.
myQ4 = IRQ4("government")
myQ4.add_files()
myQ4.py_trec_eval()

### Please answer the following questions here
(a) A clear list of all final modifications made.
>Extended the analyzer chain: RegexTokenizer, LowercaseFilter, IntraWordFilter, StopFilter, and a custom filter that applies the Lancaster stemmer.\
>Added two neural re-ranking models, all-mpnet and multi-qa-mpnet, combined with a weighted ensemble.\
>Quantized the corpus embeddings.\
>Tried adding query expansion, but it did not help and was discarded.

(b) Why each modification was made – how did it help?
> The analyzer normalizes both the documents and the query, which raises recall in the boolean retrieval stage and so gives the neural models a better chance of finding relevant documents in the ranking stage.\
> Both neural networks improve the accuracy of the ranking stage. Each of them on its own achieves a MAP score above 0.5. After trying different ensembling methods, the weighted average of their scores achieved a MAP of 0.5543.\
> Quantization reduces the size of the stored corpus embeddings, and dequantization maps them back to floats at load time. However, some information may be lost in the process.\
> Query expansion, using both synonyms and SPLADE, was tried initially. However, num_rel_ret stayed the same even though more documents were retrieved in the boolean retrieval stage, so it was discarded.

(c) The  final  MAP  performance  that  these  modifications  attained.\
>0.5543

### Indexing

In [None]:
# Analyzer chain: tokenize, lowercase, split intra-word punctuation, drop stop
# words, then apply the Lancaster stemmer to every remaining token.
myAnalyzer = (
    RegexTokenizer()
    | LowercaseFilter()
    | IntraWordFilter()
    | StopFilter()
    | CustomFilter(LancasterStemmer().stem)
)
schema = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=myAnalyzer),
)
indexDir = tempfile.mkdtemp()
index_sys = index.create_in(indexDir, schema)

# Collect every regular file below government/documents.
document_dir = os.path.join("government", "documents")
file_list = [
    str(filePath)
    for filePath in Path(document_dir).glob("**/*")
    if filePath.is_file()
]

# Index each document and keep its raw text (in file_list order) so the
# embedding cells can encode it.
file_contents = []
writer = writing.BufferedWriter(index_sys, period=None, limit=1000)
try:
    for filePath in file_list:
        with open(filePath, "r", encoding="utf-8") as f:
            content = f.read()
            writer.add_document(file_path=filePath, file_content=content)
            file_contents.append(content)
finally:
    writer.close()

### Embedding

In [None]:
# `device` is used by the embedding cells but is not defined in any visible
# cell of this notebook, so define it here: use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Encode every document with all-mpnet-base-v2 (unit-normalised embeddings).
Mpnet = SentenceTransformer("sentence-transformers/all-mpnet-base-v2").to(device)
corpus_embeddings_mpnet=Mpnet.encode(
                                    file_contents,
                                    convert_to_tensor=True,
                                    show_progress_bar=True,
                                    device=device,
                                    normalize_embeddings=True
                                )

In [None]:
# Encode every document with all-MiniLM-L6-v2 (unit-normalised embeddings).
# NOTE(review): `device` and `file_contents` must be defined before running
# this cell. `corpus_embeddings_miniLM` is not used by any other visible cell.
MiniLM = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)
corpus_embeddings_miniLM=MiniLM.encode(
                                    file_contents,
                                    convert_to_tensor=True,
                                    show_progress_bar=True,
                                    device=device,
                                    normalize_embeddings=True
                                )

In [None]:
# Encode every document with multi-qa-mpnet-base-dot-v1 (unit-normalised
# embeddings); the result is quantized and saved as "embedding_2" further down.
# NOTE(review): `device` and `file_contents` must be defined before running
# this cell.
MultiQA = SentenceTransformer("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device)
corpus_embeddings_multiqa=MultiQA.encode(
                                    file_contents,
                                    convert_to_tensor=True,
                                    show_progress_bar=True,
                                    device=device,
                                    normalize_embeddings=True
                                )

### Quantization

In [None]:
def quantization(embedding, n_bits=8):
    """
    Affine-quantize a float tensor to unsigned integers stored as uint8.

    INPUT:
        embedding: torch tensor of floats (must be non-empty)
        n_bits:    number of quantization levels as a power of two (1..8; the
                   result is stored in a uint8 tensor, so 8 is the maximum)
    OUTPUT:
        (quantized_embedding, zero_point, scale) where zero_point and scale are
        0-d tensors such that (quantized.float() - zero_point) * scale
        approximates `embedding`.
    RAISES:
        ValueError if n_bits is outside 1..8.
    """
    if not 1 <= n_bits <= 8:
        raise ValueError("n_bits must be between 1 and 8 for uint8 storage")

    qmin = 0.
    qmax = 2.**n_bits - 1.
    min_val = embedding.min()
    max_val = embedding.max()
    scale = (max_val - min_val) / (qmax - qmin)
    # A constant tensor has zero range, which would make `scale` zero and turn
    # the divisions below into NaN/inf. Any non-zero scale round-trips a
    # constant exactly, so fall back to 1.
    if scale == 0:
        scale = torch.ones_like(scale)
    zero_point = qmin - min_val / scale

    quantized_embedding = (embedding / scale + zero_point).clamp(qmin, qmax).round().byte()
    return quantized_embedding, zero_point, scale

def dequantization(embedding, zero_point, scale):
    """
    Invert the affine quantization: convert `embedding` to float, subtract
    `zero_point` and multiply by `scale`.
    """
    shifted = embedding.float() - zero_point
    return shifted * scale

### Download and Load JSON

In [None]:
# Document ids are the file paths, with forward slashes so the keys are the
# same regardless of the platform that generated the file.
document_dir = os.path.join("government", "documents")
file_list = [str(filePath).replace("\\", "/") for filePath in Path(document_dir).glob("**/*") if filePath.is_file()]
doc_ids = file_list

# a*: quantized uint8 embeddings, b*: zero points, c*: scales.
a1,b1,c1 = quantization(corpus_embeddings_mpnet)
a2,b2,c2 = quantization(corpus_embeddings_multiqa)

# zip() silently truncates to the shorter input, so fail loudly if the number
# of documents and the number of embedding rows ever disagree.
assert len(doc_ids) == len(a1) == len(a2), "document list and embeddings are out of sync"
e1 = dict(zip(doc_ids, a1.tolist()))
e2 = dict(zip(doc_ids, a2.tolist()))


final_dict = {
    "embedding_1": e1,
    "embedding_2": e2,
    "zero_point": [b1.item(), b2.item()],
    "scale": [c1.item(), c2.item()]
}

# Use a context manager so the file is flushed and closed deterministically.
with open('corpus_embeddings.json', 'w') as json_file:
    json.dump(final_dict, json_file)

In [None]:
def load():
    """
    Read corpus_embeddings.json and return the dequantized embeddings.

    Uses the module-level `doc_ids` (the path keys written by the saving cell)
    to order the rows.

    OUTPUT:
        (mpnet_deq, multi_deq): float tensors with one row per entry of doc_ids
    """
    with open('corpus_embeddings.json', 'r') as file:
        data = json.load(file)

    # The original read from an undefined `dict_corpus_embeddings_2`; the
    # embeddings live in the JSON document that was just loaded.
    embedding_1 = torch.Tensor([data["embedding_1"][doc] for doc in doc_ids])
    embedding_2 = torch.Tensor([data["embedding_2"][doc] for doc in doc_ids])

    zero_point_1, zero_point_2 = data['zero_point']
    scale_1, scale_2 = data['scale']
    mpnet_deq = dequantization(embedding_1, zero_point_1, scale_1)
    multi_deq = dequantization(embedding_2, zero_point_2, scale_2)
    return mpnet_deq, multi_deq

In [None]:
# NOTE(review): stale round-trip check, marked as no longer working by its
# author. It reads corpus_embeddings.json with integer column ids, whereas the
# saving cell above writes a dict keyed by "embedding_1"/"embedding_2"/... .
# `device` and `corpus_embeddings` are not defined in any visible cell.
# It also rebinds the module-level `doc_ids` to integers, which load() uses as
# path keys - presumably load() fails if this cell runs first; confirm.
doc_ids = list(range(4078))
corpus_embeddings_3 = torch.Tensor(pd.read_json('corpus_embeddings.json')[doc_ids].T.values).to(device)
(corpus_embeddings_3==corpus_embeddings).all() # NOT WORKING ANYMORE

### Query Expansion

In [None]:
# Method 1 Synonym
def expand_query(user_query, limit):
    """
    Method 1 (synonyms): expand a query with WordNet synonyms and antonyms.

    Each analyzed query term is replaced in turn by every WordNet lemma (and
    the first antonym of each such lemma) whose part-of-speech tag equals the
    tag of the term. All combinations are scored by Jaccard similarity to the
    original term set.

    INPUT:
        user_query: string
        limit:      maximum number of expanded queries to return
    OUTPUT:
        list of (expanded_query, jaccard_similarity) tuples, best first;
        an empty list if the analyzer leaves no terms.

    NOTE: requires the NLTK tokenizer, tagger and WordNet data to be installed.
    The number of combinations is the product of the per-term alternative
    counts, so it grows quickly with query length.
    """
    # These names are not imported at the top of the notebook; nltk itself
    # already is, so import them locally.
    from nltk import pos_tag, word_tokenize
    from nltk.corpus import wordnet

    # to split and filter
    myAnalyzer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | CustomFilter(WordNetLemmatizer().lemmatize)
    tmp_user_query = myAnalyzer(user_query.replace("'", "").replace("/", ""))

    expanded_terms = []   # one set of alternatives per query term
    original_terms = []   # the analyzed query terms, in order

    # to get synonyms and antonyms
    for term in tmp_user_query:
        term_text = term.text
        original_terms.append(term_text)
        synonyms = {term_text}
        tagged_term = pos_tag(word_tokenize(term_text))
        term_tag = tagged_term[0][1] if tagged_term else None

        for syn in wordnet.synsets(term_text):
            for lemma in syn.lemmas():
                tagged_lemma = pos_tag(word_tokenize(lemma.name()))
                # compare tags: the lemma's tag must match the term's tag
                if tagged_lemma and tagged_lemma[0][1] == term_tag:
                    synonyms.add(lemma.name().replace("_", " "))
                    if lemma.antonyms():
                        synonyms.add(lemma.antonyms()[0].name().replace("_", " "))

        expanded_terms.append(synonyms)

    # Nothing survived analysis (empty or stop-word-only query). The original
    # divided by the size of an empty union here and raised ZeroDivisionError.
    if not original_terms:
        return []

    # Do combination
    expanded_queries = [' '.join(combo) for combo in itertools.product(*expanded_terms)]

    # Get jaccard similarity (the original term set is the same for every
    # candidate, so build it once)
    original_set = set(" ".join(original_terms).split())
    query_w_score = []
    for candidate in expanded_queries:
        candidate_set = set(candidate.split())
        jaccard_similarity = len(original_set & candidate_set) / len(original_set | candidate_set)
        query_w_score.append((candidate, jaccard_similarity))
    sorted_list = sorted(query_w_score, key=lambda x: x[1], reverse=True)

    return sorted_list[0:limit]

In [None]:
# Method 2 Splade
def expansion(topic_phrase):
    """
    Method 2 (SPLADE): expand a query with the terms a SPLADE model activates.

    The phrase is run through the SPLADE masked-LM, and the activated
    vocabulary tokens are grouped by their verb lemma, with the weights of each
    group averaged. The two highest-weighted lemmas are then each paired with
    each of the next eight.

    INPUT:
        topic_phrase: string
    OUTPUT:
        list of two-word query strings "<top lemma> <other lemma>"

    NOTE: the tokenizer and model are loaded on every call.
    """
    model_id = 'naver/splade-cocondenser-ensembledistil'
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForMaskedLM.from_pretrained(model_id)

    tokens = tokenizer(topic_phrase, return_tensors='pt')
    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**tokens)

    # SPLADE term weights: max over the sequence of log(1 + relu(logits)),
    # with padding positions masked out.
    vec = torch.max(
        torch.log(
            1 + torch.relu(output.logits)
        ) * tokens.attention_mask.unsqueeze(-1),
    dim=1)[0].squeeze()
    # nonzero() returns shape (K, 1). Squeeze only the last dimension so that
    # a single non-zero entry still yields a list rather than a bare int.
    cols = vec.nonzero().squeeze(-1).cpu().tolist()
    weights = vec[cols].cpu().tolist()

    idx2token = {
        idx: token for token, idx in tokenizer.get_vocab().items()
    }
    sparse_dict_tokens = {
        idx2token[idx]: round(weight, 2) for idx, weight in zip(cols, weights)
    }
    sparse_dict_tokens = {
        k: v for k, v in sorted(
            sparse_dict_tokens.items(),
            key=lambda item: item[1],
            reverse=True
        )
    }

    # lemma -> [number of tokens merged, running mean of their weights]
    lemmatizer = WordNetLemmatizer()
    lemma_stats = dict()
    for token, weight in sparse_dict_tokens.items():
        # Skip word-piece continuations.
        if token.startswith("##"):
            continue
        lemma = lemmatizer.lemmatize(token, "v")
        if lemma not in lemma_stats:
            lemma_stats[lemma] = [1, weight]
        else:
            count, mean = lemma_stats[lemma]
            lemma_stats[lemma] = [count + 1, (mean * count + weight) / (count + 1)]

    ranked_lemmas = sorted(lemma_stats, key=lambda lemma: lemma_stats[lemma][1], reverse=True)
    top_two_keys = tuple(ranked_lemmas[:2])
    next_eight_keys = tuple(ranked_lemmas[2:10])
    # The original put the two groups in a set, whose iteration order is
    # arbitrary, so the word order of the generated queries could change
    # between runs. Pass them in a fixed order instead.
    myQueryExp = [" ".join(c) for c in itertools.product(top_two_keys, next_eight_keys)]

    return myQueryExp

expansion("Juvenile Delinquency")

### Q4 Validation

In [6]:
# Sanity-check the attribute types the auto-grader extracts from the system.
q4 = IRQ4("government")
for attr_name, expected_type, failure_message in (
    ("index_sys", FileIndex, "Index Type"),
    ("query_parser", QueryParser, "Query Parser Type"),
    ("searcher", Searcher, "Searcher Type"),
):
    assert isinstance(getattr(q4, attr_name), expected_type), failure_message
print("Q4 Types Validated")

Q4 Types Validated
