# Assignment 1: IR

## Preparations
* Put all your imports, and path constants in the next cells

In [None]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget
!pip install transformers
!pip install sentence_transformers

In [None]:
import os.path

import wget

# Fetch the dataset archive only when it is not already on disk, so that
# re-running this cell (or Restart & Run All) does not download it again.
if not os.path.exists("government.zip"):
    wget.download("https://github.com/MIE451-1513-2023/course-datasets/raw/main/government.zip", "government.zip")

In [None]:
!unzip government.zip

In [4]:
# imports
# Put all your imports here
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import abc
from abc import abstractmethod
from whoosh.analysis import Filter
import nltk
from nltk.stem import *
from whoosh import qparser
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    Abstract base class inherited by the concrete IR systems.

    Subclasses provide the index, the query parser/searcher and the search
    itself; this class provides the shared evaluation and reporting helpers.
    The constructor calls ``create_index()`` and ``create_parser_searcher()``
    but does NOT call ``add_files()``.
    """

    def __init__(self, data_dir):
        """
        INPUT:
            data_dir: directory containing "gov.topics", "gov.qrels" and a
                      "documents" sub-directory
        """
        # DON'T change the following names,topic_file, qrels_file, document_dir, file_list
        self.topic_file = os.path.join(data_dir, "gov.topics")
        self.qrels_file = os.path.join(data_dir, "gov.qrels")
        self.document_dir = os.path.join(data_dir, "documents")
        # Every regular file found recursively below the documents directory.
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        self.create_index()
        self.create_parser_searcher()

    @abstractmethod
    def create_index(self):
        """Create the index and store it on the instance."""
        pass

    @abstractmethod
    def add_files(self):
        """Add the documents listed in self.file_list to the index."""
        pass

    @abstractmethod
    def create_parser_searcher(self):
        """Create the query parser and the searcher and store them on the instance."""
        pass

    @abstractmethod
    def perform_search(self, topic_phrase):
        """Run a search for topic_phrase and return an iterable of results."""
        pass

    @staticmethod
    def post_process_score(score):
        """Hook for adjusting a score before it is printed; identity by default."""
        return score

    @staticmethod
    def print_trec_eval_result(results):
        """
        Print every measure of every query, followed by the aggregated measures.

        INPUT:
            results: dict mapping query id -> dict mapping measure name -> value
        OUTPUT:
            None (prints to stdout; prints 'empty results' when results is empty)
        """

        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        # The measure names for the aggregate section are taken from the last
        # query explicitly instead of relying on the loop variable above.
        last_query_measures = list(results.values())[-1]
        for measure in last_query_measures.keys():
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [per_query[measure]
                    for per_query in results.values()]))


    def score(self,docnum,topic_results, topic_phrase):
        """Return the score of the docnum-th hit of topic_results."""
        return topic_results.score(docnum)

    def _read_topics(self):
        """
        Read self.topic_file and return a list of (topic_id, topic_phrase) pairs.

        Blank lines are skipped; a line holding only an id gets an empty phrase.
        """
        with open(self.topic_file, "r") as tf:
            lines = tf.read().splitlines()
        topics = []
        for line in lines:
            parts = line.strip().split(None, 1)
            if not parts:
                continue
            topic_phrase = parts[1] if len(parts) > 1 else ""
            topics.append((parts[0], topic_phrase))
        return topics

    def print_rel_name(self, q_id):
        """
        Print the ranked results for topic q_id followed by the qrels lines
        that mark a document as relevant (rel == "1") for that topic.

        INPUT:
            q_id: topic id as a string
        OUTPUT:
            None (prints to stdout; prints nothing when q_id is not a known topic)
        """
        for topic_id, topic_phrase in self._read_topics():
            if topic_id != q_id:
                continue
            print("---------------------------Topic_id and Topic_phrase----------------------------------")
            print(topic_id, topic_phrase)
            # get search result
            topic_results = self.perform_search(topic_phrase)
            print("---------------------------Return documents----------------------------------")
            for (docnum, result) in enumerate(topic_results):
                score = self.score(docnum, topic_results, topic_phrase)
                score = self.post_process_score(score)
                print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
            print("---------------------------Relevant documents----------------------------------")
            with open(self.qrels_file, 'r') as f_qrel:
                for line in f_qrel:
                    # Split on any whitespace and skip lines that are not
                    # "<qid> <iter> <doc> <rel>" instead of failing on them.
                    fields = line.split()
                    if len(fields) != 4:
                        continue
                    qid, _, doc, rel = fields
                    if qid == q_id and rel == "1":
                        print(line.rstrip())

    def py_trec_eval(self):
        """
        Run every topic through perform_search, evaluate the run against the
        qrels with pytrec_eval and print the per-topic and aggregated measures.

        Topics that are missing from the evaluator output are filled in with
        0.0 for every measure when they have relevant documents, else 1.0.
        """
        # Rebuild the parser/searcher before evaluating.
        # NOTE(review): presumably so documents added after construction are visible - confirm.
        self.create_parser_searcher()
        # Load topic file - a list of topics(search phrases) used for evaluation
        topics = self._read_topics()

        # create an output file to which we'll write our results;
        # mkstemp returns an open descriptor, which is wrapped (and closed) below
        fd, temp_output_file = tempfile.mkstemp()
        try:
            with os.fdopen(fd, "w") as outputTRECFile:
                # for each evaluated topic:
                # build a query and record the results in the file in TREC_EVAL format
                for topic_id, topic_phrase in topics:
                    # get search result
                    topic_results = self.perform_search(topic_phrase)
                    # format the result
                    # NOTE(review): print_rel_name applies post_process_score, this does not - confirm intended.
                    for (docnum, result) in enumerate(topic_results):
                        score = self.score(docnum, topic_results, topic_phrase)
                        outputTRECFile.write(
                            "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

            with open(self.qrels_file, 'r') as f_qrel:
                qrel = pytrec_eval.parse_qrel(f_qrel)

            with open(temp_output_file, 'r') as f_run:
                run = pytrec_eval.parse_run(f_run)
        finally:
            # the run file is only needed for parsing; do not leave it behind
            os.remove(temp_output_file)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        # Measure names are copied from any evaluated topic; when no topic was
        # evaluated at all only the four counters below are reported.
        template_measures = next(iter(results.values()), {})

        #fill results dictionary with queries that were returned 0 documents
        topic_ids = {topic_id for topic_id, _ in topics}
        for emptyresult_topicid in topic_ids.difference(set(results.keys())):
            num_rel = float(sum(qrel[emptyresult_topicid].values()))
            if num_rel>0:
                topic_stats={measure:0.0 for measure in template_measures}
            else:
                topic_stats={measure:1.0 for measure in template_measures}
            topic_stats["num_rel"]=num_rel
            topic_stats["num_ret"] = 0.0
            topic_stats["num_rel_ret"] = 0.0
            topic_stats["num_q"]=1.0

            results[emptyresult_topicid] = topic_stats


        self.print_trec_eval_result(results)


In [None]:
# ir_sys_0 = IRSystem("lab-data")

In [6]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
class CustomFilter(Filter):
    """
    Whoosh filter that replaces each token's text with the result of a
    user-supplied function, e.g. a stemmer's ``stem`` method.

    The same transformation is applied whether the token stream comes from
    the indexer or from the query parser.
    """
    # NOTE(review): presumably tells Whoosh that this filter changes token
    # text morphologically - confirm against the Whoosh Filter documentation.
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        """
        INPUT:
            filterFunc: callable invoked as filterFunc(text, *args, **kwargs)
                        that returns the replacement token text
            *args, **kwargs: extra arguments forwarded to filterFunc
        """
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        # The original signature had no `other` parameter, so every `==`
        # comparison raised TypeError. Two filters compare equal when they
        # are of exactly the same class (the wrapped function is not compared).
        return self.__class__ is other.__class__

    def __call__(self, tokens):
        """Yield each incoming token with its text passed through the wrapped function."""
        for t in tokens:
            # Both the 'query' and the 'index' mode apply the same transformation.
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

## Question 1
Provide your text answers in the following two markdown cells

### Q1 (a): Provide answer to Q1 (a) here [markdown cell]

MAP: mean average precision

### Q1 (b): Provide answer to Q1 (b) here [markdown cell]

 MAP is a comprehensive measure that takes into account both precision and recall at various cutoffs and provides an overall measure of retrieval quality.

## Question 2

### Q2 (a): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined functions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [7]:
from whoosh.qparser.syntax import QueryParserError
class IRQ2(IRSystem):
    """Baseline system: regex tokenization only and the default query parser settings."""

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """

        # The path is stored so results can report it; the content is indexed
        # with a bare RegexTokenizer (no lowercasing, stop words or stemming).
        schema = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = RegexTokenizer()))
        # The index lives in a fresh temporary directory.
        indexDir = tempfile.mkdtemp()

        # DON't change the name of 'index_sys'
        self.index_sys = index.create_in(indexDir, schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys
        """
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        try:
            for filePath in self.file_list:
                with open(filePath, "r", encoding="utf-8") as f:
                    content = f.read()
                writer.add_document(file_path = filePath, file_content = content)
        finally:
            # close the writer even when reading or adding a document fails
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.searcher which should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
        # This method runs again on every py_trec_eval() call; remember the
        # searcher opened earlier so it can be closed instead of leaked.
        previous_searcher = getattr(self, "searcher", None)

        # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

        if previous_searcher is not None:
            previous_searcher.close()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topic_results: whoosh.searching.Results

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        query = self.query_parser.parse(topic_phrase)
        # limit=None returns every matching document instead of a top-N cut
        topic_results = self.searcher.search(query, limit=None)
        return topic_results

In [8]:
# Build the baseline system: creates an empty index plus parser/searcher (no documents are added yet).
q2 = IRQ2("government")

In [9]:
# Index every file listed in q2.file_list.
q2.add_files()

In [10]:
# Evaluate all topics against the qrels and print per-topic and aggregated measures.
q2.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       1.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

In [11]:
# Show the returned documents and the judged-relevant documents for topic '1'.
q2.print_rel_name('1')

---------------------------Topic_id and Topic_phrase----------------------------------
1 mining gold silver coal
---------------------------Return documents----------------------------------
1 Q0 G00-90-0342721 0 26.645398 test
---------------------------Relevant documents----------------------------------
1 0 G00-00-1006224 1
1 0 G00-02-0901987 1
1 0 G00-03-1898526 1
1 0 G00-10-3730888 1
1 0 G00-10-3849661 1


### Q2 (b): Provide answer to Q2 (b) here [markdown cell]

0.2365

### Q2 (c): Provide answer to Q2(c) here [markdown cell]

Very well: 18, 24, 33

Very poorly: 1, 2, 6, 9

## Question 3

### Q3 (a): Provide answer to Q3 (a) here [markdown cell]

False negative: G00-02-0901987 in query 1 ("mining gold silver coal"). It is relevant, so it should have been returned (positive), but it was not retrieved and was therefore classified as negative.

False positive: G00-90-0342721 in query 1 ("mining gold silver coal"). It is not relevant, so it should have been negative, but it was retrieved and was therefore classified as positive.

Document G00-90-0342721 is wrongly returned by the vanilla Whoosh class because it contains all the words of the query (mining, gold, silver, and coal), even though its content is only about parks and their mining histories.

Document G00-02-0901987, on the other hand, is not retrieved even though it is more relevant, because we did not set the grouping of the query terms. The default is AndGroup, so only a document that contains every query word exactly is returned. Hence we miss this document even though its content is more closely related to mining jobs.

One improvement is for the IR class to do more processing of both the query and the documents: the word "mine" on its own, or the phrase "gold mine", can also be relevant to the query, but vanilla Whoosh never matches them, so such documents are dropped at the Boolean retrieval stage. In addition, OrGroup should be applied so that more results are retrieved during the Boolean retrieval stage.

What would also be helpful is to consider the context of both the query and the document to make the search more accurate in terms of content rather than simply matching words.

### Q3 (b): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined functions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [12]:
class IRQ3(IRSystem):
    """Improved system: lowercasing, intra-word splitting, stop-word removal, Lancaster stemming and OR-grouped queries."""

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # Analysis chain: tokenize -> lowercase -> split intra-word parts ->
        # drop stop words -> stem each token with the Lancaster stemmer.
        # NOTE(review): IntraWordFilter runs after LowercaseFilter here, so it
        # can no longer split on case changes - confirm this order is intended.
        myAnalyzer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | CustomFilter(LancasterStemmer().stem)
        schema = Schema(file_path = ID(stored=True), file_content = TEXT(analyzer = myAnalyzer))
        # The index lives in a fresh temporary directory.
        indexDir = tempfile.mkdtemp()
        # DON't change the name of 'index_sys'
        self.index_sys = index.create_in(indexDir, schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys
        """
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        try:
            for filePath in self.file_list:
                with open(filePath, "r", encoding="utf-8") as f:
                    content = f.read()
                # Each document goes straight to the writer; the contents are
                # not kept in memory afterwards.
                writer.add_document(file_path = filePath, file_content = content)
        finally:
            # close the index
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.searcher which should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
        # This method runs again on every py_trec_eval() call; remember the
        # searcher opened earlier so it can be closed instead of leaked.
        previous_searcher = getattr(self, "searcher", None)

        # DON't change the names of 'query_parser' and 'searcher'
        # OrGroup: a document matches when it contains ANY of the query terms.
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema, group=qparser.OrGroup)
        self.searcher = self.index_sys.searcher()

        if previous_searcher is not None:
            previous_searcher.close()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        query = self.query_parser.parse(topic_phrase)
        # limit=None returns every matching document instead of a top-N cut
        topicResults = self.searcher.search(query, limit=None)
        return topicResults

In [13]:
# Build the improved system: creates an empty index plus parser/searcher (no documents are added yet).
q3 = IRQ3("government")

In [14]:
# Index every file listed in q3.file_list.
q3.add_files()

In [15]:
# Evaluate all topics against the qrels and print per-topic and aggregated measures.
q3.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       481.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.0618
gm_map                   1       -2.7839
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0500
iprec_at_recall_0.00     1       0.0968
iprec_at_recall_0.10     1       0.0968
iprec_at_recall_0.20     1       0.0968
iprec_at_recall_0.30     1       0.0968
iprec_at_recall_0.40     1       0.0968
iprec_at_recall_0.50     1       0.0968
iprec_at_recall_0.60     1       0.0968
iprec_at_recall_0.70     1       0.0412
iprec_at_recall_0.80     1       0.0412
iprec_at_recall_0.90     1       0.0410
iprec_at_recall_1.00     1       0.0410
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0500
P_30                     1       0.06

In [16]:
# Show the returned documents and the judged-relevant documents for topic '1'.
q3.print_rel_name('1')

---------------------------Topic_id and Topic_phrase----------------------------------
1 mining gold silver coal
---------------------------Return documents----------------------------------
1 Q0 G00-90-0342721 0 21.053078 test
1 Q0 G00-07-1172041 1 20.516152 test
1 Q0 G00-86-3214229 2 20.032835 test
1 Q0 G00-21-2004003 3 19.386576 test
1 Q0 G00-48-3798484 4 18.551201 test
1 Q0 G00-42-1455285 5 18.180408 test
1 Q0 G00-26-0088644 6 18.031872 test
1 Q0 G00-50-2059900 7 18.006670 test
1 Q0 G00-23-3149835 8 17.763006 test
1 Q0 G00-27-2048511 9 17.478137 test
1 Q0 G00-32-2907392 10 17.021165 test
1 Q0 G00-73-3632837 11 16.942216 test
1 Q0 G00-02-0351712 12 16.819788 test
1 Q0 G00-31-1216640 13 16.819139 test
1 Q0 G00-34-1044519 14 16.792744 test
1 Q0 G00-94-0326199 15 16.578556 test
1 Q0 G00-24-4085400 16 16.552536 test
1 Q0 G00-74-1802348 17 16.186131 test
1 Q0 G00-98-3517069 18 15.674244 test
1 Q0 G00-10-3730888 19 15.560410 test
1 Q0 G00-08-0995170 20 15.106893 test
1 Q0 G00-01-2689026 2

### Q3 (c): Provide answer to Q3 (c) here [markdown cell]

Added analyzer filters: LowercaseFilter, IntraWordFilter, StopFilter, and the Lancaster stemmer (applied through CustomFilter so that it acts as a stemming filter). In addition, the query group was changed from the default AndGroup to OrGroup.

There was an overall increase in the MAP score: it rose from 0.2365 to 0.3761. With the new IR object, more documents are returned in the Boolean retrieval stage, which leads to a decrease in false negatives. However, there was a large increase in false positives, since many documents contain some of the query words but are completely irrelevant to the information need of the query.

### Q3 (d): Provide answer to Q3 (d) here [markdown cell]

Yes

### Q3 (e): Provide answer to Q3 (e) here [markdown cell]

Queries 22 and 26 show a drop in MAP score, while the others have increased or stayed the same (for those that were already at 1).

### Q3 (f): Provide answer to Q3 (f) here [markdown cell]

Overall there are positive improvements, because the overall MAP score increased. In addition, the number of false negatives decreased dramatically thanks to the OrGroup in the query parser. Moreover, our vanilla Whoosh system was so primitive that most queries had a MAP score of 0, meaning it could not find the "right" documents at all; so even though the precision of the new model is very poor, it is still better than the old one.

## Validation

In [None]:
# Run the following cells to make sure your code returns the correct value types

In [17]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

### Q2 Validation

In [18]:
# Type checks for the attributes the auto-grader reads.
# NOTE: this rebinds q2 to a newly constructed system whose index is empty
# (add_files() is not called here), replacing the indexed q2 from above.
q2 = IRQ2("government")
assert(isinstance(q2.index_sys, FileIndex)), "Index Type"
assert(isinstance(q2.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q2.searcher, Searcher)), "Searcher Type"
print("Q2 Types Validated")

Q2 Types Validated


### Q3 Validation

In [19]:
# Type checks for the attributes the auto-grader reads.
# NOTE: this rebinds q3 to a newly constructed system whose index is empty
# (add_files() is not called here), replacing the indexed q3 from above.
q3 = IRQ3("government")
assert(isinstance(q3.index_sys, FileIndex)), "Index Type"
assert(isinstance(q3.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q3.searcher, Searcher)), "Searcher Type"
print("Q3 Types Validated")

Q3 Types Validated
