# Assignment 2: IR

## Preparations
* Put all your imports, and path constants in the next cells
* Make sure all your path constants are **relative to** ***DATA_DIR*** and **NOT hard-coded** in your code.

In [1]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget



In [2]:
import os.path
import wget

# Download the course dataset only when it is not already present.
# Re-running the unguarded call saved a second copy under a new name
# ("government (1).zip") instead of reusing the existing archive.
if not os.path.exists("government.zip"):
    wget.download("https://github.com/MIE451-1513-2019/course-datasets/raw/master/government.zip", "government.zip")

'government (1).zip'

In [35]:
!unzip government.zip

Archive:  government.zip
replace government/topics-with-full-descriptions.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [0]:
# imports
# Put all your imports here
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget

In [0]:
# Root folder of the unzipped dataset; every other path is derived from it.
DATA_DIR = "government"

# Derived locations -- always built from DATA_DIR, never hard-coded.
DOCUMENTS_DIR = os.path.join(DATA_DIR, "documents")  # files to index
TOPIC_FILE = os.path.join(DATA_DIR, "gov.topics")    # one "<id> <phrase>" per line
QRELS_FILE = os.path.join(DATA_DIR, "gov.qrels")     # relevance judgements

## Question 1
Provide your text answers in the following two markdown cells

### Q1 (a): Provide answer to Q1 (a) here [num_rel_ret]

### Q1 (b): Provide answer to Q1 (b) here [People who search government documents normally want to find the information they need. Reporting num_rel_ret tells them the overall number of relevant documents retrieved, which gives them an overview of what they are searching for.]

## Question 2

### Q2 (a): Write your code below

In [6]:
#Creating the index
def createIndex(schema):
    """Create a new index for ``schema`` inside a freshly made temporary directory.

    Returns whatever ``index.create_in`` returns for that directory and schema.
    """
    storageDir = tempfile.mkdtemp()
    newIndex = index.create_in(storageDir, schema)
    return newIndex

# Q2 baseline schema: the file path is stored so it can be read back from a hit;
# the content is analyzed with a bare RegexTokenizer (no additional filters).
mySchema = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=RegexTokenizer()),
)
# Build the index for this schema (createIndex places it in a temporary directory).
myIndex = createIndex(mySchema)

#Indexing the documents
def addFilesToIndex(indexObj, fileList):
    """Read every file in ``fileList`` and add it to ``indexObj`` as one document.

    Each document gets its path in ``file_path`` and the full UTF-8 decoded
    text in ``file_content``.  A progress line is printed after every 1000
    files, and the writer is closed even if reading or indexing fails.
    """
    bufferedWriter = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        # count from 1 so the counter can be printed as-is
        for indexedSoFar, path in enumerate(fileList, start=1):
            with open(path, "r", encoding="utf-8") as handle:
                text = handle.read()
                bufferedWriter.add_document(file_path=path, file_content=text)

                # progress report every 1000 documents
                if indexedSoFar % 1000 == 0:
                    print("already indexed:", indexedSoFar)
        print("done indexing.")

    finally:
        # always release the writer
        bufferedWriter.close()
# Gather every regular file below DOCUMENTS_DIR (searched recursively) as a string path
filesToIndex = list(
    map(str, filter(Path.is_file, Path(DOCUMENTS_DIR).glob("**/*")))
)

# Sanity check: show the first few paths
print(filesToIndex[:5])

# Report how many files are about to be indexed
print("number of files:", len(filesToIndex))

addFilesToIndex(myIndex, filesToIndex)

# Make sure you save the final index in the variable INDEX_Q2, your query parser in QP_Q2, and your searcher in SEARCHER_Q2

['government/documents/68/G00-68-3094820', 'government/documents/68/G00-68-0361254', 'government/documents/68/G00-68-2512403', 'government/documents/68/G00-68-0000000', 'government/documents/68/G00-68-1332243']
number of files: 4078
already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [0]:
# Q2 query parser: parses query strings against the "file_content" field,
# using the schema of the Q2 index.
myQueryParser = QueryParser("file_content", schema=myIndex.schema)
# Q2 searcher: opened without a weighting argument, so the library default applies.
mySearcher = myIndex.searcher()

In [0]:
# Names checked by the "Q2 Validation" cell at the end of the notebook.
INDEX_Q2 = myIndex 
QP_Q2 = myQueryParser 
SEARCHER_Q2 = mySearcher

In [9]:
# Show the complete topic file
with open(TOPIC_FILE, "r") as f:
    print(f.read())

# Show only the first 10 qrels lines (the file is too long to print in full)
with open(QRELS_FILE, "r") as f:
    qrels10 = [line for _, line in zip(range(10), f)]
    print("".join(qrels10))

1 mining gold silver coal
2 juvenile delinquency
4 wireless communications
6 physical therapists
7 cotton industry
9 genealogy searches
10 Physical Fitness
14 Agricultural biotechnology
16 Emergency and disaster preparedness assistance
18 Shipwrecks
19 Cybercrime, internet fraud, and cyber fraud
22 Veteran's Benefits
24 Air Bag Safety
26 Nuclear power plants
28 Early Childhood Education

1 0 G00-00-0681214 0
1 0 G00-00-0945765 0
1 0 G00-00-1006224 1
1 0 G00-00-1591495 0
1 0 G00-00-2764912 0
1 0 G00-00-3253540 0
1 0 G00-00-3717374 0
1 0 G00-01-0270065 0
1 0 G00-01-0400712 0
1 0 G00-01-0682299 0



In [0]:
def pyTrecEval(topicFile, qrelsFile, queryParser, searcher):
    """Evaluate a searcher on a set of topics and print pytrec_eval measures.

    Every topic phrase is parsed with ``queryParser`` and run through
    ``searcher``; the ranked hits are written to a temporary run file, which is
    then scored against ``qrelsFile`` with ``pytrec_eval``.  Each measure is
    printed once per topic and once aggregated over all topics (scope ``all``).

    Args:
        topicFile: path to a text file with one ``<topic_id> <phrase>`` per line.
        qrelsFile: path to the relevance judgements read by
            ``pytrec_eval.parse_qrel``.
        queryParser: object whose ``parse(phrase)`` builds the query.
        searcher: object whose ``search(query, limit=None)`` returns the hits;
            the result must provide ``score(i)`` and each hit ``hit["file_path"]``.

    Returns:
        None; the measures are printed.
    """
    # Load topic file - a list of topics (search phrases) used for evaluation
    with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()

    # mkstemp hands back an already-open OS-level descriptor plus the path.
    # Wrap the descriptor so it gets closed, and delete the file when done;
    # indexing the tuple with [1] would leak both.
    runFd, runPath = tempfile.mkstemp()
    try:
        with os.fdopen(runFd, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                if not topic.strip():
                    continue  # tolerate blank lines in the topic file
                topic_id, topic_phrase = tuple(topic.split(" ", 1))
                topicQuery = queryParser.parse(topic_phrase)
                topicResults = searcher.search(topicQuery, limit=None)
                for (docnum, result) in enumerate(topicResults):
                    score = topicResults.score(docnum)
                    outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        with open(qrelsFile, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(runPath, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)
    finally:
        os.remove(runPath)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    # per-topic measures
    for query_id, query_measures in results.items():
        for measure, value in query_measures.items():
            if measure == "runid":
                continue
            print_line(measure, query_id, value)

    # Nothing was evaluated: there are no measure names to aggregate.
    if not results:
        return

    # aggregated measures; the names are taken from the last evaluated topic
    per_topic = list(results.values())
    for measure in per_topic[-1].keys():
        if measure == "runid":
            continue
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [topic_measures[measure] for topic_measures in per_topic]))

In [11]:
# Evaluate the Q2 baseline (parser and searcher built on the Q2 index).
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q2, SEARCHER_Q2)


num_q                    1       1.0000
num_ret                  1       1.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

### Q2 (b): Provide answer to Q2 (b) here [0.1971]

### Q2 (c): Provide answer to Q2(c) here [Topics 18 and 24 have very good results, with MAP values equal to 1. Topics 4, 10, 14, 22, and 26 did OK. Topics 1, 2, 6, 7, 9, and 16 have very poor results: their MAP values are 0.]

## Question 3

### Q3 (a): Provide answer to Q3 (a) here [A false positive is a document that the system retrieved but that is not relevant. A false negative is a document that the system treated as not relevant (did not return) but that is actually relevant. Take topic 26 as an example. The document in the run line "Q0 G00-61-1118212 0 25.225053 test" is ranked at the top of the returned documents, but it is not actually relevant, so it is a false positive. The qrels line "26 0 G00-92-1620651 1" marks a relevant document that was not returned; it should have been ranked near the top but was not, so it is a false negative. The schema defined above uses only the RegexTokenizer() analyzer, which is too simple: for example, the system may match unimportant terms such as "the" and "of" and therefore rank irrelevant documents at the top. We should improve the system by defining a new analyzer with stemming, lemmatization, LowercaseFilter, StopFilter, IntraWordFilter, etc.]

In [0]:
def printRelName(topicFile, qrelsFile, queryParser, searcher, id):
    """Print the ranked hits and the judged-relevant documents for one topic.

    Args:
        topicFile: path to a text file with one ``<topic_id> <phrase>`` per line.
        qrelsFile: path to a text file whose lines hold four whitespace-separated
            fields: topic id, an unused field, document name, relevance label.
        queryParser: object whose ``parse(phrase)`` builds the query.
        searcher: object whose ``search(query, limit=None)`` returns the hits;
            the result must provide ``score(i)`` and each hit ``hit["file_path"]``.
        id: topic id to report, as a string (compared against the text in the
            files).  The name shadows the builtin but is kept for callers.

    Blank lines in either file are skipped; any other malformed line still
    raises ``ValueError`` from the unpacking.
    """
    with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()

    for topic in topics:
        if not topic.strip():
            continue  # blank line in the topic file
        topic_id, topic_phrase = tuple(topic.split(" ", 1))
        if topic_id != id:
            continue

        print("---------------------------Topic_id and Topic_phrase----------------------------------")
        print(topic_id, topic_phrase)
        topicQuery = queryParser.parse(topic_phrase)
        topicResults = searcher.search(topicQuery, limit=None)

        print("---------------------------Return documents----------------------------------")
        for (docnum, result) in enumerate(topicResults):
            score = topicResults.score(docnum)
            print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        print("---------------------------Relevant documents----------------------------------")
        with open(qrelsFile, 'r') as f_qrel:
            for line in f_qrel:
                if not line.strip():
                    continue  # blank line in the qrels file
                # split() accepts tabs and repeated spaces, not only single blanks
                qid, _, doc, rel = line.split()
                if qid == id and rel == "1":
                    print(line.rstrip())

In [13]:
# Print the topic file again, to pick the topic id inspected in the next cell.
with open(TOPIC_FILE, "r") as f:
    print(f.read())

1 mining gold silver coal
2 juvenile delinquency
4 wireless communications
6 physical therapists
7 cotton industry
9 genealogy searches
10 Physical Fitness
14 Agricultural biotechnology
16 Emergency and disaster preparedness assistance
18 Shipwrecks
19 Cybercrime, internet fraud, and cyber fraud
22 Veteran's Benefits
24 Air Bag Safety
26 Nuclear power plants
28 Early Childhood Education



In [14]:
# Inspect topic 26 with the Q2 baseline: returned documents vs. relevant documents.
printRelName(TOPIC_FILE, QRELS_FILE, QP_Q2, SEARCHER_Q2,"26")

---------------------------Topic_id and Topic_phrase----------------------------------
26 Nuclear power plants
---------------------------Return documents----------------------------------
26 Q0 G00-61-1118212 0 25.225053 test
26 Q0 G00-76-3273936 1 23.814912 test
26 Q0 G00-01-3645577 2 22.681193 test
26 Q0 G00-84-2503293 3 22.639455 test
26 Q0 G00-49-0195605 4 22.506206 test
26 Q0 G00-30-0129773 5 22.470464 test
26 Q0 G00-97-1443049 6 22.048298 test
26 Q0 G00-73-1499832 7 21.748460 test
26 Q0 G00-08-2701029 8 21.451529 test
26 Q0 G00-50-2186799 9 21.295502 test
26 Q0 G00-30-1518511 10 20.813328 test
26 Q0 G00-15-0501460 11 20.707036 test
26 Q0 G00-11-0770745 12 20.420555 test
26 Q0 G00-03-2200929 13 20.200627 test
26 Q0 G00-72-1085257 14 19.836718 test
26 Q0 G00-05-1894408 15 19.228168 test
26 Q0 G00-64-3503951 16 18.457268 test
26 Q0 G00-13-1543158 17 18.416715 test
26 Q0 G00-50-1075346 18 17.385842 test
26 Q0 G00-11-1650256 19 17.310835 test
26 Q0 G00-76-2323292 20 10.320647 test
26

### Q3 (b): Write your code below

In [15]:
# Q3 analyzer: tokenize, then lowercase, split intra-word parts, drop stop
# words and stem (filters run left to right).
stmLwrStpIntraAnalyzer = (
    RegexTokenizer()
    | LowercaseFilter()
    | IntraWordFilter()
    | StopFilter()
    | StemFilter()
)

# Same two fields as the Q2 schema; only the content analyzer differs.
mySchema3 = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=stmLwrStpIntraAnalyzer),
)

# Build a separate index for the new schema and fill it with the same files.
myIndex3 = createIndex(mySchema3)
addFilesToIndex(myIndex3, filesToIndex)
# Make sure you save the final index in the variable INDEX_Q3, your query parser in QP_Q3, and your searcher in SEARCHER_Q3

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [0]:
# Q3 query parser: parses query strings against "file_content" using the Q3 schema,
# so queries go through the same analyzer as the indexed text.
myQueryParser3 = QueryParser("file_content", schema=myIndex3.schema)
# Q3 searcher: opened without a weighting argument, so the library default applies.
mySearcher3 = myIndex3.searcher()

In [0]:
# Names checked by the "Q3 Validation" cell at the end of the notebook.
INDEX_Q3 = myIndex3 # final index for Q3
QP_Q3 =  myQueryParser3 # final query parser for Q3
SEARCHER_Q3 = mySearcher3 # final searcher for Q3

In [18]:
# Evaluate the Q3 system (index built with the filtered, stemmed analyzer).
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q3, SEARCHER_Q3) 

num_q                    1       1.0000
num_ret                  1       3.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

### Q3 (c): Provide answer to Q3 (c) here [I modified the text analyzer by adding LowercaseFilter, IntraWordFilter, StopFilter, and StemFilter (stemming) after the RegexTokenizer. The new overall MAP improved to 0.3366. The false negative from part (a), "26 0 G00-92-1620651 1", has now become a true positive in part (c).]

In [19]:
# Inspect topic 26 again, this time with the Q3 system, to compare with the Q2 listing.
printRelName(TOPIC_FILE, QRELS_FILE, QP_Q3, SEARCHER_Q3,"26")

---------------------------Topic_id and Topic_phrase----------------------------------
26 Nuclear power plants
---------------------------Return documents----------------------------------
26 Q0 G00-76-3273936 0 23.102509 test
26 Q0 G00-28-2250792 1 23.047790 test
26 Q0 G00-61-1118212 2 22.994890 test
26 Q0 G00-72-1085257 3 22.236040 test
26 Q0 G00-30-0129773 4 21.930598 test
26 Q0 G00-01-3645577 5 21.732175 test
26 Q0 G00-84-2503293 6 21.532254 test
26 Q0 G00-13-1543158 7 21.230415 test
26 Q0 G00-50-2186799 8 21.070390 test
26 Q0 G00-30-1518511 9 21.026246 test
26 Q0 G00-15-0501460 10 20.693912 test
26 Q0 G00-53-1684082 11 20.545936 test
26 Q0 G00-49-0195605 12 20.312592 test
26 Q0 G00-37-2508472 13 20.269900 test
26 Q0 G00-11-1650256 14 20.252264 test
26 Q0 G00-63-1389256 15 20.203892 test
26 Q0 G00-14-1375985 16 20.126123 test
26 Q0 G00-62-1736084 17 20.018352 test
26 Q0 G00-64-3503951 18 19.973450 test
26 Q0 G00-73-1499832 19 19.930765 test
26 Q0 G00-50-1075346 20 19.905387 test
26

### Q3 (d): Provide answer to Q3 (d) here [Yes]

### Q3 (e): Provide answer to Q3 (e) here [Yes]

### Q3 (f): Provide answer to Q3 (f) here [It was a good idea. Most of the queries improved, and the overall MAP rose from 0.1971 to 0.3366. This means that when people run these queries, the returned documents tend to contain more relevant documents than with the previous system.]

## Question 4

In [20]:
import nltk
from nltk.stem import *

# download required resources
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
from whoosh.analysis import Filter
class CustomFilter(Filter):
    """Token filter that rewrites every token's text with a caller-supplied function.

    ``filterFunc`` is called as ``filterFunc(token_text, *args, **kwargs)`` and
    its return value replaces the token text.  The same transformation is
    applied to every token that passes through the filter.
    """

    # NOTE(review): presumably marks the filter as a morphological (stemming-like)
    # transformation for Whoosh -- confirm against the Whoosh Filter documentation.
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        # The previous signature had no ``other`` parameter, so any comparison
        # raised TypeError.  Filters compare equal when they are of exactly this
        # class; the wrapped function is not taken into account.
        return self.__class__ is other.__class__

    def __call__(self, tokens):
        # Generator: mutates each token in place and passes it on.
        for t in tokens:
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [22]:
# attempt 1: same pipeline as Q3, but stemming is done by NLTK's
# LancasterStemmer (wrapped in CustomFilter) instead of StemFilter
newAnalyzer = (
    RegexTokenizer()
    | LowercaseFilter()
    | IntraWordFilter()
    | StopFilter()
    | CustomFilter(LancasterStemmer().stem)
)

mySchema4 = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=newAnalyzer),
)

# build and fill a separate index for the Q4 schema
myIndex4 = createIndex(mySchema4)
addFilesToIndex(myIndex4, filesToIndex)


already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [23]:
from whoosh import scoring, qparser
INDEX_Q4 = myIndex4 # final index for Q4 (checked by the "Q4 Validation" cell)

# attempt 1 evaluation: parser and searcher on the Q4 index, no extra options
myQueryParser_a1 = QueryParser("file_content", schema=myIndex4.schema)
mySearcher4 = myIndex4.searcher()

pyTrecEval(TOPIC_FILE, QRELS_FILE, myQueryParser_a1, mySearcher4)
# recorded result: MAP all = 0.3456

num_q                    1       1.0000
num_ret                  1       3.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

In [24]:
# attempt 2: join the query terms with "OR" instead of the default "AND"
# (the group is built with OrGroup.factory(0))
myQueryParser_a2 = QueryParser("file_content", schema=INDEX_Q4.schema,group=qparser.OrGroup.factory(0))
mySearcher_a2 = myIndex4.searcher()
pyTrecEval(TOPIC_FILE, QRELS_FILE, myQueryParser_a2, mySearcher_a2)
# recorded result: MAP all = 0.3797

num_q                    1       1.0000
num_ret                  1       481.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.0618
gm_map                   1       -2.7839
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0500
iprec_at_recall_0.00     1       0.0968
iprec_at_recall_0.10     1       0.0968
iprec_at_recall_0.20     1       0.0968
iprec_at_recall_0.30     1       0.0968
iprec_at_recall_0.40     1       0.0968
iprec_at_recall_0.50     1       0.0968
iprec_at_recall_0.60     1       0.0968
iprec_at_recall_0.70     1       0.0412
iprec_at_recall_0.80     1       0.0412
iprec_at_recall_0.90     1       0.0410
iprec_at_recall_1.00     1       0.0410
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0500
P_30                     1       0.06

In [25]:
# attempt 3: keep the "OR" grouping and add a scoring function to the searcher;
# TF-IDF is tried first
myQueryParser_a3 = QueryParser("file_content", schema=INDEX_Q4.schema,group=qparser.OrGroup)
mySearcher_a3 = myIndex4.searcher(weighting=scoring.TF_IDF)
pyTrecEval(TOPIC_FILE, QRELS_FILE, myQueryParser_a3, mySearcher_a3)
# recorded result: MAP all = 0.1302
# This scoring module decreases the MAP value, so another scoring module is tried next

num_q                    1       1.0000
num_ret                  1       481.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.0491
gm_map                   1       -3.0135
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0476
iprec_at_recall_0.00     1       0.0870
iprec_at_recall_0.10     1       0.0870
iprec_at_recall_0.20     1       0.0870
iprec_at_recall_0.30     1       0.0870
iprec_at_recall_0.40     1       0.0870
iprec_at_recall_0.50     1       0.0494
iprec_at_recall_0.60     1       0.0494
iprec_at_recall_0.70     1       0.0494
iprec_at_recall_0.80     1       0.0494
iprec_at_recall_0.90     1       0.0216
iprec_at_recall_1.00     1       0.0216
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.06

In [26]:
# attempt 4: keep the "OR" grouping and score with BM25F (B=0.5, K1=1.55)
myQueryParser_a4 = QueryParser("file_content", schema=INDEX_Q4.schema,group=qparser.OrGroup)
mySearcher_a4 = myIndex4.searcher(weighting=scoring.BM25F(B=0.5, K1=1.55))
pyTrecEval(TOPIC_FILE, QRELS_FILE, myQueryParser_a4, mySearcher_a4)
# recorded result: MAP all = 0.4000

num_q                    1       1.0000
num_ret                  1       481.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.0613
gm_map                   1       -2.7915
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0526
iprec_at_recall_0.00     1       0.1034
iprec_at_recall_0.10     1       0.1034
iprec_at_recall_0.20     1       0.1034
iprec_at_recall_0.30     1       0.1034
iprec_at_recall_0.40     1       0.1034
iprec_at_recall_0.50     1       0.1034
iprec_at_recall_0.60     1       0.1034
iprec_at_recall_0.70     1       0.0421
iprec_at_recall_0.80     1       0.0421
iprec_at_recall_0.90     1       0.0370
iprec_at_recall_1.00     1       0.0370
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0500
P_30                     1       0.10

### Please answer the following questions here
(a) A clear list of all final modifications made.
<p>1. Added the NLTK's LancasterStemmer() into the analyzer. 
<p>2. Modified the query parser from default "AND" to "OR" by using group keyword argument.
<p>3. Added a scoring function to the searcher, tried tf-idf and BM25F scoring module.


(b)  Why each modification was made – how did it help? 
<p> #1 The LancasterStemmer produces even shorter stems, so more word variants are reduced to the same term and can match the query terms, which improves recall. #2 was made because with an "OR" query, documents that contain more of the query terms score higher, whereas with "AND" all the terms must be present for a document to match. #3 The scoring function ranks the documents based on the query terms appearing in each document, which can improve the relevance of the search results. 


(c)  The  final  MAP  performance  that  these  modifications  attained.
<p>The final MAP is 0.4000 after the modifications applied. 

In [0]:
# Final Q4 system: the parser and searcher from attempt 4 (OR grouping + BM25F).
QP_Q4 = myQueryParser_a4 # final query parser for Q4
SEARCHER_Q4 = mySearcher_a4 # final searcher for Q4

In [28]:
# Evaluate the final Q4 system.
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q4, SEARCHER_Q4) 

num_q                    1       1.0000
num_ret                  1       481.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.0613
gm_map                   1       -2.7915
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0526
iprec_at_recall_0.00     1       0.1034
iprec_at_recall_0.10     1       0.1034
iprec_at_recall_0.20     1       0.1034
iprec_at_recall_0.30     1       0.1034
iprec_at_recall_0.40     1       0.1034
iprec_at_recall_0.50     1       0.1034
iprec_at_recall_0.60     1       0.1034
iprec_at_recall_0.70     1       0.0421
iprec_at_recall_0.80     1       0.0421
iprec_at_recall_0.90     1       0.0370
iprec_at_recall_1.00     1       0.0370
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0500
P_30                     1       0.10

## Validation

In [0]:
# Run the following cells to make sure your code returns the correct value types

In [0]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

### Q2 Validation

In [31]:
# Type checks for the Q2 deliverables; each message names the failing item.
assert isinstance(INDEX_Q2, FileIndex), "Index Type"
assert isinstance(QP_Q2, QueryParser), "Query Parser Type"
assert isinstance(SEARCHER_Q2, Searcher), "Searcher Type"
print("Q2 Types Validated")

Q2 Types Validated


### Q3 Validation

In [32]:
# Type checks for the Q3 deliverables; each message names the failing item.
assert isinstance(INDEX_Q3, FileIndex), "Index Type"
assert isinstance(QP_Q3, QueryParser), "Query Parser Type"
assert isinstance(SEARCHER_Q3, Searcher), "Searcher Type"
print("Q3 Types Validated")

Q3 Types Validated


### Q4 Validation

In [33]:
# Type checks for the Q4 deliverables; each message names the failing item.
assert isinstance(INDEX_Q4, FileIndex), "Index Type"
assert isinstance(QP_Q4, QueryParser), "Query Parser Type"
assert isinstance(SEARCHER_Q4, Searcher), "Searcher Type"
print("Q4 Types Validated")

Q4 Types Validated
