In [1]:
# import needed libraries
import pandas as pd
import helper.Utils as Utils
import pyterrier as pt
import configure  as cf


In [2]:

if not pt.started():
    # Boot Terrier with the terrier-prf package on its classpath; this notebook
    # needs PyTerrier started with PRF (pseudo-relevance feedback) enabled.
    prf_package = "com.github.terrierteam:terrier-prf:-SNAPSHOT"
    print("Enabling PRF in pyterier")
    pt.init(boot_packages=[prf_package])

Enabling PRF in pyterier
PyTerrier 0.6.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
# Evaluation measures, shared names from `configure`, and input/output locations
from pyterrier.measures import RR, Success, R, Rprec, P, MAP

# Measures available for reporting: MAP (full and cut at 5), P@1, reciprocal rank,
# R-precision, recall at several cutoffs, and Success@5.
eval_metrics = [
    "map", MAP@5, P@1, RR, Rprec,
    R@5, R@10, R@20, R@50, R@100, R@200,
    Success@5,
]

# Names re-exported from the `configure` module (several are used below as
# DataFrame column keys).
RANK, SCORE = cf.RANK, cf.SCORE
TWEET_ID_COLUMN, TWEET_TEXT_COLUMN = cf.TWEET_ID_COLUMN, cf.TWEET_TEXT_COLUMN
VCLAIM_ID, VCLAIM = cf.VCLAIM_ID, cf.VCLAIM
TITLE, LABEL = cf.TITLE, cf.LABEL

# Input files: dev queries (raw and with URL information), qrels, verified claims
dev_query_path = cf.ENG_CLEF_2020_DEV_QUERIES
dev_query_with_url_info_path = cf.ENG_CLEF_2020_URL_CLEANED_DEV_QUERIES
qrels_file = cf.ENG_CLEF_2020_QRELS
claims_file = cf.ENG_CLEF_2020_VCLAIMS

# Output spreadsheet that receives the evaluation table
evaluation_path = "./data/en2020-dev-sig-test-lexical-retrievals.xlsx"

## Report results on dev set before and after applying preprocessing (expansion)
    - Documents are the verified claims as provided by the organizers.
    - Queries are the tweets after replacing each embedded URL with its corresponding information (a picture/video URL with its best guess, a webpage URL with its page title).
    

In [4]:
# 1. Load the documents (verified claims): ids are cast to strings and the
#    claim text is passed through Utils.remove_punctuation.
df_doc = Utils.read_file(claims_file).astype({VCLAIM_ID: str})
df_doc[VCLAIM] = df_doc[VCLAIM].apply(Utils.remove_punctuation)


# 2. Load the qrels (tab-separated, no header row); query and document ids are
#    cast to strings.
df_qrels = pd.read_csv(
    qrels_file,
    sep="\t",
    names=["qid", "Q0", "docno", LABEL],
).astype({"qid": str, "docno": str})



In [5]:
def get_document(df):
    """Yield one indexable dict per row of ``df``.

    Each dict has the keys ``docno``, ``text`` and ``title``, filled from the
    ``VCLAIM_ID``, ``VCLAIM`` and ``TITLE`` columns of the row respectively.
    """
    field_to_column = (("docno", VCLAIM_ID), ("text", VCLAIM), ("title", TITLE))
    for _, record in df.iterrows():
        yield {field: record[column] for field, column in field_to_column}

# 3.1 Build a multi-field index (fields: text and title) over the verified claims.
#     overwrite=True is passed, so the index at this path is rebuilt on each run.
iter_indexer = pt.IterDictIndexer(
    "./indexes/en-clef-2020-index-multi-field-without-preprocessing",
    overwrite=True,
    verbose=True,
)
iter_indexer.setProperty("tokeniser", "EnglishTokeniser")
indexref2 = iter_indexer.index(
    get_document(df_doc),
    fields=["text", "title"],
    meta=['docno'],
)
# Open the freshly built index so the retrievers below can use it
multi_field_index = pt.IndexFactory.of(indexref2)


In [6]:
# 4. Load the queries and strip punctuation that confuses the retrieval model
def _load_queries(path, text_column):
    """Read a query file and return a frame with only ``qid`` and ``query``.

    ``query`` is the ``text_column`` content after Utils.remove_punctuation;
    ``qid`` is the tweet id cast to a string.
    """
    frame = Utils.read_file(path)
    frame["query"] = frame[text_column]
    frame["query"] = frame["query"].apply(Utils.remove_punctuation)
    frame["qid"] = frame[TWEET_ID_COLUMN].astype(str)
    return frame[["qid", "query"]]


# 4.1 Raw queries: the tweet text without any URL preprocessing
df_dev = _load_queries(dev_query_path, TWEET_TEXT_COLUMN)

# 4.2 Queries whose URLs were replaced with their corresponding information
df_dev_with_url_info = _load_queries(dev_query_with_url_info_path, "cleaned")



## Calculate significance tests for models before and after preprocessing (extracting URL info)

In [7]:
def compute_sig_test(df_query, first_res, second_res, first_method_name, second_method_name,
                     metrics=None, qrels=None):
    """Evaluate two systems on the same queries, testing the second against the first.

    Parameters
    ----------
    df_query : topics passed to ``pt.Experiment``.
    first_res, second_res : the two systems to compare, handed to
        ``pt.Experiment`` unchanged (this notebook passes result frames).
        ``first_res`` is the baseline.
    first_method_name, second_method_name : row labels for the two systems.
    metrics : optional list of evaluation measures; defaults to
        ``["map", MAP@5, P@1, RR, R@100]``.
    qrels : optional relevance judgements; defaults to the notebook-level
        ``df_qrels``.

    Returns
    -------
    Whatever ``pt.Experiment`` returns: one row per system, and because
    ``baseline=0`` is set, per-measure ``+`` / ``-`` / ``p-value`` columns
    for the second system relative to the first.
    """
    if metrics is None:
        metrics = ["map", MAP@5, P@1, RR, R@100]
    if qrels is None:
        # Resolved at call time so a re-loaded df_qrels is picked up
        qrels = df_qrels
    return pt.Experiment(
        [first_res, second_res],
        df_query,
        qrels=qrels,
        eval_metrics=metrics,
        names=[first_method_name, second_method_name],
        baseline=0,
    )


# Number of results each retriever returns per query
DEPTH = 100

# Lexical retrievers over the multi-field index
bm25_retr = pt.BatchRetrieve(
    multi_field_index,
    controls={"wmodel": "BM25"},
    num_results=DEPTH,
)
# BM25F is built here but is not included in `retrievers` below
BM25F = pt.BatchRetrieve(multi_field_index, wmodel="BM25F", num_results=DEPTH)
DPH = pt.BatchRetrieve(multi_field_index, wmodel="DPH", num_results=DEPTH)
# Hiemstra language model with control c=0.05
JM = pt.BatchRetrieve(
    multi_field_index,
    wmodel="Hiemstra_LM",
    controls={"c": 0.05},
    num_results=DEPTH,
)

# Query expansion pipeline: BM25 -> RM3 rewrite (fb_terms=10, fb_docs=1) -> BM25 again
rm3_rewriter = pt.rewrite.RM3(multi_field_index, fb_terms=10, fb_docs=1)
rm3_pipe = bm25_retr >> rm3_rewriter >> bm25_retr

# Systems that are evaluated, keyed by the label used in the results table
retrievers_by_name = {"BM25": bm25_retr, "DPH": DPH, "JM": JM, "RM3": rm3_pipe}
names = list(retrievers_by_name)
retrievers = list(retrievers_by_name.values())


# For every retriever, compare its run on the raw queries (baseline) with its
# run on the URL-expanded queries, collecting one evaluation frame per retriever.
eval_frames = []
for retr_name, retr in zip(names, retrievers):
    res = retr.transform(df_dev)
    res_after_preprocessing = retr.transform(df_dev_with_url_info)
    df_retr_eval = compute_sig_test(
        df_dev, res, res_after_preprocessing, retr_name, retr_name + "+PreP"
    )
    eval_frames.append(df_retr_eval)

# DataFrame.append was removed in pandas 2.0, so the frames are concatenated once.
# ignore_index=False keeps each frame's own 0/1 index, as before.
# pd.concat raises on an empty list, hence the explicit empty-frame fallback.
df_eval = pd.concat(eval_frames, ignore_index=False) if eval_frames else pd.DataFrame()

df_eval.to_excel(evaluation_path, index=False)
df_eval

Unnamed: 0,name,map,RR,P@1,R@100,AP@5,map +,map -,map p-value,RR +,...,RR p-value,P@1 +,P@1 -,P@1 p-value,R@100 +,R@100 -,R@100 p-value,AP@5 +,AP@5 -,AP@5 p-value
0,BM25,0.716285,0.717131,0.593909,0.949239,0.709814,,,,,...,,,,,,,,,,
1,BM25+PreP,0.738649,0.739495,0.609137,0.954315,0.732657,14.0,9.0,0.066712,14.0,...,0.066712,6.0,3.0,0.318543,2.0,1.0,0.565026,13.0,5.0,0.075912
0,DPH,0.69125,0.692096,0.563452,0.939086,0.685195,,,,,...,,,,,,,,,,
1,DPH+PreP,0.726616,0.727462,0.598985,0.954315,0.72132,16.0,5.0,0.006605,16.0,...,0.006605,9.0,2.0,0.034467,3.0,0.0,0.08326,15.0,3.0,0.007791
0,JM,0.693881,0.694727,0.558376,0.944162,0.687394,,,,,...,,,,,,,,,,
1,JM+PreP,0.71804,0.718886,0.57868,0.944162,0.712352,14.0,9.0,0.061875,14.0,...,0.061875,7.0,3.0,0.206712,1.0,1.0,1.0,13.0,5.0,0.066004
0,RM3,0.69608,0.696926,0.593909,0.898477,0.692386,,,,,...,,,,,,,,,,
1,RM3+PreP,0.717729,0.718575,0.609137,0.913706,0.713367,22.0,6.0,0.122399,22.0,...,0.122399,6.0,3.0,0.318543,4.0,1.0,0.180377,13.0,4.0,0.141223
