In [34]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 150)

import pyterrier as pt
import os

import string

In [35]:
if not pt.started():
    pt.init()

In [36]:
corpusDf = pd.read_csv('query_11_20_annotation.csv')
corpusDf = corpusDf.astype({'id': str})
corpusDf['text'] = corpusDf['text'].str.replace('\n', ' ')
corpusDf = corpusDf.rename(columns={'id': 'docno', 'label': 'readability'})
corpusDf.head()

Unnamed: 0,text,readability,int-1,int-2,int-3,int-4,int-5,int-6,int-7,int-8,...,int-12,int-13,int-14,int-15,int-16,int-17,int-18,int-19,int-20,docno
0,"Hi! I've been meaning to write for ages and finally today I'm actually doing something about it. Not that I'm trying to make excuses for myself, i...",B2,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
1,﻿It was not so much how hard people found the challenge but how far they would go to avoid it that left researchers gobsmacked. The task? To sit i...,B2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
2,"Keith recently came back from a trip to Chicago, Illinois. This midwestern metropolis is found along the shore of Lake Michigan. During his visit,...",B2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,"The Griffith Observatory is a planetarium, and an exhibit hall located in Los Angeles's Griffith Park. It features several astronomical displays a...",B2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3
4,-LRB- The Hollywood Reporter -RRB- It's official: AMC's The Walking Dead companion series has a title. Executive producer Robert Kirkman announced...,B2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [37]:
queryDf = pd.read_csv('../Texts_and_queries/queries/all_queries.csv')
queryDf = queryDf.rename(columns={'QueryId': 'qid', 'Query Description': 'query'})
queryDf

Unnamed: 0,qid,query,Readability
0,1,What is father's love like?,1
1,1,What is father's love like?,0
2,1,What is father's love like?,2
3,2,Is climbing mountains happy?,1
4,3,Why do people eat apples?,1
5,4,Suggestions on where to travel.,1
6,5,How do I repair my phone?,1
7,6,Clothes to wear in winter.,1
8,7,What is a healthy lifestyle?,1
9,8,How do I learn to drive cars?,1


In [38]:
# preprocessing queries
## eliminating punctuations
queryDf['query'] = queryDf['query'].apply(lambda s: s.translate(str.maketrans('', '', string.punctuation)))
queryDf

Unnamed: 0,qid,query,Readability
0,1,What is fathers love like,1
1,1,What is fathers love like,0
2,1,What is fathers love like,2
3,2,Is climbing mountains happy,1
4,3,Why do people eat apples,1
5,4,Suggestions on where to travel,1
6,5,How do I repair my phone,1
7,6,Clothes to wear in winter,1
8,7,What is a healthy lifestyle,1
9,8,How do I learn to drive cars,1


In [39]:
topics = queryDf.drop(columns=['Readability'])
topics = topics.astype({'qid': str})
# choose training data
train_topics = topics[15:]
train_topics = train_topics.reset_index()
train_topics = train_topics.drop(columns=['index'])
train_topics

Unnamed: 0,qid,query
0,14,What kind of advice would you give to a freshman at your college
1,15,How did the elderly in your family adjust to the age of the Internet
2,16,Any tips for travelling abroad
3,17,Share a scientific explanation for a common natural phenomenon
4,18,How do international students view America
5,19,Has any of your family members experienced the Second World War What was it like
6,20,What kind of advice would you give to a new employee at your company


In [40]:
# choose test data
test_topics = topics[12:14]
test_topics

Unnamed: 0,qid,query
12,11,Whats your favorite cooking recipe
13,12,How did you come up with names for your pets


In [66]:
qid = str(topics['qid'][0])
labelColName = 'int-' + qid
qrels = pd.DataFrame(data={'qid': qid, 'docno': corpusDf['docno'], 'label': corpusDf[labelColName]})

for qid in topics['qid'][1:]:
    qid = str(qid)
    # qid = '12'
    # 'int-12' column in corpusDf means the 12th query
    labelColName = 'int-' + qid
    tempDf = pd.DataFrame(data={'qid': qid, 'docno': corpusDf['docno'], 'label': corpusDf[labelColName]})
    qrels = qrels.append(tempDf, ignore_index=True)
qrels = qrels.astype({'label': int})
# qrels['iteration'] = '2.5'
qrels

Unnamed: 0,qid,docno,label
0,1,0,1
1,1,1,0
2,1,2,0
3,1,3,0
4,1,4,0
...,...,...,...
32863,20,1489,0
32864,20,1490,0
32865,20,1491,0
32866,20,1492,0


In [67]:
index_dir = './first_index'
indexer = pt.DFIndexer(index_dir, blocks=True, overwrite=True)
index_ref = indexer.index(corpusDf["text"], corpusDf["text"], corpusDf["docno"], corpusDf['readability'])
index_ref.toString()

'./first_index/data.properties'

In [68]:
index = pt.IndexFactory.of(index_ref)

In [69]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [70]:
bm25.search("What is your favorite cooking recipe")[:10]

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,32,32,0,11.718433,What is your favorite cooking recipe
1,1,17,17,1,10.66591,What is your favorite cooking recipe
2,1,30,30,2,10.5086,What is your favorite cooking recipe
3,1,1475,1475,3,10.484189,What is your favorite cooking recipe
4,1,1061,1061,4,9.975495,What is your favorite cooking recipe
5,1,651,651,5,9.770045,What is your favorite cooking recipe
6,1,820,820,6,9.616181,What is your favorite cooking recipe
7,1,1054,1054,7,9.46745,What is your favorite cooking recipe
8,1,1127,1127,8,9.031279,What is your favorite cooking recipe
9,1,643,643,9,8.219655,What is your favorite cooking recipe


In [71]:
pt.Experiment(
    [bm25],
    train_topics,
    qrels,
    names=['bm25'],
    eval_metrics=["map", "ndcg", "ndcg_cut_5", "ndcg_cut_10"])

Unnamed: 0,name,map,ndcg,ndcg_cut_5,ndcg_cut_10
0,bm25,0.884952,0.806663,0.360986,0.446218


In [72]:
sdm = pt.rewrite.SDM()
qe = pt.rewrite.Bo1QueryExpansion(index)

In [73]:
# solve the metadata confusion, could use readability level as a feature to train
pt.text.get_text(index, ["readability"])

<pyterrier.transformer.ApplyGenericTransformer at 0x7f71840d2bb0>

In [74]:
RANK_CUTOFF = 100
SEED=42

int_feats = (bm25 % RANK_CUTOFF) >> pt.text.get_text(index, ["readability"]) >> (
    pt.transformer.IdentityTransformer()
    ** # sequential dependence
    (sdm >> bm25)
    ** # match the intermediate readability
    (pt.apply.doc_score(lambda row: int("B" in row["readability"])))
)

# for reference, lets record the feature names here too
fnames=["BM25", "SDM", "Readability"]

In [78]:
import fastrank

train_request = fastrank.TrainRequest.coordinate_ascent()

params = train_request.params
params.init_random = True
params.normalize = True
params.seed = 1234567

ca_pipe = int_feats >> pt.ltr.apply_learned_model(train_request, form='fastrank')

%time ca_pipe.fit(train_topics, qrels)

AssertionError: 

In [75]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=400, verbose=1, random_state=SEED, n_jobs=2)

rf_pipe = int_feats >> pt.ltr.apply_learned_model(rf)

%time rf_pipe.fit(train_topics, qrels)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s


CPU times: user 1.83 s, sys: 91.8 ms, total: 1.93 s
Wall time: 1.59 s


[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.5s finished


In [76]:
pipeline_task3 = bm25 % RANK_CUTOFF
pt.Experiment(
    [rf_pipe, pipeline_task3],
    topics,
    qrels, 
    names=['rf', 'bm25'],
    eval_metrics=["map", "ndcg", "ndcg_cut_10", "mrt"])

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    2.3s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    9.5s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:   19.4s finished


Unnamed: 0,name,map,ndcg,ndcg_cut_10,mrt
0,rf,0.714742,0.811182,0.876008,1566.221332
1,bm25,0.735113,0.771651,0.773166,83.86909


In [77]:
example_results = rf_pipe.search(topics['query'][13])
example_results

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.0s finished


Unnamed: 0,qid,docid,docno,score,query,readability,features,rank
2,1,548,548,2.585000,How did you come up with names for your pets,A2,"[15.432539997331556, 15.585670577626193, 0.0]",0
24,1,995,995,2.315333,How did you come up with names for your pets,B1,"[5.113012050337778, 5.113012050337778, 1.0]",1
25,1,362,362,1.725000,How did you come up with names for your pets,A2,"[5.097060048155976, 5.097060048155976, 0.0]",2
9,1,82,82,1.662250,How did you come up with names for your pets,B2,"[10.351873941780271, 10.351873941780271, 1.0]",3
1,1,723,723,1.640000,How did you come up with names for your pets,C1,"[16.189571317158613, 16.27085192611112, 0.0]",4
...,...,...,...,...,...,...,...,...
93,1,203,203,0.087500,How did you come up with names for your pets,B2,"[3.588649738056071, 3.588649738056071, 1.0]",95
86,1,39,39,0.072500,How did you come up with names for your pets,B2,"[3.605642268958931, 3.605642268958931, 1.0]",96
64,1,378,378,0.057500,How did you come up with names for your pets,A2,"[4.046394889323499, 4.046394889323499, 0.0]",97
65,1,359,359,0.057500,How did you come up with names for your pets,A2,"[4.0376453871320415, 4.0376453871320415, 0.0]",98


In [60]:
# first compute the portion of matched readability level
# use qid, docid, score to match expected readability level (define a weight function) to adjust rank
# then recompute ndcg, map, the portion of matched readability level, etc. by hand to compare