# Interesting features
## Query-based features
- Query Performance Prediction
- Query Intent Prediction

- Comparative Query Classification
  - Is there a comparative information need -> comparison intent

- Entity Linking / Query Interpretation
  - Entity count
  - Entity score statistics: mean / variance / min / max

## Document-based features
- Genre Classification
- Health Classification
- Readability/Quality/Coherence Features

## Document-query features
- Splade
- monoT5
- BM25

## Expansion -> re-retrieve
- DocT5Query
- LLM Query Expansion
- Entity Linking: BM25 score when the top-linked entity is used as the retrieval query (0 if the query has no linked entities)

# Data
- `longeval-train-20230513-training`
- `longeval-heldout-20230513-training`
- `longeval-short-july-20230513-training`
- `longeval-long-september-20230513-training`

In [1]:
import pyterrier as pt
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

# Initialise PyTerrier through the TIRA helper; the cell output reports the
# PyTerrier / Terrier versions that were loaded.
ensure_pyterrier_is_loaded()
# TIRA client; the cells below use `tira.pt.*` to obtain transformers for TIRA submissions.
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
def empty_docno(value=1):
    """Create a transformer that guarantees a ``docno`` column is present.

    Frames that already carry a ``docno`` column are passed through untouched
    (the very same object is returned). Otherwise a copy of the frame is
    returned in which every row gets ``value`` as its ``docno``.

    Args:
        value: Placeholder written into the new ``docno`` column.

    Returns:
        Whatever ``pt.apply.generic`` returns for the frame-level function.
    """
    def add_placeholder(frame):
        if 'docno' not in frame.columns:
            # Work on a copy so the caller's frame is never modified.
            augmented = frame.copy()
            augmented['docno'] = value
            return augmented

        return frame

    return pt.apply.generic(add_placeholder)


def remove_docno():
    """Create a transformer that drops the ``docno`` column from a frame.

    The wrapped function returns a new frame without ``docno``; pandas raises
    ``KeyError`` if the incoming frame has no such column.
    """
    def drop_docno(frame):
        return frame.drop(columns=['docno'])

    return pt.apply.generic(drop_docno)


def columns_to_features(columns, cols_to_keep=('docno', 'query', 'qid')):
    """Create a transformer that packs selected columns into a ``features`` column.

    For every row, the values of ``columns`` are collected into one 1-D NumPy
    array that is stored in a new ``features`` column. The resulting frame
    contains only the columns of ``cols_to_keep`` that exist in the input
    (in the order given by ``cols_to_keep``) plus ``features``.

    Args:
        columns: A single column label or a list of column labels to pack.
            A single label yields feature vectors of length one.
        cols_to_keep: Labels of identifying columns to carry over when present.

    Returns:
        Whatever ``pt.apply.generic`` returns for the frame-level function.

    Raises:
        ValueError: When the transformer is applied to a frame that already
            has a ``features`` column.
    """
    def __transform(df):
        if 'features' in df.columns:
            raise ValueError("Cannot merge features into a dataframe that already has a 'features' column")

        res = df[[col for col in cols_to_keep if col in df.columns]].copy()

        features = df[columns].values
        # A single label selects a Series (1-D values); promote it to one
        # column so each row becomes a length-1 vector. Unlike
        # ``reshape((n, -1))`` this also works for frames with zero rows,
        # where NumPy cannot infer the ``-1`` dimension and raises.
        if features.ndim == 1:
            features = features[:, None]
        res['features'] = list(features)

        return res

    return pt.apply.generic(__transform)  #  >> empty_docno()

In [3]:
# Training split of the LongEval collection, addressed through PyTerrier's "irds:" dataset prefix.
dataset = pt.get_dataset("irds:ir-benchmarks/longeval-train-20230513-training")
# Queries of this split; the output below shows one row per query with `qid` and `query`.
topics = dataset.get_topics(variant='text')

topics

Unnamed: 0,qid,query
0,q06223196,car shelter
1,q062228,airport
2,q062287,antivirus comparison
3,q06223261,free antivirus
4,q062291,orange antivirus
...,...,...
667,q062224914,tax garden shed
668,q062224961,land of france
669,q062225030,find my training pole job
670,q062225194,gpl car


# Query-based features

In [4]:
# Queries of the training split (same call as in the dataset cell above).
topics = dataset.get_topics(variant='text')

# Query performance predictors obtained via TIRA, packed into one feature vector per query.
qpp = (
    tira.pt.transform_queries('ir-benchmarks/qpptk/all-predictors', dataset)
    >> columns_to_features([
        'max-idf', 'avg-idf',
        'scq', 'max-scq', 'avg-scq',
        'var', 'max-var', 'avg-var',
        'wig+10', 'nqc+100', 'smv+100',
    ])
)

# Query intent prediction obtained via TIRA, exposed as a single-entry feature vector.
intent_prediction = (
    tira.pt.transform_queries('ir-benchmarks/dossier/pre-retrieval-query-intent', dataset)
    >> columns_to_features('intent_prediction')
)

# Alternative kept for reference:
# query_features = empty_docno() >> (intent_prediction ** qpp) >> remove_docno()
query_features = intent_prediction ** qpp

# query_features(topics)

# Document-based features

In [5]:
# Per-document classifier outputs obtained via TIRA for this dataset.
document_health_classification = tira.pt.transform_documents("ir-benchmarks/fschlatt/document-health-classification", dataset)
genre_mlp_classifier = tira.pt.transform_documents('ir-benchmarks/tu-dresden-01/genre-mlp', dataset)

# NOTE(review): unlike the query features, these transformers are not passed through
# `columns_to_features`. In the output of the `(bm25 % 100) >> (query_features ** document_features)`
# cell the genre probabilities show up as plain columns and the visible part of `features`
# only holds query-level values — confirm whether the document outputs should be packed too.
document_features = document_health_classification ** genre_mlp_classifier

# Document-query features

In [6]:
# Runs of two TIRA submissions on this dataset; their outputs (shown in the cells below)
# contain `qid`, `docno`, `rank` and `score` columns.
bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)
monot5 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/MonoT5 Base (tira-ir-starter-gygaggle)', dataset)

# Feature union of both runs, intended to provide their scores as document-query features.
doc_query_features = bm25 ** monot5

In [10]:
# Sanity check: show the MonoT5 run for the first two topics.
monot5(topics.head(2))

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id
0,q06223196,car shelter,0,1,-0.003520,castorini/monot5-base-msmarco-10k,doc062201708464,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
1,q06223196,car shelter,0,2,-0.005353,castorini/monot5-base-msmarco-10k,doc062200108613,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
2,q06223196,car shelter,0,3,-0.006328,castorini/monot5-base-msmarco-10k,doc062200206319,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
3,q06223196,car shelter,0,4,-0.006333,castorini/monot5-base-msmarco-10k,doc062200112743,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
4,q06223196,car shelter,0,5,-0.006599,castorini/monot5-base-msmarco-10k,doc062201708471,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
...,...,...,...,...,...,...,...,...,...,...
195,q062228,airport,0,96,-5.237506,castorini/monot5-base-msmarco-10k,doc062204502359,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
196,q062228,airport,0,97,-6.071942,castorini/monot5-base-msmarco-10k,doc062208002863,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
197,q062228,airport,0,98,-6.174983,castorini/monot5-base-msmarco-10k,doc062201204407,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
198,q062228,airport,0,99,-7.354122,castorini/monot5-base-msmarco-10k,doc062208104118,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01


In [11]:
# Sanity check: show the BM25 run for the first two topics.
bm25(topics.head(2))

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id
0,q06223196,car shelter,Q0,1,14.954230,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708464,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04
1,q06223196,car shelter,Q0,2,14.940287,pyterrier.default_pipelines.wmodel_text_scorer,doc062200206319,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04
2,q06223196,car shelter,Q0,3,14.822774,pyterrier.default_pipelines.wmodel_text_scorer,doc062200108613,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04
3,q06223196,car shelter,Q0,4,14.809379,pyterrier.default_pipelines.wmodel_text_scorer,doc062200115614,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04
4,q06223196,car shelter,Q0,5,14.772246,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708471,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04
...,...,...,...,...,...,...,...,...,...,...
195,q062228,airport,Q0,96,8.728996,pyterrier.default_pipelines.wmodel_text_scorer,doc062202004154,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04
196,q062228,airport,Q0,97,8.728772,pyterrier.default_pipelines.wmodel_text_scorer,doc062202106805,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04
197,q062228,airport,Q0,98,8.727324,pyterrier.default_pipelines.wmodel_text_scorer,doc062208908281,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04
198,q062228,airport,Q0,99,7.647557,pyterrier.default_pipelines.wmodel_text_scorer,doc062201707138,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04


In [12]:
# Smoke test on two topics: apply a rank cutoff of 100 to the BM25 run, then attach the
# query-level and document-level features.
# NOTE(review): the `features` column in the output starts with the string label
# 'Abstain' (the intent prediction) next to numeric values; a learner will presumably
# need a numeric encoding of that label — confirm before training.
((bm25 % 100) >> (query_features ** document_features))(topics.head(2))

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id,...,predicted_label,probability_Discussion,probability_Shop,probability_Download,probability_Articles,probability_Help,probability_Linklists,probability_Porttrait private,probability_Protrait non private,features
0,q06223196,car shelter,Q0,1,14.954230,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708464,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.065149,0.332923,0.018598,0.035639,0.083025,0.075057,0.104556,0.285054,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
1,q06223196,car shelter,Q0,2,14.940287,pyterrier.default_pipelines.wmodel_text_scorer,doc062200206319,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.069072,0.609511,0.034462,0.031660,0.032858,0.115177,0.023488,0.083772,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
2,q06223196,car shelter,Q0,3,14.822774,pyterrier.default_pipelines.wmodel_text_scorer,doc062200108613,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.027731,0.360929,0.013323,0.022076,0.034057,0.040168,0.043006,0.458711,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
3,q06223196,car shelter,Q0,4,14.809379,pyterrier.default_pipelines.wmodel_text_scorer,doc062200115614,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.057209,0.364561,0.022853,0.059166,0.213303,0.045845,0.023758,0.213304,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
4,q06223196,car shelter,Q0,5,14.772246,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708471,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.031917,0.696000,0.019099,0.013245,0.027034,0.046168,0.030590,0.135949,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,q062228,airport,Q0,95,8.729615,pyterrier.default_pipelines.wmodel_text_scorer,doc062209203341,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Discussion,0.375695,0.034172,0.003748,0.046004,0.357152,0.041571,0.050274,0.091385,"[Abstain, 3.4870977938, 3.4870977938, 44.64196..."
194,q062228,airport,Q0,96,8.728996,pyterrier.default_pipelines.wmodel_text_scorer,doc062202004154,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.065680,0.087092,0.011471,0.038211,0.356411,0.024222,0.053257,0.363657,"[Abstain, 3.4870977938, 3.4870977938, 44.64196..."
195,q062228,airport,Q0,97,8.728772,pyterrier.default_pipelines.wmodel_text_scorer,doc062202106805,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.024931,0.040075,0.005162,0.010808,0.062437,0.024075,0.037758,0.794753,"[Abstain, 3.4870977938, 3.4870977938, 44.64196..."
196,q062228,airport,Q0,98,8.727324,pyterrier.default_pipelines.wmodel_text_scorer,doc062208908281,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Linklists,0.040335,0.106685,0.015584,0.013474,0.160971,0.508761,0.025781,0.128410,"[Abstain, 3.4870977938, 3.4870977938, 44.64196..."


In [13]:
# Smoke test of the document-query features on two topics.
# NOTE(review): the recorded run emitted "Got number of results different expected"
# warnings and was stopped with a KeyboardInterrupt, so this cell has no verified output.
((bm25 % 100) >> doc_query_features)(topics.head(2))

  warn("Got number of results different expected from %s, expected %d received %d, feature scores for any "
  warn("Got number of results different expected from %s, expected %d received %d, feature scores for any "


KeyboardInterrupt: 

In [11]:
# Complete feature pipeline: BM25 candidates (rank cutoff 1000) with query, document-query
# and document features attached.
# NOTE(review): this cell and the next carry execution counts 11 and 12, which are already
# used by earlier cells — the notebook was run out of order; verify with Restart & Run All.
full_pipeline = (bm25 % 1000) >> (query_features ** doc_query_features ** document_features)

In [12]:
# Run the complete pipeline on the first ten topics.
# NOTE(review): the recorded output contains only result-count warnings and no result table.
full_pipeline(topics.head(10))

  warn("Got number of results different expected from %s, expected %d received %d, feature scores for any "
  warn("Got number of results different expected from %s, expected %d received %d, feature scores for any "
