# Interesting features
## Query-based features
- Query Performance Prediction
- Query Intent Prediction

- Comparative Query Classification
  - Does the query express a comparative information need (i.e., a comparison intent)?

- Entity Linking / Query Interpretation
  - Entity count
  - Average entity score / variance of entity score / min / max

## Document-based features
- Genre Classification
- Health Classification
- Readability/Quality/Coherence Features

## Document-query features
- Splade
- monoT5
- BM25

## Expansion -> re-retrieve
- DocT5Query
- LLM Query Expansion
- Entity Linking: BM25 score obtained when the top-ranked linked entity is used as the query for retrieval (0 if the query has no linked entities)

# Data
- `longeval-train-20230513-training`
- `longeval-heldout-20230513-training`
- `longeval-short-july-20230513-training`
- `longeval-long-september-20230513-training`

In [1]:
# re-install tira from github, for faster prototyping
!pip3 uninstall -y tira
!pip3 install git+https://github.com/tira-io/tira.git@development#\&subdirectory=python-client

Found existing installation: tira 0.0.119
Uninstalling tira-0.0.119:
  Successfully uninstalled tira-0.0.119
[0mCollecting git+https://github.com/tira-io/tira.git@development#&subdirectory=python-client
  Cloning https://github.com/tira-io/tira.git (to revision development) to /tmp/pip-req-build-k755b7fv
  Running command git clone --filter=blob:none --quiet https://github.com/tira-io/tira.git /tmp/pip-req-build-k755b7fv
  Running command git checkout -b development --track origin/development
  Switched to a new branch 'development'
  Branch 'development' set up to track remote branch 'development' from 'origin'.
  Resolved https://github.com/tira-io/tira.git to commit 871badf00e3dba3f4fd7400d684807aa55551d3a
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: tira
  Building wheel for tira (pyproject.toml) ... [?25ldone
[?25h  Crea

In [3]:
import pyterrier as pt
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

# Presumably initializes PyTerrier so that the `pt.*` calls in later cells work
# (inferred from the helper's name) — TODO confirm against the tira documentation.
ensure_pyterrier_is_loaded()
# TIRA REST API client. Later cells use its `tira.pt` accessor to load query features,
# document features, and submitted runs.
# NOTE: the variable shares its name with the `tira` package; a later `import tira`
# would rebind it.
tira = Client()

In [4]:
# LongEval training dataset, resolved through PyTerrier's "irds:" dataset prefix.
dataset = pt.get_dataset("irds:ir-benchmarks/longeval-train-20230513-training")
# Topics in the 'text' variant; the displayed frame has the columns `qid` and `query`.
topics = dataset.get_topics(variant='text')

# Last expression of the cell: rendered as the cell output.
topics

Unnamed: 0,qid,query
0,q06223196,car shelter
1,q062228,airport
2,q062287,antivirus comparison
3,q06223261,free antivirus
4,q062291,orange antivirus
...,...,...
667,q062224914,tax garden shed
668,q062224961,land of france
669,q062225030,find my training pole job
670,q062225194,gpl car


# Query-based features

In [8]:
# Per-query features for `dataset`, each provided by a TIRA approach (identified by the
# first argument). `topics` is intentionally not re-fetched here: it is already loaded
# right after `dataset` is created, and nothing in this cell uses it.

# Query performance predictors.
qpp = tira.pt.query_features('ir-benchmarks/qpptk/all-predictors', dataset)
# Pre-retrieval query intent prediction.
intent_prediction = tira.pt.query_features('ir-benchmarks/dossier/pre-retrieval-query-intent', dataset)
# Health classification of the query.
query_health_classification = tira.pt.query_features("ir-benchmarks/fschlatt/query-health-classification", dataset)

# `**` is PyTerrier's feature-union operator: the outputs of the combined components
# end up side by side in the `features` column of the result.
query_features = intent_prediction ** qpp ** query_health_classification

# Document-based features

In [9]:
# Per-document features for `dataset`, each provided by a TIRA approach (identified by
# the first argument): health classification, genre classification (MLP), and
# spaCy-based document features.
document_health_classification = tira.pt.doc_features("ir-benchmarks/fschlatt/document-health-classification", dataset)
genre_mlp_classifier = tira.pt.doc_features('ir-benchmarks/tu-dresden-01/genre-mlp', dataset)
spacy_features = tira.pt.doc_features('ir-benchmarks/tu-dresden-04/spacy-document-features', dataset)

# Combine the three feature sources with PyTerrier's feature-union operator `**`.
document_features = document_health_classification ** genre_mlp_classifier ** spacy_features

# Document-query features

In [10]:
# Query-document scores taken from two TIRA submissions on `dataset`, looked up by
# their submission names.
bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)
monot5 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/MonoT5 Base (tira-ir-starter-gygaggle)', dataset)

# Combined with the feature-union operator `**`. Note that `bm25` is additionally used
# as the first-stage ranker when `full_pipeline` is assembled.
doc_query_features = bm25 ** monot5

In [15]:
# First stage: the BM25 run, cut off at rank 1000 per query (`%` is PyTerrier's
# rank-cutoff operator).
first_stage = bm25 % 1000
# Feature stage: query, document-query, and document features joined by feature union.
feature_stage = query_features ** doc_query_features ** document_features
# `>>` feeds the first-stage results into the feature stage.
full_pipeline = first_stage >> feature_stage

In [19]:
# Run the complete pipeline for all topics. The displayed result has one row per
# (qid, docno) pair; the `features` column holds the combined feature vector.
# NOTE(review): the first feature is displayed as `Abstain`, which looks like a
# categorical label rather than a number — it would presumably need encoding before
# the vector is used numerically; confirm.
df = full_pipeline(topics)
# Last expression of the cell: rendered (truncated by pandas) as the cell output.
df

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id,features
0,q06223196,car shelter,Q0,1,14.954230,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708464,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
1,q06223196,car shelter,Q0,2,14.940287,pyterrier.default_pipelines.wmodel_text_scorer,doc062200206319,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
2,q06223196,car shelter,Q0,3,14.822774,pyterrier.default_pipelines.wmodel_text_scorer,doc062200108613,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
3,q06223196,car shelter,Q0,4,14.809379,pyterrier.default_pipelines.wmodel_text_scorer,doc062200115614,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
4,q06223196,car shelter,Q0,5,14.772246,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708471,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
...,...,...,...,...,...,...,...,...,...,...,...
66562,q062225197,cheapest car,Q0,96,11.412518,pyterrier.default_pipelines.wmodel_text_scorer,doc062202000120,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[Abstain, 4.1715416992, 3.2128049688, 82.39457..."
66563,q062225197,cheapest car,Q0,97,11.385375,pyterrier.default_pipelines.wmodel_text_scorer,doc062200206552,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[Abstain, 4.1715416992, 3.2128049688, 82.39457..."
66564,q062225197,cheapest car,Q0,98,11.364426,pyterrier.default_pipelines.wmodel_text_scorer,doc062200101247,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[Abstain, 4.1715416992, 3.2128049688, 82.39457..."
66565,q062225197,cheapest car,Q0,99,11.360744,pyterrier.default_pipelines.wmodel_text_scorer,doc062200110399,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[Abstain, 4.1715416992, 3.2128049688, 82.39457..."


In [20]:
# Number of entries in the feature vector of the first result row.
len(df['features'].iloc[0])

121