# Interesting features
## Query-based features
- Query Performance Prediction
- Query Intent Prediction

- Comparative Query Classification
  - Is there a comparative information need -> comparison intent

- Entity Linking / Query Interpretation
  - Entity count
  - Average entity score / variance of entity score / min / max

## Document-based features
- Genre Classification
- Health Classification
- Readability/Quality/Coherence Features

## Document-query features
- Splade
- monoT5
- BM25

## Expansion -> re-retrieve
- DocT5Query
- LLM Query Expansion
- Entity Linking: BM25 if we use the top entity for retrieval / 0 if no entities

# Data
- `longeval-train-20230513-training`
- `longeval-heldout-20230513-training`
- `longeval-short-july-20230513-training`
- `longeval-long-september-20230513-training`

In [4]:
# re-install tira from github, for faster prototyping
!pip3 uninstall -y tira
!pip3 install git+https://github.com/tira-io/tira.git@development#\&subdirectory=python-client

Found existing installation: tira 0.0.119
Uninstalling tira-0.0.119:
  Successfully uninstalled tira-0.0.119
[0mCollecting git+https://github.com/tira-io/tira.git@development#&subdirectory=python-client
  Cloning https://github.com/tira-io/tira.git (to revision development) to /tmp/pip-req-build-_0mq8bh7
  Running command git clone --filter=blob:none --quiet https://github.com/tira-io/tira.git /tmp/pip-req-build-_0mq8bh7
  Running command git checkout -b development --track origin/development
  Switched to a new branch 'development'
  Branch 'development' set up to track remote branch 'development' from 'origin'.
  Resolved https://github.com/tira-io/tira.git to commit e87fc16bd4fbd412f116e6da121cba3a0658162b
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: tira
  Building wheel for tira (pyproject.toml) ... [?25ldone
[?25h  Crea

In [2]:
import pyterrier as pt
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

ensure_pyterrier_is_loaded()
tira = Client()

In [3]:
def empty_docno(value=1):
    def __transform(df):
        if 'docno' in df.columns:
            return df

        res = df.copy()
        res['docno'] = value

        return res

    return pt.apply.generic(__transform)


def remove_docno():
    return pt.apply.generic(lambda df: df.drop(columns=['docno']))


def columns_to_features(columns, cols_to_keep=('docno', 'query', 'qid')):
    def __transform(df):
        if 'features' in df.columns:
            raise ValueError("Cannot merge features into a dataframe that already has a 'features' column")

        res = df[[col for col in cols_to_keep if col in df.columns]].copy()

        features = df[columns].values
        res['features'] = list(features.reshape((len(features), -1)))

        return res

    return pt.apply.generic(__transform)  #  >> empty_docno()

In [4]:
dataset = pt.get_dataset("irds:ir-benchmarks/longeval-train-20230513-training")
topics = dataset.get_topics(variant='text')

topics

Unnamed: 0,qid,query
0,q06223196,car shelter
1,q062228,airport
2,q062287,antivirus comparison
3,q06223261,free antivirus
4,q062291,orange antivirus
...,...,...
667,q062224914,tax garden shed
668,q062224961,land of france
669,q062225030,find my training pole job
670,q062225194,gpl car


# Query-based features

In [21]:
topics = dataset.get_topics(variant='text')

qpp = tira.pt.transform_queries('ir-benchmarks/qpptk/all-predictors', dataset) >> columns_to_features(['max-idf', 'avg-idf', 'scq', 'max-scq', 'avg-scq', 'var', 'max-var', 'avg-var', 'wig+10', 'nqc+100', 'smv+100'])
intent_prediction = tira.pt.transform_queries('ir-benchmarks/dossier/pre-retrieval-query-intent', dataset) >> columns_to_features('intent_prediction')

query_health_classification = tira.pt.transform_queries("ir-benchmarks/fschlatt/query-health-classification", dataset)


# query_features = empty_docno() >> (intent_prediction ** qpp) >> remove_docno()
query_features = intent_prediction ** qpp ** query_health_classification

## FIXME: add features for health classification
query_health_classification(topics)

Unnamed: 0,qid,query,mean_health_score,median_health_score,mean_medical_score,median_medical_score
0,q06223196,car shelter,10.1915,10.1915,10.1732,10.1732
1,q062228,airport,0.1463,0.1463,0.4139,0.4139
2,q062287,antivirus comparison,7.0495,7.0495,162.7479,162.7479
3,q06223261,free antivirus,12.9669,12.9669,45.9657,45.9657
4,q062291,orange antivirus,11.5499,11.5499,19.6544,19.6544
...,...,...,...,...,...,...
667,q062224914,tax garden shed,12.2835,2.9900,35.8161,8.5308
668,q062224961,land of france,10.0038,0.8350,21.8215,6.7338
669,q062225030,find my training pole job,27.4100,19.1686,45.1709,34.9903
670,q062225194,gpl car,7.2348,7.2348,17.6295,17.6295


# Document-based features

In [23]:
document_health_classification = tira.pt.transform_documents("ir-benchmarks/fschlatt/document-health-classification", dataset)
genre_mlp_classifier = tira.pt.transform_documents('ir-benchmarks/tu-dresden-01/genre-mlp', dataset)
spacy_features = tira.pt.transform_documents('ir-benchmarks/tu-dresden-04/spacy-document-features', dataset)
#FIXME: ADD readability features

document_features = document_health_classification ** genre_mlp_classifier ** spacy_features

In [24]:
(bm25 >> document_features)(topics)

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id,...,duplicate_ngram_chr_fraction_6,duplicate_ngram_chr_fraction_7,duplicate_ngram_chr_fraction_8,duplicate_ngram_chr_fraction_9,duplicate_ngram_chr_fraction_10,top_ngram_chr_fraction_2,top_ngram_chr_fraction_3,top_ngram_chr_fraction_4,oov_ratio,features
0,q06223196,car shelter,Q0,1,14.954230,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708464,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,0.055263,0.055263,0.055263,0.055263,0.000000,0.014474,0.013158,0.000000,,"[14.95423017114749, 14.95423017114749, 14.9542..."
1,q06223196,car shelter,Q0,2,14.940287,pyterrier.default_pipelines.wmodel_text_scorer,doc062200206319,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,0.149485,0.094993,0.046392,0.026510,0.000000,0.009573,0.017673,0.047128,,"[14.940286624755384, 14.940286624755384, 14.94..."
2,q06223196,car shelter,Q0,3,14.822774,pyterrier.default_pipelines.wmodel_text_scorer,doc062200108613,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.021924,0.000000,0.000000,,"[14.82277355026469, 14.82277355026469, 14.8227..."
3,q06223196,car shelter,Q0,4,14.809379,pyterrier.default_pipelines.wmodel_text_scorer,doc062200115614,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,0.094800,0.094800,0.094800,0.094800,0.094800,0.008000,0.015600,0.024000,,"[14.80937869841142, 14.80937869841142, 14.8093..."
4,q06223196,car shelter,Q0,5,14.772246,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708471,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,0.454999,0.454999,0.448842,0.448842,0.430079,0.028731,0.030490,0.016417,,"[14.77224642743908, 14.77224642743908, 14.7722..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66562,q062225197,cheapest car,Q0,96,11.412518,pyterrier.default_pipelines.wmodel_text_scorer,doc062202000120,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,0.096200,0.058000,0.036000,0.026000,0.026000,0.016000,0.025200,0.008000,,"[11.41251801095611, 11.41251801095611, 11.4125..."
66563,q062225197,cheapest car,Q0,97,11.385375,pyterrier.default_pipelines.wmodel_text_scorer,doc062200206552,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,0.016890,0.016890,0.000000,0.000000,0.000000,0.066977,0.028538,0.021840,,"[11.385375439895611, 11.385375439895611, 11.38..."
66564,q062225197,cheapest car,Q0,98,11.364426,pyterrier.default_pipelines.wmodel_text_scorer,doc062200101247,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.011861,0.009037,0.014403,,"[11.364425593201668, 11.364425593201668, 11.36..."
66565,q062225197,cheapest car,Q0,99,11.360744,pyterrier.default_pipelines.wmodel_text_scorer,doc062200110399,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,0.026644,0.000000,0.000000,0.000000,0.000000,0.011905,0.022676,0.018141,,"[11.360744242789316, 11.360744242789316, 11.36..."


# Document-query features

In [7]:
bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)
monot5 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/MonoT5 Base (tira-ir-starter-gygaggle)', dataset)

doc_query_features = bm25 ** monot5

In [8]:
monot5(topics.head(2))

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id
0,q06223196,car shelter,0,1,-0.003520,castorini/monot5-base-msmarco-10k,doc062201708464,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
1,q06223196,car shelter,0,2,-0.005353,castorini/monot5-base-msmarco-10k,doc062200108613,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
2,q06223196,car shelter,0,3,-0.006328,castorini/monot5-base-msmarco-10k,doc062200206319,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
3,q06223196,car shelter,0,4,-0.006333,castorini/monot5-base-msmarco-10k,doc062200112743,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
4,q06223196,car shelter,0,5,-0.006599,castorini/monot5-base-msmarco-10k,doc062201708471,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
...,...,...,...,...,...,...,...,...,...,...
195,q062228,airport,0,96,-5.237506,castorini/monot5-base-msmarco-10k,doc062204502359,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
196,q062228,airport,0,97,-6.071942,castorini/monot5-base-msmarco-10k,doc062208002863,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
197,q062228,airport,0,98,-6.174983,castorini/monot5-base-msmarco-10k,doc062201204407,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
198,q062228,airport,0,99,-7.354122,castorini/monot5-base-msmarco-10k,doc062208104118,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01


In [9]:
(bm25 >> monot5)(topics.head(2))

Unnamed: 0,qid,query,docno,q0,rank,score,system,tira_task,tira_dataset,tira_first_stage_run_id
0,q06223196,car shelter,doc062201708464,0,1,-0.003520,castorini/monot5-base-msmarco-10k,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
1,q06223196,car shelter,doc062200206319,0,3,-0.006328,castorini/monot5-base-msmarco-10k,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
2,q06223196,car shelter,doc062200108613,0,2,-0.005353,castorini/monot5-base-msmarco-10k,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
3,q06223196,car shelter,doc062200115614,0,26,-0.075846,castorini/monot5-base-msmarco-10k,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
4,q06223196,car shelter,doc062201708471,0,5,-0.006599,castorini/monot5-base-msmarco-10k,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
...,...,...,...,...,...,...,...,...,...,...
195,q062228,airport,doc062202004154,0,53,-2.171376,castorini/monot5-base-msmarco-10k,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
196,q062228,airport,doc062202106805,0,87,-3.456243,castorini/monot5-base-msmarco-10k,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
197,q062228,airport,doc062208908281,0,94,-4.552380,castorini/monot5-base-msmarco-10k,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01
198,q062228,airport,doc062201707138,0,81,-3.121650,castorini/monot5-base-msmarco-10k,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-56-01


In [10]:
((bm25 % 100) >> (query_features ** document_features))(topics.head(2))

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id,...,predicted_label,probability_Discussion,probability_Shop,probability_Download,probability_Articles,probability_Help,probability_Linklists,probability_Porttrait private,probability_Protrait non private,features
0,q06223196,car shelter,Q0,1,14.954230,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708464,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.065149,0.332923,0.018598,0.035639,0.083025,0.075057,0.104556,0.285054,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
1,q06223196,car shelter,Q0,2,14.940287,pyterrier.default_pipelines.wmodel_text_scorer,doc062200206319,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.069072,0.609511,0.034462,0.031660,0.032858,0.115177,0.023488,0.083772,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
2,q06223196,car shelter,Q0,3,14.822774,pyterrier.default_pipelines.wmodel_text_scorer,doc062200108613,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.027731,0.360929,0.013323,0.022076,0.034057,0.040168,0.043006,0.458711,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
3,q06223196,car shelter,Q0,4,14.809379,pyterrier.default_pipelines.wmodel_text_scorer,doc062200115614,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.057209,0.364561,0.022853,0.059166,0.213303,0.045845,0.023758,0.213304,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
4,q06223196,car shelter,Q0,5,14.772246,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708471,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.031917,0.696000,0.019099,0.013245,0.027034,0.046168,0.030590,0.135949,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,q062228,airport,Q0,95,8.729615,pyterrier.default_pipelines.wmodel_text_scorer,doc062209203341,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Discussion,0.375695,0.034172,0.003748,0.046004,0.357152,0.041571,0.050274,0.091385,"[Abstain, 3.4870977938, 3.4870977938, 44.64196..."
194,q062228,airport,Q0,96,8.728996,pyterrier.default_pipelines.wmodel_text_scorer,doc062202004154,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.065680,0.087092,0.011471,0.038211,0.356411,0.024222,0.053257,0.363657,"[Abstain, 3.4870977938, 3.4870977938, 44.64196..."
195,q062228,airport,Q0,97,8.728772,pyterrier.default_pipelines.wmodel_text_scorer,doc062202106805,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.024931,0.040075,0.005162,0.010808,0.062437,0.024075,0.037758,0.794753,"[Abstain, 3.4870977938, 3.4870977938, 44.64196..."
196,q062228,airport,Q0,98,8.727324,pyterrier.default_pipelines.wmodel_text_scorer,doc062208908281,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Linklists,0.040335,0.106685,0.015584,0.013474,0.160971,0.508761,0.025781,0.128410,"[Abstain, 3.4870977938, 3.4870977938, 44.64196..."


In [11]:
((bm25 % 100) >> doc_query_features)(topics.head(2))

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id,features
0,q06223196,car shelter,Q0,1,14.954230,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708464,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[14.95423017114749, -0.0035196519456803]"
1,q06223196,car shelter,Q0,2,14.940287,pyterrier.default_pipelines.wmodel_text_scorer,doc062200206319,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[14.940286624755384, -0.0063284239731729]"
2,q06223196,car shelter,Q0,3,14.822774,pyterrier.default_pipelines.wmodel_text_scorer,doc062200108613,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[14.82277355026469, -0.0053529264405369]"
3,q06223196,car shelter,Q0,4,14.809379,pyterrier.default_pipelines.wmodel_text_scorer,doc062200115614,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[14.80937869841142, -0.0758459642529487]"
4,q06223196,car shelter,Q0,5,14.772246,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708471,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[14.77224642743908, -0.0065992991439998]"
...,...,...,...,...,...,...,...,...,...,...,...
193,q062228,airport,Q0,95,8.729615,pyterrier.default_pipelines.wmodel_text_scorer,doc062209203341,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[8.729615254689215, -1.1097716093063354]"
194,q062228,airport,Q0,96,8.728996,pyterrier.default_pipelines.wmodel_text_scorer,doc062202004154,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[8.728995592737434, -2.1713762283325195]"
195,q062228,airport,Q0,97,8.728772,pyterrier.default_pipelines.wmodel_text_scorer,doc062202106805,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[8.728772233407902, -3.456242561340332]"
196,q062228,airport,Q0,98,8.727324,pyterrier.default_pipelines.wmodel_text_scorer,doc062208908281,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,"[8.727323616940186, -4.552380084991455]"


In [12]:
full_pipeline = (bm25 % 1000) >> (query_features ** doc_query_features ** document_features)

In [13]:
full_pipeline(topics.head(10))

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id,...,predicted_label,probability_Discussion,probability_Shop,probability_Download,probability_Articles,probability_Help,probability_Linklists,probability_Porttrait private,probability_Protrait non private,features
0,q06223196,car shelter,Q0,1,14.954230,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708464,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.065149,0.332923,0.018598,0.035639,0.083025,0.075057,0.104556,0.285054,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
1,q06223196,car shelter,Q0,2,14.940287,pyterrier.default_pipelines.wmodel_text_scorer,doc062200206319,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.069072,0.609511,0.034462,0.031660,0.032858,0.115177,0.023488,0.083772,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
2,q06223196,car shelter,Q0,3,14.822774,pyterrier.default_pipelines.wmodel_text_scorer,doc062200108613,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.027731,0.360929,0.013323,0.022076,0.034057,0.040168,0.043006,0.458711,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
3,q06223196,car shelter,Q0,4,14.809379,pyterrier.default_pipelines.wmodel_text_scorer,doc062200115614,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.057209,0.364561,0.022853,0.059166,0.213303,0.045845,0.023758,0.213304,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
4,q06223196,car shelter,Q0,5,14.772246,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708471,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Shop,0.031917,0.696000,0.019099,0.013245,0.027034,0.046168,0.030590,0.135949,"[Abstain, 4.0958698531, 3.1749690458, 80.93334..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,q0622141,unlicensed car insurance,Q0,96,16.100618,pyterrier.default_pipelines.wmodel_text_scorer,doc062201708809,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.061470,0.196385,0.078399,0.041046,0.083203,0.141784,0.038455,0.359258,"[Abstain, 7.0065309859, 4.0609342862, 134.4942..."
996,q0622141,unlicensed car insurance,Q0,97,16.009340,pyterrier.default_pipelines.wmodel_text_scorer,doc062200211754,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.040501,0.244226,0.019607,0.018090,0.114814,0.171848,0.045538,0.345376,"[Abstain, 7.0065309859, 4.0609342862, 134.4942..."
997,q0622141,unlicensed car insurance,Q0,98,15.905582,pyterrier.default_pipelines.wmodel_text_scorer,doc062200201587,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Help,0.055728,0.029518,0.008044,0.343882,0.347058,0.130947,0.041508,0.043315,"[Abstain, 7.0065309859, 4.0609342862, 134.4942..."
998,q0622141,unlicensed car insurance,Q0,99,15.844073,pyterrier.default_pipelines.wmodel_text_scorer,doc062205006628,ir-benchmarks,IRDSDataset('ir-benchmarks/longeval-train-2023...,2024-03-18-12-15-04,...,Protrait non private,0.041974,0.233245,0.011819,0.010909,0.062658,0.071336,0.043251,0.524808,"[Abstain, 7.0065309859, 4.0609342862, 134.4942..."


In [14]:
full_pipeline(topics.head(10)).iloc[0]['features']

array(['Abstain', 4.0958698531, 3.1749690458, 80.9333491421,
       47.6725992603, 40.4666745711, 3.9709926948, 2.0144139575,
       1.9854963474, 5.9957913108, 0.0396683849, 0.0257729994,
       14.95423017114749, -0.0035196519456803, 14.95423017114749,
       14.95423017114749], dtype=object)