In [None]:
# Mount Google Drive only when the Colab runtime is available. On a local
# kernel the import fails, and the cell becomes a no-op instead of aborting
# a "Restart & Run All".
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab: nothing to mount
else:
    drive.mount('/content/drive')

In [None]:
pip install python-terrier==0.10.0 nltk scikit-learn lightgbm

In [2]:
import pandas as pd
import json
import pyterrier as pt

In [3]:
# Repo-relative locations of the JSONL inputs: the train and validation
# releases, and the preprocessed subtitles.
jsonl_train_path, jsonl_val_path, subs_path = (
    'text_data/tvr_train_release.jsonl',
    'text_data/tvr_val_release.jsonl',
    'text_data/tvqa_preprocessed_subtitles.jsonl',
)

In [None]:
# Paths to JSONL files on Colab
# Only override the local paths when the Colab runtime is actually loaded;
# otherwise a local "Run All" would silently replace the repo-relative paths
# with Drive locations that do not exist on this machine.
import sys

if 'google.colab' in sys.modules:
    jsonl_train_path = '/content/drive/MyDrive/IR/text_data/tvr_train_release.jsonl'
    jsonl_val_path = '/content/drive/MyDrive/IR/text_data/tvr_val_release.jsonl'
    subs_path = '/content/drive/MyDrive/IR/text_data/tvqa_preprocessed_subtitles.jsonl'

In [4]:
# Load subtitles into a dictionary for quick access: vid_name -> that video's
# 'sub' value (iterated later as a sequence of subtitle entries).
subtitles_dict = {}
# The encoding is stated explicitly so decoding does not depend on the
# platform's default text encoding.
with open(subs_path, 'r', encoding='utf-8') as subs_file:
    for line in subs_file:
        if not line.strip():
            continue  # tolerate blank / trailing lines instead of failing in json.loads
        sub_data = json.loads(line)
        subtitles_dict[sub_data['vid_name']] = sub_data['sub']

In [5]:
# Function to find matching subtitles
def find_matching_subtitles(vid_name, ts_range, subtitles_dict):
    """Return the texts of the subtitles of ``vid_name`` that touch ``ts_range``.

    Args:
        vid_name: key used to look up the video's subtitle entries.
        ts_range: indexable pair ``(start, end)`` describing the time window.
        subtitles_dict: mapping from video name to a sequence of entries, each
            providing ``'start'``, ``'end'`` and ``'text'``.

    Returns:
        A list with the ``'text'`` of every entry whose start or end falls
        inside the window (bounds included), or which spans the whole window.
        The list is empty when the video is unknown or nothing matches.
    """
    if vid_name not in subtitles_dict:
        return []

    def touches_window(entry):
        # Subtitle starts inside the window.
        if ts_range[0] <= entry['start'] <= ts_range[1]:
            return True
        # Subtitle ends inside the window.
        if ts_range[0] <= entry['end'] <= ts_range[1]:
            return True
        # Subtitle covers the window completely.
        return entry['start'] <= ts_range[0] and entry['end'] >= ts_range[1]

    return [entry['text'] for entry in subtitles_dict[vid_name] if touches_window(entry)]

In [6]:
def parse_jsonl(jsonl_path):
    """Parse a JSONL release file into query, document and ranking records.

    Every non-blank line must be a JSON object providing ``type``,
    ``vid_name``, ``ts``, ``desc_id``, ``desc`` and ``duration``. Entries whose
    ``type`` is not ``'t'`` are dropped, as are entries for which no subtitle
    overlaps ``ts``. Subtitles are looked up in the module-level
    ``subtitles_dict``.

    Args:
        jsonl_path: path of the JSONL file to read.

    Returns:
        A tuple ``(queries_data, documents_data, query_rankings_data)`` of
        lists of dicts, with one dict per kept entry in each list:
          * queries: ``qid``, ``query``
          * documents: ``docno`` (0-based line index in the file), ``vid_name``,
            ``ts``, ``duration``, ``type``, ``text``
          * rankings: ``qid``, ``query``, ``docno``, ``rank`` (1), ``score`` (1.0)
    """
    queries_data = []
    documents_data = []
    query_rankings_data = []

    # The encoding is stated explicitly so decoding does not depend on the
    # platform's default text encoding.
    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for idx, line in enumerate(file):
            if not line.strip():
                continue  # tolerate blank / trailing lines instead of failing in json.loads
            data = json.loads(line)
            # drop non text-based queries
            if data['type'] != 't':
                continue

            # Find matching subtitles; entries without any subtitle text are useless as documents
            matching_subs = find_matching_subtitles(data['vid_name'], data['ts'], subtitles_dict)
            if not matching_subs:
                continue

            # Extract data for the Query Set DataFrame
            queries_data.append({'qid': data['desc_id'], 'query': data['desc']})

            # Extract data for the Documents Set DataFrame, including matching subtitles.
            # Join with a space: concatenating without a separator glues the last word
            # of one subtitle to the first word of the next ("the" + "store" -> "thestore").
            documents_data.append({'docno': idx, 'vid_name': data['vid_name'], 'ts': data['ts'],
                                   'duration': data['duration'], 'type': data['type'],
                                   'text': " ".join(matching_subs)})

            # Extract data for the Query Rankings DataFrame: the entry's own moment is
            # recorded as the single top-ranked document for its query
            query_rankings_data.append({'qid': data["desc_id"], 'query': data['desc'], 'docno': idx, 'rank': 1, 'score': 1.0})

    return queries_data, documents_data, query_rankings_data

In [7]:
# Training split: parse the release file, then wrap each record list in a DataFrame
(train_queries_data,
 train_documents_data,
 train_query_rankings_data) = parse_jsonl(jsonl_train_path)
train_query_set_df, train_documents_set_df, train_query_rankings_df = map(
    pd.DataFrame,
    (train_queries_data, train_documents_data, train_query_rankings_data),
)

# Validation split: same procedure
(val_queries_data,
 val_documents_data,
 val_query_rankings_data) = parse_jsonl(jsonl_val_path)
val_query_set_df, val_documents_set_df, val_query_rankings_df = map(
    pd.DataFrame,
    (val_queries_data, val_documents_data, val_query_rankings_data),
)


In [None]:
# Report the row count of each DataFrame, one per line (training first, then validation)
print(*map(len, (train_query_set_df, train_documents_set_df, train_query_rankings_df)), sep='\n')
print(*map(len, (val_query_set_df, val_documents_set_df, val_query_rankings_df)), sep='\n')


### First Stage Retrieval [TODO: BOX]
The following cells define three different first-stage retrieval pipelines, whose results are used as input for the trained model.

In [None]:
# Initialise PyTerrier only if it has not been started yet, so that
# re-running this cell does not call pt.init() a second time.
if not pt.started():
    pt.init()

In [10]:
# Cast the document ids to strings before indexing (the indexer cell declares
# `docno` as a metadata field). This overwrites the column in place.
train_documents_set_df['docno'] = train_documents_set_df['docno'].astype(str)


In [11]:
# Create an index
from pathlib import Path  # NOTE(review): not used in this cell — confirm it is needed elsewhere

# Indexer that consumes an iterable of dicts (one dict per document).
#   "./index_path/"     - location handed to the indexer for the index.
#   meta                - fields kept as document metadata; per PyTerrier's `meta`
#                         convention the number is the maximum stored length of the field.
#   stemmer / stopwords - Porter stemming and Terrier's stopword list.
#   overwrite           - allow replacing an existing index at the same location.
#   type                - MEMORY indexing type.
#                         NOTE(review): presumably the index then lives in memory rather
#                         than under "./index_path/" — confirm against the PyTerrier docs.
indexer = pt.IterDictIndexer(
    "./index_path/",
    meta={
        "docno": 16,
        "vid_name": 64,
        "text": 131072,
    },
    stemmer="porter",
    stopwords="terrier",
    overwrite=True,
    type=pt.index.IndexingType.MEMORY,
)


In [12]:
# Build the index from the training documents, passed as a list of per-row dicts.
# `indexed` is the handle the retrievers in the following cells are built from.
indexed = indexer.index(train_documents_set_df.to_dict('records'))


In [13]:
# Initialize BatchRetrieve with the created index and specify BM25 as the weighting model
# num_results=10 limits the candidate list to 10 documents per query; `metadata`
# names the stored document fields to attach to each result row.
first_stage_bm25 = pt.BatchRetrieve(
    indexed,
    wmodel="BM25",
    num_results=10,
    metadata=["docno", "vid_name", "text"]
)

Computing features

In [14]:
# Feature retrievers: each weighting model contributes one score as a feature.
# Any of the models listed here can be used; the commented-out lines are
# alternatives that are currently disabled.

pl2_retriever = pt.BatchRetrieve(indexed, wmodel="PL2")
dph_retriever = pt.BatchRetrieve(indexed, wmodel="DPH")
#tf_idf_retriever = pt.BatchRetrieve(indexed, wmodel="TF_IDF")
#bb2_retriever = pt.BatchRetrieve(indexed, wmodel="BB2")


In [15]:
# build a pipeline with the features
# PyTerrier operator notation:
#   ~a      cache the results of transformer `a`
#   a >> b  pass the output of `a` on to `b`
#   a ** b  feature union: the scores of `a` and `b` become the entries of a
#           `features` column
# So the BM25 candidates are re-scored with PL2 and DPH, and those two scores
# form the feature vector for the learned ranker.
pipeline_with_features = ~first_stage_bm25 >> (
    pl2_retriever ** dph_retriever
)

In [18]:
# Prepare the queries for the pipeline: string qids, no special characters, no extra spaces.
# NOTE: prepared_queries is the same DataFrame object as train_query_set_df (no copy
# is made), so the clean-up below is visible through both names.
prepared_queries = train_query_set_df
prepared_queries['qid'] = prepared_queries['qid'].astype(str)
prepared_queries['query'] = (
    prepared_queries['query']
    .str.replace('[\'"?!]', ' ', regex=True)    # quotes, '?' and '!' -> space
    .str.replace(r'[^\w\s]', ' ', regex=True)   # any remaining punctuation -> space
    .str.replace(r'\s+', ' ', regex=True)       # collapse runs of whitespace
    .str.strip()
)



In [None]:
# # Disabled alternative: run the pipeline in batches and append each batch to a CSV
# from tqdm import tqdm

# batch_size = 100
# n_batches = (len(prepared_queries) + batch_size - 1) // batch_size

# for batch_num in tqdm(range(n_batches), desc="Processing batches"):
#     start_idx = batch_num * batch_size
#     end_idx = start_idx + batch_size
#     batch_queries = prepared_queries[start_idx:end_idx]

#     # run pipeline
#     batch_results = pipeline_with_features(batch_queries)

#     # write or append
#     mode = 'w' if batch_num == 0 else 'a'
#     header = True if batch_num == 0 else False

#     # to csv
#     batch_results.to_csv('results_with_features.csv', mode=mode, header=header, index=False)


In [None]:
#batch_results.to_csv('results_with_features.csv', mode='w', header=True, index=False)

# Run the feature pipeline over all prepared training queries in one call and
# print the resulting frame.
results_with_features = pipeline_with_features(prepared_queries)
print(results_with_features)



### Learning [TODO: ROB]


#### Definition of learned rankers

In [None]:
from sklearn.svm import SVR

# Estimators to train as learned rankers; the training cell combines each of
# them with every first-stage pipeline. SVR() is created with its default arguments.
learned_models = [
    SVR()
]

#### Rankers training

In [None]:
from copy import deepcopy

# PyTerrier's fit() takes (topics, qrels). Topics carry qid/query; qrels carry
# qid/docno/label. Every (query, moment) pair produced by parse_jsonl is a
# positive example, hence label 1. Ids are cast to str so they can be joined
# with the string qids/docnos coming out of the retrieval stage.
train_topics = train_query_set_df.assign(qid=train_query_set_df['qid'].astype(str))
train_qrels = pd.DataFrame({
    'qid': train_query_rankings_df['qid'].astype(str),
    'docno': train_query_rankings_df['docno'].astype(str),
    'label': 1,
})

# NOTE(review): no cell above defines `fsr_pipelines`. Until the first-stage
# section provides it, fall back to the one feature pipeline this notebook builds.
if 'fsr_pipelines' not in globals():
    fsr_pipelines = [pipeline_with_features]

trained_models = []
names = []

for position, fsr_pipeline in enumerate(fsr_pipelines):
    # Pipelines are not guaranteed to expose name(); fall back to a positional label.
    name_attr = getattr(fsr_pipeline, 'name', None)
    if callable(name_attr):
        pipeline_name = name_attr()
    elif name_attr is not None:
        pipeline_name = str(name_attr)
    else:
        pipeline_name = f"fsr{position}"

    for model in learned_models:
        names.append(f"{pipeline_name}_{model.__class__.__name__}")
        # Each pipeline gets its own copy of the estimator. Sharing one instance
        # would let a later fit() overwrite the model inside earlier pipelines.
        pipe = ~fsr_pipeline >> pt.ltr.apply_learned_model(deepcopy(model))
        pipe.fit(train_topics, train_qrels)
        trained_models.append(pipe)

#### Rankers evaluation

In [None]:
from pyterrier.measures import nDCG, RR, MAP

# Validation topics get the same clean-up as the training queries (string qids,
# punctuation replaced by spaces, whitespace collapsed) so they are processed
# consistently with what the rankers were trained on.
val_topics = val_query_set_df.assign(
    qid=val_query_set_df['qid'].astype(str),
    query=(
        val_query_set_df['query']
        .str.replace(r'[^\w\s]', ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    ),
)

# pt.Experiment expects qrels with the columns qid, docno and label; the
# rankings frame only has rank/score, so build the judgements explicitly.
# NOTE(review): validation docnos are line indices of the validation file, while
# the index built above contains only training documents. Confirm that the
# validation documents are meant to be indexed too, otherwise these judgements
# cannot match any retrieved document.
val_qrels = pd.DataFrame({
    'qid': val_query_rankings_df['qid'].astype(str),
    'docno': val_query_rankings_df['docno'].astype(str),
    'label': 1,
})

pt.Experiment(
    trained_models,
    val_topics,
    val_qrels,
    names=names,
    eval_metrics=[nDCG @ 10, RR @ 10, MAP],
)