In [43]:
#from google.colab import drive
#drive.mount('/content/drive')

In [44]:
# pip install python-terrier==0.10.0 nltk scikit-learn lightgbm xgboost fastrank
# pip install pandas

In [45]:
# !pip install --upgrade git+https://github.com/Georgetown-IR-Lab/OpenNIR

In [46]:
import json
import os
import random

import pandas as pd
import pyterrier as pt

In [47]:
# Select the dataset used by the cells below: "TVR" or "QVH".
# Keep exactly one of the two assignments active and comment out the other.

# dataset_choice = "TVR"
dataset_choice = "QVH"

In [48]:
# Paths to the JSONL files of the selected dataset.
if dataset_choice == "TVR":
    jsonl_train_path = 'text_data/tvr_train_release.jsonl'
    jsonl_val_path = 'text_data/tvr_val_release.jsonl'
    subs_path = 'text_data/tvqa_preprocessed_subtitles.jsonl'

elif dataset_choice == "QVH":
    # Only a train file is configured for QVH; the validation and test
    # splits are sampled from it in the split cell further down.
    jsonl_train_path = "text_data_QVH/highlight_train_release.jsonl"
    subs_path = "text_data_QVH/subs_train.jsonl"

else:
    # Fail here with a clear message instead of a NameError several cells later.
    raise ValueError(
        f"Unsupported dataset_choice {dataset_choice!r}; expected \"TVR\" or \"QVH\""
    )

In [49]:
# Load subtitles into a dictionary for quick access.
#   TVR: vid_name -> the record's 'sub' value, stored unchanged.
#   QVH: vid      -> list of (start, end, text) tuples, one per line of the file.
# The files are opened as UTF-8 explicitly so that parsing does not depend on
# the platform's default encoding.
subtitles_dict = {}
if dataset_choice == "TVR":
    with open(subs_path, 'r', encoding='utf-8') as subs_file:
        for line in subs_file:
            sub_data = json.loads(line)
            subtitles_dict[sub_data['vid_name']] = sub_data['sub']
elif dataset_choice == "QVH":
    with open(subs_path, 'r', encoding='utf-8') as subs_file:
        for line in subs_file:
            sub_data = json.loads(line)
            name = sub_data['vid']
            # The second-to-last "_"-separated field of the vid is added to the
            # window bounds (presumably the clip's start offset within the
            # full video - TODO confirm against the dataset description).
            clip_offset = float(name.split("_")[-2])
            # Only the first relevant window of each line is used.
            first_window = sub_data["relevant_windows"][0]
            subtitles_dict.setdefault(name, []).append(
                (clip_offset + first_window[0], clip_offset + first_window[1], sub_data['query'])
            )

In [50]:
# Subtitle lookup used for the TVR dataset
def find_matching_subtitles(vid_name, ts_range, subtitles_dict):
    """Return the texts of the subtitles of ``vid_name`` that touch ``ts_range``.

    A subtitle is selected when its 'start' or its 'end' lies inside
    [ts_range[0], ts_range[1]] (bounds included), or when it spans the whole
    range. Texts come back in stored order; a ``vid_name`` that is not a key
    of ``subtitles_dict`` yields an empty list.
    """
    if vid_name not in subtitles_dict:
        return []

    def _touches_range(entry):
        # Same three tests, in the same order, as a single chained `or`.
        range_start, range_end = ts_range[0], ts_range[1]
        if range_start <= entry['start'] <= range_end:
            return True
        if range_start <= entry['end'] <= range_end:
            return True
        return entry['start'] <= range_start and entry['end'] >= range_end

    return [entry['text'] for entry in subtitles_dict[vid_name] if _touches_range(entry)]

In [51]:
def parse_jsonl_TVR(jsonl_path, split_type):
    """Build query, document and relevance records from a TVR JSONL file.

    Each line is one JSON object. Only lines whose 'type' is 't' are kept,
    and only when at least one subtitle of the video matches the line's 'ts'
    range (looked up in the module-level ``subtitles_dict``).

    Args:
        jsonl_path: path of the JSONL file to read.
        split_type: string prefixed to the line index to form the 'docno'.

    Returns:
        A tuple ``(queries_data, documents_data, query_rankings_data)`` of
        lists of dicts, one entry per kept line in all three lists.
    """
    queries_data = []
    documents_data = []
    query_rankings_data = []

    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for idx, line in enumerate(file):
            data = json.loads(line)
            # drop non text-based queries
            if data['type'] not in ['t']:   # t: text-based, vt: video-text-based -> You can choose here
                continue

            # Find matching subtitles; lines without any are dropped
            matching_subs = find_matching_subtitles(data['vid_name'], data['ts'], subtitles_dict)
            if not matching_subs:
                continue

            qid = str(data['desc_id'])
            # idx counts every line of the file, including skipped ones
            docno = split_type + str(idx)

            queries_data.append({'qid': qid, 'query': data['desc']})

            # Subtitle lines are joined with a space so that the last word of
            # one line and the first word of the next stay separate tokens.
            documents_data.append({'docno': docno, 'vid_name': data['vid_name'], 'ts': data['ts'],
                                   'duration': data['duration'], 'type': data['type'],
                                   'text': " ".join(matching_subs)})

            # Every kept query is paired with its own document as the single relevant result
            query_rankings_data.append({'qid': qid, 'query': data['desc'], 'docno': docno, 'rank': 1, 'score': 1.0})

    return queries_data, documents_data, query_rankings_data

def parse_jsonl_QVH(jsonl_path):
    """Build query, document and relevance records from a QVH JSONL file.

    Each line is one JSON object. Its 'vid' value is used unchanged as the
    key into the module-level ``subtitles_dict``; the last two "_"-separated
    fields of 'vid' are parsed as floats and stored as the document's 'ts'.
    Every relevant window is shifted by the first of those two floats and all
    subtitles overlapping a shifted window are collected. Lines whose 'vid'
    has no subtitles, or whose windows overlap no subtitle, are skipped.

    Args:
        jsonl_path: path of the JSONL file to read.

    Returns:
        A tuple ``(queries_data, documents_data, query_rankings_data)`` of
        lists of dicts, one entry per kept line in all three lists.
    """
    queries_data = []
    documents_data = []
    query_rankings_data = []
    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for idx, line in enumerate(file):

            # Load the JSON object from the line
            data = json.loads(line)

            document_name = data["vid"]
            triple = document_name.split("_")
            start_time = float(triple[-2])
            end_time = float(triple[-1])

            if document_name not in subtitles_dict:
                continue

            subs = []
            for relevant_window in data["relevant_windows"]:
                ts = [start_time + relevant_window[0], start_time + relevant_window[1]]
                # Closed-interval overlap between the subtitle span and the window
                subs.extend(sub for sub in subtitles_dict[document_name] if sub[0] <= ts[1] and ts[0] <= sub[1])

            # Checked once all windows have been examined, at the level of the
            # file loop, so that `continue` really skips the entry; a document
            # with empty text would be useless for retrieval.
            if not subs:
                continue

            # idx counts every line of the file, including skipped ones
            docno = str(idx)
            qid = str(data["qid"])

            documents_data.append({"docno": docno, "vid_name": document_name, "ts": [start_time, end_time], "duration": data["duration"], "text": " ".join([sub[2] for sub in subs])})
            queries_data.append({"qid": qid, "query": data["query"]})
            query_rankings_data.append({"qid": qid, "query": data["query"], "docno": docno, "rank": 1, "score": 1.0})

    return queries_data, documents_data, query_rankings_data

          

In [52]:

def _hold_out_split(queries, documents, rankings, fraction=0.1, seed=42):
    """Move a random share of the ranking records into a held-out split.

    ``random`` is re-seeded with ``seed`` and ``int(len(rankings) * fraction)``
    ranking records are sampled. Queries follow their 'qid' and documents
    follow their 'docno' into the held-out split. Set lookups replace the
    repeated list scans, so the work is linear in the number of records.

    Returns:
        ``((queries_kept, documents_kept, rankings_kept),
           (queries_held, documents_held, rankings_held))``
    """
    random.seed(seed)
    rankings_held = random.sample(rankings, int(len(rankings) * fraction))
    held_qids = {entry["qid"] for entry in rankings_held}
    held_docnos = {entry["docno"] for entry in rankings_held}

    # The parse functions derive each docno from the line index, so a docno
    # identifies exactly one ranking record.
    rankings_kept = [entry for entry in rankings if entry["docno"] not in held_docnos]
    queries_held = [query for query in queries if query["qid"] in held_qids]
    queries_kept = [query for query in queries if query["qid"] not in held_qids]
    documents_held = [doc for doc in documents if doc["docno"] in held_docnos]
    documents_kept = [doc for doc in documents if doc["docno"] not in held_docnos]

    return (
        (queries_kept, documents_kept, rankings_kept),
        (queries_held, documents_held, rankings_held),
    )


if dataset_choice == "TVR":
    queries_data_train, documents_data_train, query_rankings_data_train = parse_jsonl_TVR(jsonl_train_path, "t")
    queries_data_val, documents_data_val, query_rankings_data_val = parse_jsonl_TVR(jsonl_val_path, "v")
    # have to create a test set; to do it, extract a random 10% of the train set
    (
        (queries_data_train, documents_data_train, query_rankings_data_train),
        (queries_data_test, documents_data_test, query_rankings_data_test),
    ) = _hold_out_split(queries_data_train, documents_data_train, query_rankings_data_train)

elif dataset_choice == "QVH":
    queries_data_train, documents_data_train, query_rankings_data_train = parse_jsonl_QVH(jsonl_train_path)
    # have to create a val set; to do it, extract a random 10% of the train set
    (
        (queries_data_train, documents_data_train, query_rankings_data_train),
        (queries_data_val, documents_data_val, query_rankings_data_val),
    ) = _hold_out_split(queries_data_train, documents_data_train, query_rankings_data_train)
    # have to create a test set; to do it, extract a random 10% of what is left of the train set
    (
        (queries_data_train, documents_data_train, query_rankings_data_train),
        (queries_data_test, documents_data_test, query_rankings_data_test),
    ) = _hold_out_split(queries_data_train, documents_data_train, query_rankings_data_train)



In [53]:
# One DataFrame per split for the queries and for the documents
queries_train_df, documents_train_df = pd.DataFrame(queries_data_train), pd.DataFrame(documents_data_train)
queries_val_df, documents_val_df = pd.DataFrame(queries_data_val), pd.DataFrame(documents_data_val)
queries_test_df, documents_test_df = pd.DataFrame(queries_data_test), pd.DataFrame(documents_data_test)

# All relevance records (train, then val, then test) in one frame with a fresh 0..n-1 index
ranking_frames = [
    pd.DataFrame(split_rankings)
    for split_rankings in (query_rankings_data_train, query_rankings_data_val, query_rankings_data_test)
]
q_rels = pd.concat(ranking_frames).reset_index(drop=True)

# Report the size of every split
for heading, split_queries_df, split_documents_df in (
    ("Train set:", queries_train_df, documents_train_df),
    ("Val set:", queries_val_df, documents_val_df),
    ("Test set:", queries_test_df, documents_test_df),
):
    print(heading)
    print("Queries: ", len(split_queries_df))
    print("Documents: ", len(split_documents_df))

print("Query Rankings: ", len(q_rels))



Train set:
Queries:  4445
Documents:  4445
Val set:
Queries:  548
Documents:  548
Test set:
Queries:  493
Documents:  493
Query Rankings:  5486


### First Stage Retrieval [TODO: BOX]
The following cells define the first-stage retrieval pipelines (three are used per dataset); their results are the input to the learned re-ranking model.

In [54]:
# Initialise PyTerrier only when it has not been started yet, so that
# re-running this cell does not call pt.init() a second time.
if not pt.started():
    pt.init()

In [55]:
# Create an index

# Location handed to the indexer. The default is the folder used on the
# original author's machine; set the MOMENT_RETRIEVAL_INDEX_PATH environment
# variable to point somewhere else without editing the notebook.
index_path = os.environ.get(
    "MOMENT_RETRIEVAL_INDEX_PATH",
    r"c:\Users\chrys\Documents\CSE\MASTERS\Q3\Information Retrieval\moment_retrieval\moment_retrieval\index_path",
)

indexer = pt.IterDictIndexer(
    index_path,
    # Metadata fields kept for every document, each with its configured length
    meta={
        "docno": 64,
        "vid_name": 64,
        "text": 131072,
    },
    stemmer="porter",
    stopwords="terrier",
    overwrite=True,
    # NOTE(review): an in-memory index type is requested although a path is
    # also given - confirm whether anything is expected to persist on disk.
    type=pt.index.IndexingType.MEMORY,
)

In [56]:
# Stack the documents of all three splits; a single index is built over them
document_frames = [documents_train_df, documents_val_df, documents_test_df]
joint_documents_set_df = pd.concat(document_frames)

print("Length: ", len(joint_documents_set_df))

Length:  5486


In [57]:
# Hand the documents to the indexer as a list of one dict per row
document_records = joint_documents_set_df.to_dict(orient="records")
indexed = indexer.index(document_records)

In [58]:
from pyterrier.measures import *

n_r = 50  # Number of documents retrieved in the first stage


def _build_first_stage(wmodel):
    """Return a retriever over ``indexed`` that scores with ``wmodel``.

    Every first-stage retriever returns at most ``n_r`` results per query and
    carries the 'docno', 'vid_name' and 'text' metadata along.
    """
    return pt.BatchRetrieve(
        indexed,
        wmodel=wmodel,
        num_results=n_r,
        metadata=["docno", "vid_name", "text"],
    )


first_stage_bm25 = _build_first_stage("BM25")
first_stage_lemurtfidf = _build_first_stage("LemurTF_IDF")
first_stage_pl2 = _build_first_stage("PL2")
first_stage_in_exp_b2 = _build_first_stage("In_expB2")

# Three first-stage retrievers are evaluated per dataset:
# PL2 is used for TVR, LemurTF_IDF for everything else.
if dataset_choice == "TVR":
    first_stage_retrieval_list = [first_stage_bm25, first_stage_pl2, first_stage_in_exp_b2]
else:
    first_stage_retrieval_list = [first_stage_bm25, first_stage_lemurtfidf, first_stage_in_exp_b2]

# Computing features

In [59]:
# Features for the second stage: each one is a weighting model scored over `indexed`

def _feature_retriever(wmodel, with_bo1_expansion=False):
    """Return a retriever over ``indexed`` for ``wmodel``.

    With ``with_bo1_expansion`` the retriever is created with the controls
    ``{"qe": "on", "qemodel": "Bo1"}``; otherwise no controls are passed.
    """
    if with_bo1_expansion:
        return pt.BatchRetrieve(indexed, wmodel=wmodel, controls={"qe": "on", "qemodel": "Bo1"})
    return pt.BatchRetrieve(indexed, wmodel=wmodel)


# TF-IDF based features
lemur_tf_idf_retriever = _feature_retriever("LemurTF_IDF")  # LemurTF_IDF -> It is a TF-IDF based weighting model
bm25_retriever = _feature_retriever("BM25")

# Language model based features
hiem_retriever = _feature_retriever("Hiemstra_LM")
dirichlet_retriever = _feature_retriever("DirichletLM")

# Divergence from randomness based features
pl2_retriever = _feature_retriever("PL2")
dlh_retriever = _feature_retriever("DLH")

# Can add more!
coordinate_match_retriever = _feature_retriever("CoordinateMatch")
js_kls_retrieveer = _feature_retriever("Js_KLs")

# Same weighting models with the "qe" / "qemodel" controls set
bm25_QE_retriever = _feature_retriever("BM25", with_bo1_expansion=True)
hiem_QE_retriever = _feature_retriever("Hiemstra_LM", with_bo1_expansion=True)
pl2_QE_retriever = _feature_retriever("PL2", with_bo1_expansion=True)
coordinate_match_QE_retriever = _feature_retriever("CoordinateMatch", with_bo1_expansion=True)

# Feature sets, combined with the ** operator; list positions 0-3 are the
# indices that end up in the configuration names.
# 1. PL2 + DLH
pl2_dlh_features = pl2_retriever ** dlh_retriever
# 2. BM25 + Hiemstra_LM + PL2 + CoordinateMatch
plain_features = bm25_retriever ** hiem_retriever ** pl2_retriever ** coordinate_match_retriever
# 3. BM25 (QE) + HiemstraLM (QE) + PL2 (QE) + CoordinateMatch (QE)
expanded_features = bm25_QE_retriever ** hiem_QE_retriever ** pl2_QE_retriever ** coordinate_match_QE_retriever
# 4. LemurTF_IDF + DirichletLM + DLH + Js_KLs
mixed_features = lemur_tf_idf_retriever ** dirichlet_retriever ** dlh_retriever ** js_kls_retrieveer

feature_combinations = [
    pl2_dlh_features,
    plain_features,
    expanded_features,
    mixed_features,
]

In [60]:
# Prepare the queries for the pipeline, remove special characters and extra spaces

def _clean_query_column(frame):
    """Normalise ``frame['query']`` in place and return ``frame``.

    Every character that is neither a word character nor whitespace (quotes,
    '?', '!' and all other punctuation) becomes a space; runs of whitespace
    are then collapsed to a single space and both ends are stripped. One
    pattern covers the quote/'?'/'!' case too, because those characters are
    already matched by ``[^\\w\\s]``.
    """
    frame['query'] = (
        frame['query']
        .str.replace(r'[^\w\s]', ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
    return frame


def _prepare_qrels(frame):
    """Clean the queries of a relevance frame and add an integer 'label' taken from 'score'."""
    frame = _clean_query_column(frame)
    frame['label'] = frame['score'].astype(int)
    return frame


# Query frames. The train frame is not copied, so queries_train_df itself
# holds the cleaned queries afterwards; val and test get a new frame from
# reset_index(), which also adds an 'index' column.
prepared_trainqueries = _clean_query_column(queries_train_df)
prepared_valqueries = _clean_query_column(queries_val_df.reset_index())
prepared_testqueries = _clean_query_column(queries_test_df.reset_index())

# Relevance frames, one per split, built from the raw ranking records
prepared_train_qrels = _prepare_qrels(pd.DataFrame(query_rankings_data_train))
prepared_val_qrels = _prepare_qrels(pd.DataFrame(query_rankings_data_val))
prepared_test_qrels = _prepare_qrels(pd.DataFrame(query_rankings_data_test))

# All splits together; q_rels is not copied either and is modified in place
prepared_qrels = _prepare_qrels(q_rels)


Storage format: .csv or trec

trec (https://pyterrier.readthedocs.io/en/latest/io.html): the PyTerrier I/O format; note that it does not contain the features.

In [61]:
from sklearn.svm import LinearSVR
import xgboost as xgb
import fastrank
from pyterrier.measures import *

# Open the index from the location configured in the indexing cell instead of
# repeating the machine-specific path as a second literal.
index = pt.IndexFactory.of(index_path)

# Learners tried in the second stage. 'form' is the value passed to
# pt.ltr.apply_learned_model for that learner; 'name' becomes part of the
# configuration name and of the result file name.
learned_models = [
    {
        'model': LinearSVR(),
        'form': 'reg',
        'name': 'linearSVR',
    },
    {
       'model': xgb.XGBRanker(tree_method="hist", objective="rank:ndcg", device= "cuda"),
       'form': 'ltr',
       'name': 'XGBoost (NDCG)',
    },
    {
       'model': fastrank.TrainRequest.coordinate_ascent(),
       'form': 'fastrank',
       'name': 'FastRank Coordinate Ascent',
    },
    {
       'model': fastrank.TrainRequest.random_forest(),
       'form': 'fastrank',
       'name': 'FastRank Random Forest',
    }
  ]

# Metrics reported for every configuration
eval_metrics = [   # MRT -> Mean Response Time -> Take that for 10 configurations
    MAP, MRR,
    nDCG @ 1, nDCG @ 3, nDCG @ 5, nDCG @ 10,
    nDCG @ 20, nDCG @ 30, nDCG @ 50,
    Recall @ 1, Recall @ 3, Recall @ 5, Recall @ 10,
    Recall @ 20, Recall @ 30, Recall @ 50,
]

In [69]:
import os

# Try all possible combinations of
# - First stage retrieval models (first_stage_retrieval_list)
# - Feature combinations  (feature_combinations)
# - Learned models  (learned_models)
# One CSV with test metrics and one with train metrics is written per
# configuration; configurations whose two files already exist are skipped.

print("Dataset choice: ", dataset_choice)

# DataFrame.to_csv does not create missing folders, so make sure they exist
test_dir = f"experiments/{dataset_choice}/test"
train_dir = f"experiments/{dataset_choice}/train"
os.makedirs(test_dir, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)

for fsr_alg in first_stage_retrieval_list:    #  3
    for i, feature_comb in enumerate(feature_combinations):
        print("Length of feature_combinations:", len(feature_combinations))  # Add logging

        for model in learned_models:
            # <first-stage weighting model>_<feature set index>_<learner name>
            name = fsr_alg.controls["wmodel"] + f"_{i}_" + f"{model['name']}"
            print("Name of configuration: ", name)

            test_csv = f"{test_dir}/{name}.csv"
            train_csv = f"{train_dir}/{name}.csv"

            # if the files already exist, skip the training
            if os.path.exists(test_csv) and os.path.exists(train_csv):
                print("File already exists")
                continue

            # The first stage is deliberately NOT wrapped in the `~` cache
            # operator. NOTE(review): results cached by an earlier run against
            # a different index would carry docids unknown to the current
            # in-memory index - presumably the cause of the
            # "IndexOutOfBoundsException: Index 5557 out of bounds for length
            # 5486" raised during decoration.
            fsr = fsr_alg >> feature_comb

            pipeline = fsr >> pt.ltr.apply_learned_model(model['model'], form=model['form'])

            pipeline.fit(
                prepared_trainqueries,
                prepared_train_qrels,
                prepared_valqueries,
                prepared_val_qrels
            )

            print("Training done")

            pt.Experiment(
                [pipeline],
                prepared_testqueries,
                prepared_test_qrels,
                names=[name],
                eval_metrics=eval_metrics,
            ).to_csv(test_csv)

            pt.Experiment(
                [pipeline],
                prepared_trainqueries.sample(frac=0.1, random_state=42),  # select the 10% of the train set
                prepared_train_qrels,
                names=[name],
                eval_metrics=eval_metrics,
            ).to_csv(train_csv)

            print("Experiments done")


Dataset choice:  QVH
Length of feature_combinations: 4
Name of configuration:  BM25_0_linearSVR
Lenght of the train queries:  4445
Lenght of the train qrels:  4445
Lenght of the val queries:  548
Lenght of the val qrels:  548
11:27:51.948 [main] WARN org.terrier.querying.SimpleDecorate - Problem performing decoration
java.lang.IndexOutOfBoundsException: Index 5557 out of bounds for length 5486
	at java.base/jdk.internal.util.Preconditions.outOfBounds(Preconditions.java:64)
	at java.base/jdk.internal.util.Preconditions.outOfBoundsCheckIndex(Preconditions.java:70)
	at java.base/jdk.internal.util.Preconditions.checkIndex(Preconditions.java:266)
	at java.base/java.util.Objects.checkIndex(Objects.java:359)
	at java.base/java.util.ArrayList.get(ArrayList.java:427)
	at org.terrier.realtime.memory.MemoryMetaIndex.getItems(MemoryMetaIndex.java:162)
	at org.terrier.querying.SimpleDecorate.filter(SimpleDecorate.java:61)
	at org.terrier.querying.LocalManager$PostFilterProcess.process(LocalManager.

AssertionError: 