In [54]:
import json
import pandas as pd
import pyterrier as pt

In [55]:
# Directory holding the text data used by this notebook, relative to the
# working directory (presumably the QVHighlights release - confirm).
DATA_DIR = "text_data_QVH"

# Training annotations and the matching subtitle file; both are read
# line by line as JSON Lines further down.
jsonl_path = DATA_DIR + "/highlight_train_release.jsonl"
subs_path = DATA_DIR + "/subs_train.jsonl"

In [56]:
# Index of subtitle lines per video:
#   video name -> list of (absolute_start, absolute_end, subtitle_text)
# Each record's "vid" has the form "<name>_<clip_start>_<clip_end>"; the first
# relevant window is relative to the clip, so the clip start is added to obtain
# timestamps relative to the whole video.
subtitles_dict = {}

# JSON text is UTF-8; be explicit so decoding of the subtitle text does not
# depend on the platform's default locale encoding.
with open(subs_path, 'r', encoding='utf-8') as subs_file:
    for line in subs_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        sub_data = json.loads(line)
        triple = sub_data['vid'].split("_")

        # NOTE: joining with "" drops any underscores contained in the video
        # name itself. parse_jsonl builds its lookup key the same way, so the
        # two must be changed together if this is ever revisited.
        name = "".join(triple[0:-2])

        clip_offset = float(triple[-2])
        window = sub_data["relevant_windows"][0]

        # setdefault creates the per-video list on first sight of the name.
        subtitles_dict.setdefault(name, []).append(
            (clip_offset + window[0], clip_offset + window[1], sub_data['query'])
        )

In [58]:
def parse_jsonl(jsonl_path, subtitles=None, clip_len=2):
    """Parse a highlight-annotation JSONL file into query, document and ranking records.

    Every non-blank line must be a JSON object with the keys ``qid``, ``query``,
    ``vid`` (formatted as ``<name>_<clip_start>_<clip_end>``), ``duration``,
    ``relevant_windows``, ``relevant_clip_ids`` and ``saliency_scores``.
    Records whose video name has no entry in ``subtitles`` are skipped entirely.

    Each relevant window of a record becomes one "document" whose ``docno`` is
    ``<line index>_<abs start>_<abs end>`` and whose ``subtitles`` are the texts
    of all subtitle lines overlapping the window (inclusive bounds).

    Parameters
    ----------
    jsonl_path : str
        Path of the JSONL annotation file.
    subtitles : dict, optional
        Mapping ``video name -> list of (start, end, text)`` tuples with
        timestamps relative to the whole video. Keys must be built like
        ``document_name`` below (name parts joined without separator).
        Defaults to the notebook-level ``subtitles_dict``.
    clip_len : int or float, optional
        Length in seconds of the clips indexed by ``relevant_clip_ids``;
        clip ``k`` is taken to start at ``k * clip_len``. Defaults to 2,
        the value that was previously hard-coded.

    Returns
    -------
    tuple of (list, list, list)
        ``queries_data``: ``{"qid", "query"}`` per kept record.
        ``documents_data``: ``{"docno", "vid_name", "ts", "duration",
        "subtitles"}`` per relevant window.
        ``query_rankings_data``: ``{"qid", "query", "docno", "score", "rank"}``
        per relevant window, where ``score`` is the mean of all saliency
        ratings of the clips inside the window (0 if there are none) and
        ``rank`` is the 1-based position within the record by descending score.
    """
    if subtitles is None:
        # Backward-compatible default: use the notebook-level subtitle index.
        subtitles = subtitles_dict

    queries_data = []
    documents_data = []
    query_rankings_data = []

    # JSON text is UTF-8; be explicit so decoding does not depend on the locale.
    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for idx, line in enumerate(file):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            data = json.loads(line)

            vid_parts = data["vid"].split("_")
            # NOTE: joining with "" drops underscores contained in the video
            # name; kept as-is because the subtitle index is keyed the same way.
            document_name = "".join(vid_parts[0:-2])
            start_time = float(vid_parts[-2])

            if document_name not in subtitles:
                continue

            queries_data.append({"qid": data["qid"], "query": data["query"]})

            window_rankings = []
            for relevant_window in data["relevant_windows"]:
                win_start, win_end = relevant_window[0], relevant_window[1]
                # Window bounds are relative to the clip; shift to video time.
                ts = [start_time + win_start, start_time + win_end]
                docno = str(idx) + "_" + str(ts[0]) + "_" + str(ts[1])

                overlapping = [
                    text
                    for sub_start, sub_end, text in subtitles[document_name]
                    if sub_start <= ts[1] and ts[0] <= sub_end
                ]
                documents_data.append({
                    "docno": docno,
                    "vid_name": document_name,
                    "ts": ts,
                    "duration": data["duration"],
                    "subtitles": overlapping,
                })

                # A clip belongs to the window when it starts inside it. The
                # upper bound is exclusive: a clip starting exactly at the
                # window end lies outside the window.
                window_scores = [
                    data["saliency_scores"][i]
                    for i, clip_id in enumerate(data["relevant_clip_ids"])
                    if win_start <= clip_id * clip_len < win_end
                ]
                # Mean over every individual rating of every selected clip.
                ratings = [rating for clip_scores in window_scores for rating in clip_scores]
                score = sum(ratings) / len(ratings) if ratings else 0

                window_rankings.append({
                    "qid": data["qid"],
                    "query": data["query"],
                    "docno": docno,
                    "score": score,
                    "rank": 1,
                })

            # Rank this record's windows by descending score (stable for ties).
            window_rankings.sort(key=lambda entry: entry["score"], reverse=True)
            for position, entry in enumerate(window_rankings, start=1):
                entry["rank"] = position

            query_rankings_data.extend(window_rankings)

    return queries_data, documents_data, query_rankings_data

In [60]:
# Parse the training annotations into the three record lists
# (queries, per-window documents, per-query rankings).
(
    queries_data,
    documents_data,
    query_rankings_data,
) = parse_jsonl(jsonl_path)

In [61]:
# Report how many records of each kind the parser produced.
for label, records in (
    ("Queries data: ", queries_data),
    ("Documents data: ", documents_data),
    ("Query rankings data: ", query_rankings_data),
):
    print(label, len(records))

# One DataFrame per record list; columns come from the record dict keys.
train_query_set_df = pd.DataFrame(queries_data)
train_documents_set_df = pd.DataFrame(documents_data)
train_query_rankings_df = pd.DataFrame(query_rankings_data)

Queries data:  5556
Documents data:  9882
Query rankings data:  9882


In [65]:
# Preview the first five document rows as plain text.
documents_preview = train_documents_set_df.head(n=5)
print(documents_preview)

           docno     vid_name              ts  duration  \
0  0_432.0_442.0  j7rJstUseKg  [432.0, 442.0]       150   
1  0_444.0_454.0  j7rJstUseKg  [444.0, 454.0]       150   
2  0_456.0_466.0  j7rJstUseKg  [456.0, 466.0]       150   
3  0_468.0_478.0  j7rJstUseKg  [468.0, 478.0]       150   
4  0_480.0_490.0  j7rJstUseKg  [480.0, 490.0]       150   

                                           subtitles  \
0  [patriots, can defend themselves from tyranny....   
1  [would identify as Christian., Our laws are ba...   
2  [My views on the religious beliefs of Christia...   
3  [I think that their belief systems are in star...   
4  [But I think Islam is terroristic in nature., ...   

                                                text  
0  patriots, can defend themselves from tyranny. ...  
1  would identify as Christian. Our laws are base...  
2  My views on the religious beliefs of Christian...  
3  I think that their belief systems are in stark...  
4  But I think Islam is terrorist

In [63]:
# Initialise PyTerrier once per kernel; skip when it is already running so
# the cell can be re-executed safely.
pyterrier_running = pt.started()
if not pyterrier_running:
    pt.init()