In [1]:
import pandas as pd
import json

In [2]:
# Paths to JSONL files
jsonl_train_path = './text_data/tvr_val_release.jsonl'
jsonl_val_path = './text_data/tvr_val_release.jsonl'
subs_path = './text_data/tvqa_preprocessed_subtitles.jsonl'

In [3]:
# Load subtitles into a dictionary for quick access
subtitles_dict = {}
with open(subs_path, 'r') as subs_file:
    for line in subs_file:
        sub_data = json.loads(line)
        subtitles_dict[sub_data['vid_name']] = sub_data['sub']

In [4]:
# Function to find matching subtitles
def find_matching_subtitles(vid_name, ts_range, subtitles_dict):
    matching_subs = []
    if vid_name in subtitles_dict:
        for subtitle in subtitles_dict[vid_name]:
            if (ts_range[0] <= subtitle['start'] <= ts_range[1]) or (ts_range[0] <= subtitle['end'] <= ts_range[1]):
                matching_subs.append(subtitle['text'])
    return matching_subs

In [5]:
def parse_jsonl(jsonl_path):
    # Initialize empty lists for your data
    queries_data = []
    documents_data = []
    query_rankings_data = []

    with open(jsonl_path, 'r') as file:
        for idx, line in enumerate(file):
            data = json.loads(line)
            
            # Extract data for the Query Set DataFrame
            queries_data.append({'qid': data['desc_id'], 'query': data['desc']})
            
            # Find matching subtitles
            matching_subs = find_matching_subtitles(data['vid_name'], data['ts'], subtitles_dict)
            
            # Extract data for the Documents Set DataFrame, including matching subtitles
            documents_data.append({'docno': idx, 'vid_name': data['vid_name'], 'ts': data['ts'], 
                                'duration': data['duration'], 'type': data['type'], 'subtitles': matching_subs})
            
            # Extract data for the Query Rankings DataFrame
            query_rankings_data.append({'qid': data["desc_id"], 'query': data['desc'], 'docno': idx, 'rank': 1, 'score': 1.0})

    return queries_data, documents_data, query_rankings_data

In [6]:
train_queries_data, train_documents_data, train_query_rankings_data = parse_jsonl(jsonl_train_path)

# Convert lists to DataFrames (training)
train_query_set_df = pd.DataFrame(train_queries_data).set_index('qid')
train_documents_set_df = pd.DataFrame(train_documents_data).set_index('docno')
train_query_rankings_df = pd.DataFrame(train_query_rankings_data)

val_queries_data, val_documents_data, val_query_rankings_data = parse_jsonl(jsonl_val_path)

# Convert lists to DataFrames (validation)
val_query_set_df = pd.DataFrame(val_queries_data).set_index('qid')
val_documents_set_df = pd.DataFrame(val_documents_data).set_index('docno')
val_query_rankings_df = pd.DataFrame(val_query_rankings_data)


In [7]:
#print the head of the dataframes
print(train_query_set_df.head())
print(train_documents_set_df.head())
print(train_query_rankings_df.head())

print(val_query_set_df.head())
print(val_documents_set_df.head())
print(val_query_rankings_df.head())


                                                   query
qid                                                     
90200     Phoebe puts one of her ponytails in her mouth.
94603          Monica tells Ross never knew he did that.
89063  Cross explains why he's laying in the bed to B...
88605  Penny looks confused and shakes her head while...
90309  Brennan grabs Stark and lifts him up off of th...
                           vid_name              ts  duration type  \
docno                                                                
0      friends_s01e03_seg02_clip_19  [16.48, 33.87]     61.46    v   
1      friends_s04e21_seg02_clip_18       [0, 2.29]     57.35   vt   
2       castle_s06e12_seg02_clip_22    [0.46, 5.47]     91.19   vt   
3              s02e14_seg02_clip_02  [35.54, 40.34]     64.02    v   
4        house_s04e03_seg02_clip_04  [18.72, 26.48]     91.29    v   

                                               subtitles  
docno                                              