In [2]:
from hydra.core.global_hydra import GlobalHydra
from hydra import initialize_config_dir, compose
import pandas as pd
from pathlib import Path
import json
import random
import os

# Initialize Hydra with the config path
GlobalHydra.instance().clear()
initialize_config_dir(config_dir="/home/ta-tldr/Project/tldr/config/", version_base=None)
cfg = compose(config_name="LongVideoBench.yaml")
dataset_root = cfg.LongVideoBench.dataset_root
video_root = dataset_root + "videos/"

# per_token = True
per_token = False

# load csv file
df_info = pd.read_csv(Path(dataset_root, "csv", "response_mutual_info_text_task.csv"), delimiter="|")
df_comp = pd.read_csv(Path(dataset_root, "csv", "response_mutual_info_video_text.csv"), delimiter="|")
CoT_df_info = pd.read_csv(Path(dataset_root, "csv", "CoT_mutual_info_text_task.csv"), delimiter="|")
CoT_df_comp = pd.read_csv(Path(dataset_root, "csv", "CoT_mutual_info_video_text.csv"), delimiter="|")

# read in response data
cot_responses_df = pd.read_csv('/nas/pohan/datasets/LongVideoBench/csv/vlm_CoT.csv', delimiter='|')
vlm_responses_df = pd.read_csv('/nas/pohan/datasets/LongVideoBench/csv/vlm_responses.csv', delimiter='|')

In [3]:
# Filter out rows with -10 in logprob_text or logprob
def filter_logprob(df_info, threshold=-10):
    return set(df_info[(df_info['logprob_text'] > threshold) & (df_info['logprob'] > threshold)]['record_id'])

# Apply the function to each dataframe
filtered_record_ids = (filter_logprob(df_info) & filter_logprob(CoT_df_info))
df_info_filtered = df_info[df_info['record_id'].isin(filtered_record_ids)]
CoT_info_filtered = CoT_df_info[CoT_df_info['record_id'].isin(filtered_record_ids)]
vlm_responses_df = vlm_responses_df[vlm_responses_df['record_id'].isin(filtered_record_ids)]

In [4]:
# Count records per video
# sampled_rids = df_info_filtered['record_id'].unique()
sampled_rids = ['N7RTTiHsSjI_1', '1vvYsirvA2I_1', 'BktEeBeA7a8_1', 'JIoE18JYGcM_1', 'P0BSTjziVys_1', 'Bjymxow3TVQ_0', 'athabNMGceo_1', 'VSZ8ywgGNGM_0', '88LbBgZP1vQ_0', 'VkNF0rXuDXw_1']
print(f"Total number of unique record_ids: {len(sampled_rids)}")

Total number of unique record_ids: 10


In [5]:
# only keep the rows that are in the sampled_record_ids
df_info_filtered = df_info_filtered[df_info_filtered['record_id'].isin(sampled_rids)]
CoT_info_filtered = CoT_info_filtered[CoT_info_filtered['record_id'].isin(sampled_rids)]
df_comp = df_comp[df_comp['record_id'].isin(sampled_rids)]
CoT_df_comp = CoT_df_comp[CoT_df_comp['record_id'].isin(sampled_rids)]
vlm_responses_df = vlm_responses_df[vlm_responses_df['record_id'].isin(sampled_rids)]

In [6]:
def initialize_dataframe():
    columns = ['record_id', 'group', 'predicted_word_video', 'predicted_word', 'orig_word', 'complexity_score', 'rank_video', 'rank']
    dtypes = {
        'record_id': 'int', 
        'group': 'object',
        'predicted_word_video': 'object',
        'predicted_word': 'object',        
        'orig_word': 'object',             
        'complexity_score': 'float', 
        'rank_video': 'object',            
        'rank': 'object'                   
    }
    return pd.DataFrame(columns=columns).astype(dtypes)

def process_data(df_info_filtered, df_comp):
    df_comp_total = initialize_dataframe()
    log_comp_scores = []
    comp_ranks = []
    max_id, min_id = None, None
    max_diff, min_diff = -100, 100
    num_tokens = []
    # Filter out rows with -10 in logprob_video or logprob
    for record_id in df_info_filtered['record_id'].unique():
        for group in df_comp['group'].unique():
            row = df_comp[(df_comp['record_id'] == record_id) & (df_comp['group'] == group)]
            if per_token:
                diff_count = len(row)
            else:
                diff_count = 1  # Placeholder value
            diff = row['diff'].sum()
            if diff == 0:
                print(f"Diff is 0 for record_id: {record_id}, group: {group}. This might be due to no video.")
                comp_ranks.append(0)
                num_tokens.append(1)
            else:
                log_comp_scores.append(diff / diff_count)
                comp_ranks.append(row['logprob_video'].values[0] - row['logprob'].values[0])
                num_tokens.append(diff_count)
                
            if diff > max_diff:
                max_diff = diff
                max_id = record_id
            if diff < min_diff:
                min_diff = diff
                min_id = record_id
            
            new_row = pd.DataFrame({
                'record_id': [record_id], 
                'group': [group],
                'predicted_word_video': [row['predicted_word_video'].values], 
                'predicted_word': [row['predicted_word'].values], 
                'orig_word': [row['orig_word'].values], 
                'complexity_score': [diff / diff_count], 
                'rank_video': [row['rank_video'].values], 
                'rank': [row['rank'].values]
            })
            df_comp_total = pd.concat([df_comp_total, new_row], ignore_index=True)
    
    return df_comp_total, log_comp_scores, comp_ranks, max_id, min_id, num_tokens


# Process data
df_comp_total, log_comp_scores, comp_ranks, max_id, min_id, num_tokens = process_data(df_info_filtered, df_comp)
CoT_df_comp_total, CoT_log_comp_scores, CoT_comp_ranks, CoT_max_id, CoT_min_id, CoT_num_tokens = process_data(df_info_filtered, CoT_df_comp)

# print(f"Max complexity id: {max_id}, Min complexity id: {min_id}")
# print(f"Max CoT complexity id: {CoT_max_id}, Min CoT complexity id: {CoT_min_id}")
# print(f"Avg num of tokens: {sum(num_tokens) / len(num_tokens)}")

In [14]:
vlm_responses_df[vlm_responses_df['record_id'] == 'JIoE18JYGcM_1']

Unnamed: 0,video_id,record_id,response,temperature
45,JIoE18JYGcM,JIoE18JYGcM_1,The video explores the unique physical adaptat...,0.0
46,JIoE18JYGcM,JIoE18JYGcM_1,The video discusses the unique ability of flam...,0.5
47,JIoE18JYGcM,JIoE18JYGcM_1,This video from SciShow explores the unique on...,1.0
48,JIoE18JYGcM,JIoE18JYGcM_1,This video from SciShow covers the unique one-...,1.25
49,JIoE18JYGcM,JIoE18JYGcM_1,This video teaches the physics principle regar...,1.5


In [8]:
# set random seed
random.seed(42)

random_responses = pd.DataFrame(columns=['record_id', 'random_response'])

vlm_stimuli_by_record = vlm_responses_df.groupby('record_id')
for record_id, group in vlm_stimuli_by_record:
    # ramdomly select a response
    random_response = group['response'].sample(n=1).values[0]
    random_responses = pd.concat([random_responses, pd.DataFrame({
        'record_id': [record_id], 
        'random_response': [random_response]
    })], ignore_index=True)

random_responses

Unnamed: 0,record_id,random_response
0,1vvYsirvA2I_1,The video shows the preparation of a colorful ...
1,88LbBgZP1vQ_0,The video documents the activities of a short-...
2,Bjymxow3TVQ_0,A workshop instructor demonstrates techniques ...
3,BktEeBeA7a8_1,A serene forest scene shows a woman enjoying n...
4,JIoE18JYGcM_1,The video discusses the unique ability of flam...
5,N7RTTiHsSjI_1,"In the video, while employees interact in an o..."
6,P0BSTjziVys_1,The video teaches tips on balancing chemical e...
7,VSZ8ywgGNGM_0,The video explores the fascinating effects of ...
8,VkNF0rXuDXw_1,Using stick figures and physical calculations ...
9,athabNMGceo_1,The video details unique digestive traits and ...


In [9]:
info_scores = df_info_filtered[['record_id', 'group', 'informativeness_score']].copy()
info_scores['temperature'] = info_scores['group'].apply(lambda x: float(x.split('_')[1]) if len(x.split('_')) > 1 else None)


# Process the responses dataframe
maxinfo_responses = pd.DataFrame(columns=['record_id', 'maxinfo_response'])

# Track the current index for adding rows
index = 0

for rid in sampled_rids:
    # find the max complexity score and its corresponding temperature
    record_scores = info_scores[info_scores['record_id'] == rid]
    max_info_score = record_scores['informativeness_score'].max()
    max_temp = record_scores.loc[record_scores['informativeness_score'] == max_info_score, 'temperature'].values[0]
    # print(f"Record ID: {rid}, Max complexity score: {max_comp_score}, Temperature: {max_temp}")

    maxinfo_response = vlm_responses_df[(vlm_responses_df['record_id'] == rid) & (vlm_responses_df['temperature'] == max_temp)]['response'].values[0]
    # print(f"Max complexity response: {maxcomp_response}")

    # Add the row to the dataframe
    maxinfo_responses.loc[index] = {'record_id': rid, 'maxinfo_response': maxinfo_response}
    index += 1

maxinfo_responses

Unnamed: 0,record_id,maxinfo_response
0,N7RTTiHsSjI_1,"After completing work in an office building, M..."
1,1vvYsirvA2I_1,The video shows the preparation of a colorful ...
2,BktEeBeA7a8_1,After enjoying her reading while preparing let...
3,JIoE18JYGcM_1,The video explores the unique physical adaptat...
4,P0BSTjziVys_1,The video teaches tips on balancing chemical e...
5,Bjymxow3TVQ_0,A workshop instructor demonstrates the process...
6,athabNMGceo_1,The video details various animals with unique ...
7,VSZ8ywgGNGM_0,The video explores the fascinating effects of ...
8,88LbBgZP1vQ_0,The short-haired man accompanied by a long-hai...
9,VkNF0rXuDXw_1,The video teaches the concept of useful vs. no...


In [10]:
# Extract complexity scores and add temperature information
comp_scores = df_comp_total[['record_id', 'group', 'complexity_score']].copy()
comp_scores['temperature'] = comp_scores['group'].apply(lambda x: float(x.split('_')[1]) if len(x.split('_')) > 1 else None)


# Create dataframe for maximum complexity responses
maxcomp_responses = pd.DataFrame(columns=['record_id', 'maxcomp_response'])

# Track the current index for adding rows
index = 0

for rid in sampled_rids:
    # find the max complexity score and its corresponding temperature
    record_scores = comp_scores[comp_scores['record_id'] == rid]
    max_comp_score = record_scores['complexity_score'].max()
    max_temp = record_scores.loc[record_scores['complexity_score'] == max_comp_score, 'temperature'].values[0]
    # print(f"Record ID: {rid}, Max complexity score: {max_comp_score}, Temperature: {max_temp}")

    maxcomp_response = vlm_responses_df[(vlm_responses_df['record_id'] == rid) & (vlm_responses_df['temperature'] == max_temp)]['response'].values[0]
    # print(f"Max complexity response: {maxcomp_response}")

    # Add the row to the dataframe
    maxcomp_responses.loc[index] = {'record_id': rid, 'maxcomp_response': maxcomp_response}
    index += 1

maxcomp_responses

Unnamed: 0,record_id,maxcomp_response
0,N7RTTiHsSjI_1,The video depicts a surreal and humorous scena...
1,1vvYsirvA2I_1,The video shows the preparation of a blue-tint...
2,BktEeBeA7a8_1,The video portrays a woman preparing for a rel...
3,JIoE18JYGcM_1,This video from SciShow covers the unique one-...
4,P0BSTjziVys_1,The video teaches tips for balancing chemical ...
5,Bjymxow3TVQ_0,A workshop instructor demonstrates the process...
6,athabNMGceo_1,The video details unique digestive traits and ...
7,VSZ8ywgGNGM_0,A woman crouches to interact playfully with a ...
8,88LbBgZP1vQ_0,The video documents the activities of a short-...
9,VkNF0rXuDXw_1,Using stick figures and physical calculations ...


In [11]:
meta_data = json.load(open(dataset_root + "/lvb_val.json", "r"))
sampled_stimuli = pd.DataFrame(columns=['record_id', 'duration', 'topic_category', 'video_path', 'question', 'answer', 'vlm_answer', 'cot'])

for i, entry in enumerate(meta_data):
    if entry['id'] in sampled_rids:
        new_row = pd.DataFrame({
            'record_id': [entry['id']],
            'duration': [entry['duration']],
            'topic_category': [entry['topic_category']],
            'video_path': [entry['video_path']],
            'question': [entry['question']],
            'answer': [cot_responses_df[cot_responses_df['record_id'] == entry['id']]['answer'].values[0]],
            'vlm_answer': [cot_responses_df[cot_responses_df['record_id'] == entry['id']]['vlm_answer'].values[0]],
            'cot': [cot_responses_df[cot_responses_df['record_id'] == entry['id']]['vlm_reasoning'].values[0]],
            'random_response': [random_responses[random_responses['record_id'] == entry['id']]['random_response'].values[0]],
            'maxinfo_response': [maxinfo_responses[maxinfo_responses['record_id'] == entry['id']]['maxinfo_response'].values[0]],
            'maxcomp_response': [maxcomp_responses[maxcomp_responses['record_id'] == entry['id']]['maxcomp_response'].values[0]]
        })
        sampled_stimuli = pd.concat([sampled_stimuli, new_row], ignore_index=True)

  sampled_stimuli = pd.concat([sampled_stimuli, new_row], ignore_index=True)


In [12]:
# Calculate accuracy of VLM answers compared to ground truth answers
correct_answers = 0
total_answers = len(sampled_stimuli)

for _, row in sampled_stimuli.iterrows():
    # Extract just the letter from vlm_answer (e.g., extract 'A' from '(A) The man wearing...')
    vlm_letter = None
    if isinstance(row['vlm_answer'], str) and '(' in row['vlm_answer'] and ')' in row['vlm_answer']:
        vlm_letter = row['vlm_answer'].split('(')[1].split(')')[0].strip().upper()
    
    # Compare with the answer (which should be a letter A-E)
    # Rule as incorrect if vlm_answer contains "None of the options" or similar phrases
    if (vlm_letter and 
        vlm_letter.upper() == row['answer'].strip().upper() and
        not any(phrase in row['vlm_answer'].lower() for phrase in ["none of the options", "none of the above"])):
        correct_answers += 1

accuracy = correct_answers / total_answers if total_answers > 0 else 0
print(f"VLM Answer Accuracy: {accuracy*100:.2f}% ({correct_answers}/{total_answers})")

# # Display the dataframe
# sampled_stimuli.head()

VLM Answer Accuracy: 40.00% (4/10)


In [13]:
# Create a more structured and visually appealing display for each sample
from IPython.display import display, HTML, clear_output

def display_sample(index, total, row):
    clear_output(wait=True)
    video_path_full = f"{video_root}{row['video_path']}"
    print(f"{video_path_full}")
    file_size = os.path.getsize(video_path_full) / (1024 * 1024)  # Size in MB
    # Convert duration to minutes and seconds
    minutes = int(row['duration'] // 60)
    seconds = int(row['duration'] % 60)
    duration_str = f"{minutes} min {seconds} s"
    
    # Find the corresponding entry in meta_data to get candidates
    meta_entry = next((item for item in meta_data if item['id'] == row['record_id']), None)
    
    # Format candidates as options if available
    candidates_html = ""
    if meta_entry and 'candidates' in meta_entry:
        candidates = meta_entry['candidates']
        options = ['A', 'B', 'C', 'D', 'E']
        candidates_html = "<div style='background-color:#F5F5F5; padding:10px; border-radius:5px; margin:10px 0;'>"
        for i, candidate in enumerate(candidates):
            if i < len(options):
                candidates_html += f"<p><b>({options[i]})</b> {candidate}</p>"
        candidates_html += "</div>"
    
    html = f"""
    <div style="border:2px solid #4472C4; border-radius:10px; padding:15px; margin:10px 0;">
        <h3 style="color:#4472C4">Sample {index+1}/{total}</h3>
        <p><b>Record ID:</b> {row['record_id']}</p>
        <p><b>Topic Category:</b> {row['topic_category']}</p>
        <p><b>Duration:</b> {duration_str}</p>
        <p><b>File Size:</b> {file_size:.2f} MB</p>
        <hr style="border-top:1px solid #E0E0E0">
        <p><b>Question:</b> {row['question']}</p>
        {candidates_html}
        <div style="background-color:#E8F0FE; padding:10px; border-radius:5px; margin:10px 0;">
            <p><b>Correct Answer:</b> {row['answer']}</p>
        </div>
        <div style="background-color:#FFF3E0; padding:10px; border-radius:5px; margin:10px 0;">
            <p><b>VLM Answer:</b> {row['vlm_answer']}</p>
        </div>
        <div style="background-color:#E8F5E9; padding:10px; border-radius:5px; margin:10px 0;">
            <p><b>VLM Reasoning:</b> {row['cot']}</p>
        </div>
        <div style="background-color:#F9E4EC; padding:10px; border-radius:5px; margin:10px 0;">
            <p><b>Random Response:</b> {row['random_response']}</p>
        </div>
        <div style="background-color:#E0F7FA; padding:10px; border-radius:5px; margin:10px 0;">
            <p><b>Max Info Response:</b> {row['maxinfo_response']}</p>
        </div>
        <div style="background-color:#F3E5F5; padding:10px; border-radius:5px; margin:10px 0;">
            <p><b>Max Comp Response:</b> {row['maxcomp_response']}</p>
        </div>
    </div>
    <p>Press Enter to continue to next sample, or type 'exit' to stop...</p>
    """
    display(HTML(html))

# Sort samples by duration
sampled_stimuli = sampled_stimuli.sort_values(by='duration', ascending=False)

total_samples = len(sampled_stimuli)

# Display one sample per record_id
for i, (_, row) in enumerate(sampled_stimuli.iterrows()):
    display_sample(i, total_samples, row)
    user_input = input()  # Wait for user input before showing next sample
    if user_input.lower() == 'exit':
        print("Exiting the sample viewer.")
        break


/nas/pohan/datasets/LongVideoBench/videos/88LbBgZP1vQ.mp4


Exiting the sample viewer.
