In [6]:
from pathlib import Path
from collections import defaultdict
import re

root = Path("/data0/yfliu/outputs/baseline/nonparallel/output_top200/1gbs64_favc_sbasedonc_lstmvclub_unit_finetuned/lrs3/test_samples_N+swapped")
output_path = Path("./lrs3_negs.txt")

# Converted clips are named {src_idx}_test_{tgt_spk}_{tgt_idx}_vc.mp4, where
# src_idx is exactly five digits.
# NOTE(review): the target-speaker class [A-Za-z0-9_] rejects "-"; if speaker
# IDs may contain "-", those clips are skipped silently -- confirm.
CLIP_NAME_RE = re.compile(r"(\d{5})_test_([A-Za-z0-9_]+)_\d+_vc\.mp4")

# Layout: source speaker -> source video ID -> target-speaker set + file list
speaker_video_map = defaultdict(lambda: defaultdict(lambda: {"targets": set(), "files": []}))

for file in root.rglob("*.mp4"):
    # The source speaker is the name of the directory holding the clip. A clip
    # lying directly in `root` has no speaker directory, so skip it instead of
    # attributing it to the root folder's own name.
    if file.parent == root:
        continue

    # fullmatch: the whole file name has to follow the naming scheme, so names
    # with trailing junk (e.g. "..._vc.mp4.bak.mp4") are rejected as well.
    match = CLIP_NAME_RE.fullmatch(file.name)
    if not match:
        continue

    vid_idx, tgt_spk = match.groups()
    src_spk = file.parent.name
    src_video_id = f"{src_spk}_{vid_idx}"

    entry = speaker_video_map[src_spk][src_video_id]
    entry["targets"].add(tgt_spk)
    entry["files"].append(str(file.relative_to(root)))

# Resulting structure:
#   speaker_video_map: {src_spk: videos}
#   videos:            {"{src_spk}_{src_id}": video}
#   video:             {"targets": {tgt_spk, ...}, "files": [path, ...]}
#   each path:         {src_spk}/{src_id}_test_{tgt_spk}_{tgt_id}_vc.mp4 (relative to root)

# For every source speaker, report the source video with the most converted files.
# The report contains a non-ASCII arrow, so the encoding is pinned to UTF-8
# rather than left to the platform's locale default.
with output_path.open("w", encoding="utf-8") as f:
    for src_spk, videos in sorted(speaker_video_map.items()):
        # Ranking is by number of files, not by number of unique target
        # speakers; on a tie max() keeps the video met first during the scan.
        best_video = max(videos.items(), key=lambda item: len(item[1]["files"]))
        src_video_id, info = best_video

        f.write(f"Source Speaker: {src_spk}\n")
        f.write(f"  → Source Video: {src_video_id} ({len(info['targets'])} unique target speakers and {len(info['files'])} files)\n")
        for filepath in sorted(info["files"]):
            f.write(f"    - {filepath}\n")
        f.write("\n")

print(f"✅ Done. Written to: {output_path.resolve()}")


✅ Done. Written to: /home/yfliu/upstream_backup/ViSVIC-Demo/lrs3_negs.txt


In [25]:
from pathlib import Path
from collections import defaultdict
import re

root = Path("/data0/yfliu/outputs/baseline/nonparallel/output_top200/1gbs64_favc_sbasedonc_lstmvclub_unit_finetuned/lrs3/test_samples_N+swapped")
output_path = Path("./lrs3_negs_finals.txt")

# How many source videos (ranked by number of converted files) to keep.
TOP_K = 4

# Converted clips are named {src_idx}_test_{tgt_spk}_{tgt_idx}_vc.mp4, where
# src_idx is exactly five digits.
# NOTE(review): the target-speaker class [A-Za-z0-9_] rejects "-"; if speaker
# IDs may contain "-", those clips are skipped silently -- confirm.
CLIP_NAME_RE = re.compile(r"(\d{5})_test_([A-Za-z0-9_]+)_\d+_vc\.mp4")

# Layout: source speaker -> source video ID -> target-speaker set + file list
speaker_video_map = defaultdict(lambda: defaultdict(lambda: {"targets": set(), "files": []}))

for file in root.rglob("*.mp4"):
    # The source speaker is the name of the directory holding the clip. A clip
    # lying directly in `root` has no speaker directory, so skip it instead of
    # attributing it to the root folder's own name.
    if file.parent == root:
        continue

    # fullmatch: the whole file name has to follow the naming scheme, so names
    # with trailing junk (e.g. "..._vc.mp4.bak.mp4") are rejected as well.
    match = CLIP_NAME_RE.fullmatch(file.name)
    if not match:
        continue

    vid_idx, tgt_spk = match.groups()
    src_spk = file.parent.name
    src_video_id = f"{src_spk}_{vid_idx}"

    entry = speaker_video_map[src_spk][src_video_id]
    entry["targets"].add(tgt_spk)
    entry["files"].append(str(file.relative_to(root)))

# Resulting structure:
#   speaker_video_map: {src_spk: videos}
#   videos:            {"{src_spk}_{src_id}": video}
#   video:             {"targets": {tgt_spk, ...}, "files": [path, ...]}
#   each path:         {src_spk}/{src_id}_test_{tgt_spk}_{tgt_id}_vc.mp4 (relative to root)

# Flat list of (src_video_id, info) over all speakers, in sorted-speaker order
# and, within a speaker, in scan order. Later ranking relies on this order to
# break ties, so it is kept as is.
all_videos = []

# The report contains a non-ASCII arrow, so the encoding is pinned to UTF-8
# rather than left to the platform's locale default.
with output_path.open("w", encoding="utf-8") as f:
    for src_spk, videos in sorted(speaker_video_map.items()):
        all_videos.extend(videos.items())

        # Per-speaker summary kept for reference (optional): the source video
        # of this speaker that has the most converted files.
        best_video = max(videos.items(), key=lambda item: len(item[1]["files"]))
        src_video_id, info = best_video
        f.write(f"Source Speaker: {src_spk}\n")
        f.write(f"  → Source Video: {src_video_id} ({len(info['targets'])} unique target speakers and {len(info['files'])} files)\n")
        for filepath in sorted(info["files"]):
            f.write(f"    - {filepath}\n")
        f.write("\n")

# Globally rank the source videos by file count and keep the TOP_K largest.
# sorted() is stable, so ties keep their order from all_videos.
top_videos = sorted(all_videos, key=lambda x: len(x[1]["files"]), reverse=True)[:TOP_K]
top5 = top_videos  # legacy alias; despite its name it holds TOP_K entries
best_videos = {video_id: (video_id, info) for video_id, info in top_videos}

# Show the selected videos together with their files.
print(f"Top {TOP_K} Videos with Most Files:")
for video_id, (_, info) in best_videos.items():
    print(f"Video ID: {video_id}")
    print(f"  → {len(info['targets'])} unique target speakers, {len(info['files'])} files")
    for filepath in sorted(info["files"]):
        print(f"    - {filepath}")
    print()

Top 4 Videos with Most Files:
Video ID: 81Ub0SMxZQo_00002
  → 4 unique target speakers, 69 files
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00001_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00003_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00006_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00011_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00012_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00013_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00014_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00016_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00017_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00019_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00020_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00021_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00022_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00024_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00025_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18c_00027_vc.mp4
    - 81Ub0SMxZQo/00002_test_UAj1hsXp18

In [30]:
import re

def extract_key(filepath):
    """Return the target key embedded in a converted clip's file name.

    The key is the text between ``test_`` and the trailing ``_vc.mp4`` of the
    file name, e.g. ``"spk/00002_test_UAj1hsXp18c_00001_vc.mp4"`` yields
    ``"UAj1hsXp18c_00001"``.

    Args:
        filepath: Path of the clip as a string; ``/`` and ``\\`` are both
            accepted as separators.

    Returns:
        The key string, or ``None`` when the file name does not follow the
        ``..._test_<key>_vc.mp4`` scheme.
    """
    # Look only at the last path component: a directory name that happens to
    # contain "test_" must not be able to start the match.
    filename = filepath.replace("\\", "/").rsplit("/", 1)[-1]
    match = re.search(r'test_(.*?)_vc\.mp4$', filename)
    return match.group(1) if match else None

# Step 1: choose the source videos to compare.
# Videos are ranked by number of converted files (descending) and the ranks
# listed below are kept.
# NOTE(review): rank 2 is skipped by the original selection; the reason is not
# recorded in this notebook -- confirm it is still intended.
SELECTED_RANKS = (0, 1, 3, 4)
sorted_lines = sorted(all_videos, key=lambda x: len(x[1]["files"]), reverse=True)
tops = [sorted_lines[rank] for rank in SELECTED_RANKS if rank < len(sorted_lines)]
selected = dict(tops)  # {video_id: info}
output_path = Path("./lrs3_negs_finals.txt")

# Step 2: for each selected video, map every target key to its file.
key_sets = []
file_maps = {}  # {video_id: {key: full_path}}

for video_id, info in selected.items():
    key_to_file = {}
    for fpath in info["files"]:
        key = extract_key(fpath)
        if key:
            key_to_file[key] = fpath
    file_maps[video_id] = key_to_file
    key_sets.append(set(key_to_file))

# Step 3: keys present in every selected video. set.intersection() needs at
# least one argument, so an empty selection yields an empty set instead of a
# TypeError.
common_keys = set.intersection(*key_sets) if key_sets else set()

# Step 4: build best_videos whose "files" hold only the common entries.
# Each info dict is copied rather than edited in place: the originals are
# shared with all_videos, and shrinking them there would change the ranking
# the next time this cell is run.
best_videos = {
    video_id: (
        video_id,
        {**info, "files": sorted(file_maps[video_id][key] for key in common_keys)},
    )
    for video_id, info in selected.items()
}

# Step 5: write the report. The with-block guarantees the file is flushed and
# closed; UTF-8 is pinned because the text contains a non-ASCII arrow.
with output_path.open("w", encoding="utf-8") as fw:
    fw.write("Top 4 Videos with Common Files (based on test_* key):\n")
    for video_id, (_, info) in best_videos.items():
        fw.write(f"Video ID: {video_id}\n")
        fw.write(f"  → {len(info['targets'])} unique target speakers, {len(info['files'])} common files\n")
        for filepath in info["files"]:
            fw.write(f"    - {filepath}\n")
