In [179]:
import nltk
import json
import glob
import re
from moviepy.editor import VideoFileClip
import json
import os
import copy
import nltk

# Make sure NLTK's "punkt_tab" package is available locally; the cell log
# below shows this is a no-op when the package is already up to date.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /Users/ohadr/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [180]:
import datasets

In [None]:
# Scratch cell: a bare attribute lookup on `datasets`; the value is not
# assigned to anything.
datasets.Audio

In [175]:


def sanitize(x):
    """Reduce a title to a plain ASCII alphanumeric key.

    Each separator token below is swapped for a single space (stripping the
    ends after every swap), one pass of double-space collapsing follows, and
    finally every character that is not an ASCII letter, digit or whitespace
    is dropped.

    Args:
        x: The raw string (file stem or video name).

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    separators = ("：", "，", "｜", "  ", " ", "#")
    cleaned = x
    for token in separators:
        cleaned = cleaned.replace(token, " ")
        cleaned = cleaned.strip()
    # Single, non-repeating pass: runs of three or more spaces are only shortened.
    cleaned = cleaned.replace("  ", " ").strip()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned)
    return alnum_only.strip()


def align_file_names_and_video_names(dataset_dir, downloaded_glob):
    """Pair every annotated video name with its downloaded file.

    File basenames (the text before the first "[") and the ``video_name``
    values found in the train/val/test split files are both passed through
    ``sanitize``, sorted by that sanitized key, and then paired positionally.

    Args:
        dataset_dir: Directory containing ``split/{train,val,test}.json``.
        downloaded_glob: Glob pattern matching the downloaded files.

    Returns:
        dict mapping each original ``video_name`` to ``(game_id, file_path)``,
        where ``game_id`` is the position of the pair in the sorted order.

    Raises:
        AssertionError: If there are fewer downloaded files than video names,
            or if the sanitized names at some position do not match.
    """
    existing_files = glob.glob(downloaded_glob)

    # Collect the distinct video names referenced by any split.
    unique_video_names = set()
    for split in ["train", "val", "test"]:
        with open(f"{dataset_dir}/split/{split}.json", "r") as split_file:
            unique_video_names.update(datum["video_name"] for datum in json.load(split_file))

    file_names = sorted(
        ((sanitize(path.split("/")[-1].split("[")[0].strip()), path) for path in existing_files),
        key=lambda pair: pair[0],
    )
    video_names = sorted(
        ((sanitize(name), name) for name in unique_video_names),
        key=lambda pair: pair[0],
    )

    # zip() below stops at the shorter list; without this check, video names
    # with no file would be dropped silently and only surface later as a
    # KeyError on the returned mapping.
    assert len(file_names) >= len(video_names), (
        f"{len(video_names)} video names but only {len(file_names)} downloaded files"
    )

    mapping = {}
    for game_id, ((file_key, file_path), (video_key, video_name)) in enumerate(
        zip(file_names, video_names)
    ):
        assert file_key == video_key, f"{file_key} != {video_key}"
        mapping[video_name] = (game_id, file_path)
    return mapping


def timestamp_to_seconds(timestamp):
    """Convert an ``H:M:S`` or ``M:S`` timestamp string to whole seconds.

    The two-field form is accepted so that this helper agrees with
    ``get_duration_seconds``, which reads the same ``startTime``/``endTime``
    fields and already tolerates both layouts.

    Args:
        timestamp: Colon-separated string of integer fields.

    Returns:
        int: Total number of seconds.

    Raises:
        ValueError: If the string does not have two or three fields, or a
            field is not an integer.
    """
    fields = timestamp.split(":")
    if len(fields) not in (2, 3):
        raise ValueError(f"expected H:M:S or M:S, got {timestamp!r}")
    total = 0
    for field in fields:
        # Each step promotes the running total by one base-60 position.
        total = total * 60 + int(field)
    return total


mapping = align_file_names_and_video_names(dataset_dir="/Users/ohadr/Werewolf/youtube",
                                           downloaded_glob="/Volumes/4TB/werewolf/raw_files/*")

# Cut one mp3 per annotated game out of its source video. Games whose output
# file already exists are skipped, so the cell can be re-run to resume.
folder = "/Volumes/4TB/werewolf/saved_games"
for split in ["train", "val", "test"]:
    with open(f"/Users/ohadr/Werewolf/youtube/split/{split}.json", "r") as f:
        games = json.load(f)
    # The split file is closed before the (slow) encoding loop starts.
    for datum in games:
        file_idx, video_name = mapping[datum["video_name"]]
        output_filename = f"{folder}/file_{file_idx}_{datum['Game_ID']}_clip.mp3"  # Using Game_ID from metadata
        if os.path.exists(output_filename):
            continue

        start_seconds = timestamp_to_seconds(datum["startTime"])
        end_seconds = timestamp_to_seconds(datum["endTime"])

        # `video_name` is the path of the downloaded file (second item of the
        # mapping tuple), not the annotated title.
        video = VideoFileClip(video_name)
        try:
            # Only the audio track of the game's time window is needed.
            audio = video.audio.subclip(start_seconds, end_seconds)
            audio.write_audiofile(output_filename)
        finally:
            # Release the clip even when extraction or encoding fails.
            video.close()


def get_player_start_roles(datum):
    """Map each player name to the role at the same position in ``startRoles``.

    Args:
        datum: Game record with ``playerNames`` and ``startRoles`` sequences.

    Returns:
        dict of player name -> starting role; pairing stops at the shorter
        of the two sequences.
    """
    names, roles = datum["playerNames"], datum["startRoles"]
    return {name: role for name, role in zip(names, roles)}


def get_player_end_roles(datum):
    """Map each player name to the role at the same position in ``endRoles``.

    Args:
        datum: Game record with ``playerNames`` and ``endRoles`` sequences.

    Returns:
        dict of player name -> final role; pairing stops at the shorter of
        the two sequences.
    """
    names, roles = datum["playerNames"], datum["endRoles"]
    return {name: role for name, role in zip(names, roles)}


def get_duration_seconds(start_time, end_time):
    """Return the number of seconds between two timestamps.

    Each timestamp is a colon-separated string read as ``H:M:S`` when it has
    three fields and as ``M:S`` otherwise.

    Args:
        start_time: Timestamp string marking the beginning.
        end_time: Timestamp string marking the end.

    Returns:
        int: ``end_time`` minus ``start_time`` in seconds (negative when the
        end precedes the start).
    """

    def _fields_to_seconds(fields):
        # Three fields carry an hour component; anything else must unpack
        # as exactly minutes and seconds.
        if len(fields) == 3:
            hours, minutes, seconds = fields
            return int(hours) * 3600 + int(minutes) * 60 + int(seconds)
        minutes, seconds = fields
        return int(minutes) * 60 + int(seconds)

    start_fields = start_time.split(":")
    end_fields = end_time.split(":")
    start_total = _fields_to_seconds(start_fields)
    end_total = _fields_to_seconds(end_fields)
    return end_total - start_total


def parse_timestamp(timestamp):
    """Convert an ``M:S`` or ``H:M:S`` timestamp string to whole seconds.

    The three-field form is accepted so that an utterance stamped past the
    one-hour mark does not crash the parser, matching what
    ``get_duration_seconds`` already tolerates. Local names also no longer
    shadow the builtin ``min``.

    Args:
        timestamp: Colon-separated string of integer fields.

    Returns:
        int: Total number of seconds.

    Raises:
        ValueError: If the string does not have two or three fields, or a
            field is not an integer.
    """
    fields = timestamp.split(":")
    if len(fields) not in (2, 3):
        raise ValueError(f"expected M:S or H:M:S, got {timestamp!r}")
    total = 0
    for field in fields:
        # Each step promotes the running total by one base-60 position.
        total = total * 60 + int(field)
    return total


def add_endtimes(datum, padding_seconds=3):
    """Return the game's dialogue with timing and role fields added.

    Every utterance, including the last one, is deep-copied and rewritten to
    the same schema: ``start_time_in_seconds``, ``end_time_in_seconds``,
    ``target`` (the ``annotation`` list joined with ", "), ``start_role`` and
    ``end_role``; the raw ``Rec_Id``, ``timestamp`` and ``annotation`` keys
    are removed.

    An utterance ends ``padding_seconds`` after the next utterance starts,
    capped at the clip duration; the final utterance ends at the clip
    duration. Timestamps are compared directly against that duration, so they
    are presumably offsets from the clip start — TODO confirm.

    Args:
        datum: Game record with ``Dialogue``, ``startTime``, ``endTime``,
            ``playerNames``, ``startRoles`` and ``endRoles``.
        padding_seconds: Seconds an utterance may run past the start of the
            next one. Defaults to 3.

    Returns:
        list of new utterance dicts; ``datum`` itself is not modified. An
        empty dialogue yields an empty list.
    """
    dialogue = datum["Dialogue"]
    if not dialogue:
        return []

    final_time = get_duration_seconds(datum["startTime"], datum["endTime"])
    start_roles = get_player_start_roles(datum)
    end_roles = get_player_end_roles(datum)

    new_dialogue = []
    last_index = len(dialogue) - 1
    for i, raw in enumerate(dialogue):
        curr = copy.deepcopy(raw)
        curr["start_time_in_seconds"] = parse_timestamp(raw["timestamp"])
        if i < last_index:
            next_time = parse_timestamp(dialogue[i + 1]["timestamp"])
            curr["end_time_in_seconds"] = min(next_time + padding_seconds, final_time)
        else:
            # Nothing follows the last utterance, so it runs to the clip end.
            curr["end_time_in_seconds"] = final_time

        curr.pop("Rec_Id", None)
        curr.pop("timestamp")
        curr["target"] = ", ".join(curr.pop("annotation"))
        # Speakers missing from the player list get the "unknown" role.
        curr["start_role"] = start_roles.get(curr["speaker"], "unknown")
        curr["end_role"] = end_roles.get(curr["speaker"], "unknown")
        new_dialogue.append(curr)

    return new_dialogue


import json

# Re-read every split from disk and enrich each game record in place:
# `Dialogue` is replaced by the output of add_endtimes and `file_name` is set
# to the clip name used by the audio-extraction loop above.
# NOTE(review): the loop variables (`datum`, `data`, `video_name`, ...) stay
# bound at module level after this cell; later cells read `datum` directly.
final_data = {}
for split in ["train", "val", "test"]:
    with open(f"/Users/ohadr/Werewolf/youtube/split/{split}.json", "r") as f:
        data = json.load(f)
    for datum in data:
        datum["Dialogue"] = add_endtimes(datum)
        file_idx, video_name = mapping[datum["video_name"]]
        datum["file_name"] = f"file_{file_idx}_{datum['Game_ID']}_clip.mp3"
    final_data[split] = data

# Save mapping to JSON file (the (game_id, file_path) tuples are written as
# JSON arrays).
with open("/Volumes/4TB/werewolf/saved_games/mapping.json", "w") as f:
    json.dump(mapping, f, indent=4)

In [185]:
# NOTE(review): `datum` is not defined in this cell; it is whatever game the
# preceding `for datum in ...` loop left bound, so this depends on kernel state.
dialogues = datum["Dialogue"]

In [194]:
import io
from pydub import AudioSegment


def utterence_to_audio_snippet(start_time_in_seconds, end_time_in_seconds, audio):
    """Cut a time window out of ``audio`` and return it as WAV bytes.

    The window is converted to milliseconds, sliced out of ``audio``, then
    converted to a 24000 frame rate and a single channel before being
    exported in "wav" format to an in-memory buffer.

    Args:
        start_time_in_seconds: Window start, in seconds.
        end_time_in_seconds: Window end, in seconds.
        audio: Sliceable audio object offering ``set_frame_rate``,
            ``set_channels`` and ``export`` (``dialog_to_audio_list`` passes
            a pydub ``AudioSegment``).

    Returns:
        bytes: The exported buffer contents.
    """
    window_ms = slice(start_time_in_seconds * 1000, end_time_in_seconds * 1000)
    clip = audio[window_ms]
    clip = clip.set_frame_rate(24000)
    clip = clip.set_channels(1)

    wav_buffer = io.BytesIO()
    clip.export(wav_buffer, format="wav")
    return wav_buffer.getvalue()


def construct_audio_segment_and_transcript(dialogues, idx, K=10):
    """Build the audio window and transcript context for one utterance.

    The window opens ``K`` seconds before utterance ``idx`` starts (never
    before 0) and closes when that utterance ends. The transcript holds the
    utterance itself, preceded by the contiguous run of earlier utterances
    whose start time falls inside the window.

    The previous cut-off compared against ``min(start - K, 0)``, which is
    never positive, so the scan never stopped and every earlier utterance was
    returned; the cut-off is now the window start.

    Args:
        dialogues: Utterance dicts with ``start_time_in_seconds`` and
            ``end_time_in_seconds``; assumed to be in chronological order,
            since the backwards scan stops at the first utterance that
            starts before the window.
        idx: Index of the target utterance.
        K: Seconds of preceding context to include. Defaults to 10.

    Returns:
        tuple ``(start, end, utterances)``. ``utterances`` contains deep
        copies, so editing them never touches ``dialogues``.
    """
    curr = dialogues[idx]
    start, end = curr["start_time_in_seconds"], curr["end_time_in_seconds"]

    # No earlier audio or no earlier utterance: nothing to prepend.
    if start == 0 or idx == 0:
        return start, end, [copy.deepcopy(curr)]

    window_start = max(start - K, 0)

    # Walk backwards while the preceding utterance still starts in the window.
    first = idx
    while first > 0 and dialogues[first - 1]["start_time_in_seconds"] >= window_start:
        first -= 1

    utterances = [copy.deepcopy(item) for item in dialogues[first:idx]]
    utterances.append(copy.deepcopy(curr))
    return window_start, end, utterances

def construct_spans(datum, K=10):
    """Compute the (start, end, utterances) span for every utterance in a game.

    Args:
        datum: Game record whose ``Dialogue`` list is passed, index by index,
            to ``construct_audio_segment_and_transcript``.
        K: Context length in seconds forwarded to that helper. Defaults to 10.

    Returns:
        list of ``(start, end, utterances)`` tuples, one per utterance, in
        dialogue order.
    """
    turns = datum["Dialogue"]
    per_turn = (
        construct_audio_segment_and_transcript(turns, position, K=K)
        for position in range(len(turns))
    )
    return [(begin, finish, context) for begin, finish, context in per_turn]


def dialog_to_audio_list(datum, folder):
    """Pair every utterance span of a game with its audio snippet.

    Loads ``{folder}/{datum["file_name"]}`` once, then cuts one snippet per
    span returned by ``construct_spans``.

    Args:
        datum: Game record with ``file_name`` and ``Dialogue``.
        folder: Directory that holds the game's audio file.

    Returns:
        list of ``(audio_bytes, utterances)`` tuples in dialogue order.
    """
    clip_path = f"{folder}/{datum['file_name']}"
    audio = AudioSegment.from_file(clip_path)
    return [
        (utterence_to_audio_snippet(begin, finish, audio), context)
        for begin, finish, context in construct_spans(datum)
    ]

In [195]:
# Build the (audio_bytes, utterances) pairs for the game currently bound to
# `datum` (set by an earlier cell's loop, not by this one).
dialogue_data = dialog_to_audio_list(datum, folder="/Volumes/4TB/werewolf/saved_games")

In [202]:
# First (audio_bytes, utterances) pair of the game.
s = dialogue_data[0]

In [203]:
# Second item of the pair: the list of utterance dicts for that span.
s[1]

[{'speaker': 'Mitchell',
  'utterance': 'What is that?',
  'start_time_in_seconds': 3,
  'end_time_in_seconds': 35,
  'target': 'No Strategy',
  'start_role': 'Seer',
  'end_role': 'Seer'}]

In [None]:
# Inspect the first entry; an empty subscript is a SyntaxError, so an
# explicit index is required. NOTE(review): index 0 is a guess — adjust as needed.
dialogue_data[0]

In [None]:
def format_dialogue_entry(entry):
    """Render one processed utterance dict as a five-line text block.

    Args:
        entry: Dict with ``speaker``, ``utterance``, ``start_time_in_seconds``,
            ``end_time_in_seconds``, ``target``, ``start_role`` and
            ``end_role``.

    Returns:
        str: One labelled line per field group, joined with newlines and
        without a trailing newline.
    """
    lines = [
        f"Speaker: {entry['speaker']}",
        f"Utterance: {entry['utterance']}",
        f"Time: {entry['start_time_in_seconds']}s - {entry['end_time_in_seconds']}s",
        f"Target: {entry['target']}",
        f"Role Change: {entry['start_role']} -> {entry['end_role']}",
    ]
    return "\n".join(lines)