In [179]:
import nltk
import json
import glob
import re
from moviepy.editor import VideoFileClip
import json
import os
import copy
import nltk

# Fetch the "punkt_tab" NLTK resource (the recorded output shows it was already up to date).
# NOTE(review): no other visible cell calls into nltk — confirm this download is still needed.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /Users/ohadr/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [175]:


def sanitize(x):
    """Reduce a title to ASCII letters, digits and whitespace.

    Steps, in order:
      1. each entry of a fixed separator list is replaced by a space, with the
         ends stripped after every substitution;
      2. one pass turns double spaces into single spaces;
      3. every character that is not ``a-z``, ``A-Z``, ``0-9`` or whitespace is
         deleted, and the ends are stripped once more.

    Because step 3 runs after the space collapsing, deleting a punctuation
    character that sat between two spaces leaves a double space in the result
    (``"a - b"`` becomes ``"a  b"``).
    """
    separators = ("：", "，", "｜", "  ", " ", "#")
    cleaned = x
    for separator in separators:
        cleaned = cleaned.replace(separator, " ")
        cleaned = cleaned.strip()
    collapsed = cleaned.replace("  ", " ").strip()
    ascii_only = re.sub(r'[^a-zA-Z0-9\s]', '', collapsed)
    return ascii_only.strip()


def align_file_names_and_video_names(dataset_dir, downloaded_glob):
    """Pair every dataset ``video_name`` with the downloaded file that holds it.

    Both the downloaded file names (basename up to the first ``[``) and the
    dataset video names are passed through ``sanitize``; the two lists are then
    sorted by that sanitized key and paired position by position.

    Args:
        dataset_dir: directory containing ``youtube_urls_released.json`` and a
            ``split/`` folder with ``train.json``, ``val.json`` and ``test.json``.
        downloaded_glob: glob pattern matching the downloaded video files.

    Returns:
        dict mapping each original ``video_name`` to ``(index, file_path)``,
        where ``index`` is the position of the pair in the sorted order.

    Raises:
        AssertionError: if there are fewer downloaded files than video names,
            or if a paired file name and video name differ after sanitizing.
    """
    with open(f"{dataset_dir}/youtube_urls_released.json", "r") as f:
        youtube_urls = json.load(f)
    existing_files = glob.glob(downloaded_glob)

    # URLs that do not appear in any downloaded file name; reported when the
    # alignment cannot be completed so the failure is actionable.
    missing_urls = [
        url for url in youtube_urls.values()
        if not any(url in existing_file for existing_file in existing_files)
    ]

    unique_video_names = set()
    for split in ["train", "val", "test"]:
        with open(f"{dataset_dir}/split/{split}.json", "r") as f:
            unique_video_names.update(datum["video_name"] for datum in json.load(f))

    file_names = sorted(
        ((sanitize(path.split("/")[-1].split("[")[0].strip()), path) for path in existing_files),
        key=lambda pair: pair[0],
    )
    video_names = sorted(
        ((sanitize(name), name) for name in unique_video_names),
        key=lambda pair: pair[0],
    )

    # zip() below stops at the shorter list; without this guard, video names
    # with no downloaded file would be silently left out of the mapping.
    assert len(file_names) >= len(video_names), (
        f"only {len(file_names)} downloaded files for {len(video_names)} video names; "
        f"URLs with no downloaded file: {missing_urls}"
    )

    mapping = {}
    for game_id, ((file_key, file_path), (video_key, video_name)) in enumerate(zip(file_names, video_names)):
        assert file_key == video_key, f"{file_key} != {video_key}"
        mapping[video_name] = (game_id, file_path)
    return mapping


def timestamp_to_seconds(timestamp):
    """Convert an ``"HH:MM:SS"`` or ``"MM:SS"`` timestamp to whole seconds.

    Both forms are accepted so this agrees with ``get_duration_seconds``, which
    handles the same two layouts.

    Args:
        timestamp: colon-separated string of two or three integer fields.

    Returns:
        int: total number of seconds.

    Raises:
        ValueError: if the string does not have two or three fields, or a field
            is not an integer.
    """
    parts = timestamp.split(":")
    if len(parts) not in (2, 3):
        raise ValueError(f"expected MM:SS or HH:MM:SS, got {timestamp!r}")
    total = 0
    for part in parts:
        # Each field is worth 60x the one to its right.
        total = total * 60 + int(part)
    return total


mapping = align_file_names_and_video_names(dataset_dir="/Users/ohadr/Werewolf/youtube",
                                           downloaded_glob="/Volumes/4TB/werewolf/raw_files/*")

# Cut one mp3 per annotated game out of the downloaded video it belongs to.
folder = "/Volumes/4TB/werewolf/saved_games"
for split in ["train", "val", "test"]:
    with open(f"/Users/ohadr/Werewolf/youtube/split/{split}.json", "r") as f:
        games = json.load(f)

    for datum in games:
        # mapping values are (index, path of the downloaded file)
        file_idx, video_name = mapping[datum["video_name"]]
        output_filename = f"{folder}/file_{file_idx}_{datum['Game_ID']}_clip.mp3"  # Using Game_ID from metadata
        if os.path.exists(output_filename):
            continue

        start_seconds = timestamp_to_seconds(datum["startTime"])
        end_seconds = timestamp_to_seconds(datum["endTime"])

        # Write under a temporary name and rename once complete: an interrupted
        # export must not leave a truncated file that the exists-check above
        # would treat as finished on the next run.
        partial_filename = f"{output_filename}.partial.mp3"

        video = VideoFileClip(video_name)
        try:
            if video.audio is None:
                raise ValueError(f"{video_name} has no audio track")
            audio = video.audio.subclip(start_seconds, end_seconds)
            audio.write_audiofile(partial_filename)
        finally:
            # Release the clip even when the export fails.
            video.close()

        os.replace(partial_filename, output_filename)


def get_player_start_roles(datum):
    """Return a dict from player name to the role at the same index in ``startRoles``.

    Names and roles are paired positionally; if the two lists differ in length
    the extra items are dropped, and a repeated name keeps its last role.
    """
    names = datum["playerNames"]
    roles = datum["startRoles"]
    return {name: role for name, role in zip(names, roles)}


def get_player_end_roles(datum):
    """Return a dict from player name to the role at the same index in ``endRoles``.

    Names and roles are paired positionally; if the two lists differ in length
    the extra items are dropped, and a repeated name keeps its last role.
    """
    names = datum["playerNames"]
    roles = datum["endRoles"]
    return {name: role for name, role in zip(names, roles)}


def get_duration_seconds(start_time, end_time):
    """Return ``end_time - start_time`` in seconds.

    Each argument is a colon-separated string: three fields are read as
    hours:minutes:seconds, anything else is unpacked as minutes:seconds
    (so any other field count raises ``ValueError``).
    """
    def _total_seconds(fields):
        # Three fields carry an hour component; otherwise expect exactly two.
        if len(fields) != 3:
            minutes, seconds = fields
            return int(minutes) * 60 + int(seconds)
        hours, minutes, seconds = fields
        return int(hours) * 3600 + int(minutes) * 60 + int(seconds)

    start_fields = start_time.split(":")
    end_fields = end_time.split(":")

    begin = _total_seconds(start_fields)
    finish = _total_seconds(end_fields)
    return finish - begin


def parse_timestamp(timestamp):
    """Convert an ``"MM:SS"`` (or ``"HH:MM:SS"``) timestamp to whole seconds.

    The three-field form is accepted as well so that an utterance stamped past
    the one-hour mark does not raise.

    Args:
        timestamp: colon-separated string of two or three integer fields.

    Returns:
        int: total number of seconds.

    Raises:
        ValueError: if the string does not have two or three fields, or a field
            is not an integer.
    """
    parts = timestamp.split(":")
    if len(parts) not in (2, 3):
        raise ValueError(f"expected MM:SS or HH:MM:SS, got {timestamp!r}")
    total = 0
    for part in parts:
        # Each field is worth 60x the one to its right.
        total = total * 60 + int(part)
    return total


def _finalize_utterance(utterance, end_time_in_seconds, start_roles, end_roles):
    """Return a copy of one raw dialogue record in the flattened output shape."""
    entry = copy.deepcopy(utterance)
    entry["start_time_in_seconds"] = parse_timestamp(entry.pop("timestamp"))
    entry["end_time_in_seconds"] = end_time_in_seconds
    entry.pop("Rec_Id", None)
    # The list of annotation labels becomes one comma-separated string.
    entry["target"] = ", ".join(entry.pop("annotation"))
    entry["start_role"] = start_roles.get(entry["speaker"], "unknown")
    entry["end_role"] = end_roles.get(entry["speaker"], "unknown")
    return entry


def add_endtimes(datum):
    """Build the processed dialogue list for one game.

    Every utterance is copied (the input records are not modified) and
    rewritten as follows: ``timestamp`` becomes ``start_time_in_seconds``,
    ``annotation`` is joined into ``target``, ``Rec_Id`` is dropped, and the
    speaker's ``start_role`` / ``end_role`` are attached (``"unknown"`` when
    the speaker is not in ``playerNames``).

    ``end_time_in_seconds`` is the next utterance's start plus 3 seconds,
    capped at the game duration; the last utterance ends at the game duration.
    The last utterance goes through the same normalisation as all the others,
    so every returned entry has the same keys.

    Args:
        datum: game record with ``Dialogue``, ``startTime``, ``endTime``,
            ``playerNames``, ``startRoles`` and ``endRoles``.

    Returns:
        list of processed utterance dicts; empty when ``Dialogue`` is empty.
    """
    dialogue = datum["Dialogue"]
    if not dialogue:
        return []

    final_time = get_duration_seconds(datum["startTime"], datum["endTime"])
    start_roles = get_player_start_roles(datum)
    end_roles = get_player_end_roles(datum)

    new_dialogue = []
    for current, following in zip(dialogue, dialogue[1:]):
        next_time = parse_timestamp(following["timestamp"])
        end_time = min(next_time + 3, final_time)
        new_dialogue.append(_finalize_utterance(current, end_time, start_roles, end_roles))

    # The last utterance has no successor, so it runs to the end of the game.
    new_dialogue.append(_finalize_utterance(dialogue[-1], final_time, start_roles, end_roles))
    return new_dialogue


import json

# Processed games per split: dialogue with end times plus the clip file name.
final_data = {}
for split in ("train", "val", "test"):
    split_path = f"/Users/ohadr/Werewolf/youtube/split/{split}.json"
    with open(split_path, "r") as f:
        data = json.load(f)

    for datum in data:
        datum["Dialogue"] = add_endtimes(datum)
        file_idx, video_name = mapping[datum["video_name"]]
        game_id = datum['Game_ID']
        datum["file_name"] = f"file_{file_idx}_{game_id}_clip.mp3"

    final_data[split] = data

# Persist the video-name -> (index, file path) mapping next to the clips.
with open("/Volumes/4TB/werewolf/saved_games/mapping.json", "w") as f:
    f.write(json.dumps(mapping, indent=4))

In [176]:
# Keep the first training game as the sample record for the inspection cells.
# Indexing (instead of a for/break) fails loudly on an empty split rather than
# silently leaving `datum` bound to whatever an earlier loop assigned last.
datum = final_data["train"][0]

In [None]:
import io
from pydub import AudioSegment

def utterence_to_audio_snippet(utterence, audio):
    """Return one utterance's audio as WAV bytes.

    The utterance's start/end times (seconds) are scaled to milliseconds and
    used to slice ``audio``; the slice is converted to a 24000 frame rate and
    a single channel, then exported to an in-memory buffer in ``wav`` format.

    Args:
        utterence: dict with ``start_time_in_seconds`` and ``end_time_in_seconds``.
        audio: sliceable audio object exposing ``set_frame_rate``,
            ``set_channels`` and ``export`` (``dialog_to_audio_list`` passes a
            pydub ``AudioSegment``).

    Returns:
        bytes: contents of the exported buffer.
    """
    begin_ms, finish_ms = (
        utterence[key] * 1000
        for key in ("start_time_in_seconds", "end_time_in_seconds")
    )

    # Cut the segment, then normalise its frame rate and channel count.
    clip = audio[begin_ms:finish_ms]
    clip = clip.set_frame_rate(24000)
    clip = clip.set_channels(1)

    wav_buffer = io.BytesIO()
    clip.export(wav_buffer, format="wav")
    return wav_buffer.getvalue()

def dialog_to_audio_list(datum, audio_dir=None):
    """Load a game's audio clip and cut it into one WAV snippet per utterance.

    Args:
        datum: game record with ``file_name`` (the clip's file name) and
            ``Dialogue`` (utterances carrying start/end times in seconds).
        audio_dir: optional directory that ``file_name`` is resolved against.
            ``file_name`` is stored without a directory, so pass the folder the
            clips were saved in unless that is the working directory. With the
            default ``None`` the name is used as-is.

    Returns:
        list of bytes objects, one per utterance, in dialogue order.
    """
    audio_file = datum["file_name"]
    if audio_dir is not None:
        audio_file = os.path.join(audio_dir, audio_file)
    audio = AudioSegment.from_file(audio_file)
    return [utterence_to_audio_snippet(utterence, audio) for utterence in datum["Dialogue"]]

In [None]:
def format_dialogue_entry(entry):
    """Render a processed dialogue entry as a five-line, human-readable block.

    The lines are, in order: speaker, utterance, start/end time in seconds,
    target, and the speaker's start -> end role. No trailing newline is added.
    """
    lines = [
        f"Speaker: {entry['speaker']}",
        f"Utterance: {entry['utterance']}",
        f"Time: {entry['start_time_in_seconds']}s - {entry['end_time_in_seconds']}s",
        f"Target: {entry['target']}",
        f"Role Change: {entry['start_role']} -> {entry['end_role']}",
    ]
    return "\n".join(lines)

In [162]:
# Show every field of the sample game except the (long) dialogue list.
[(field, value) for field, value in datum.items() if field != "Dialogue"]

[('YT_ID', 'part10'),
 ('video_name', 'ONE NIGHT ULTIMATE WEREWOLF 30  February 17th 2018'),
 ('Game_ID', 'Game2'),
 ('startTime', '00:10:30'),
 ('endTime', '00:17:17'),
 ('playerNames', ['Justin', 'Caitlynn', 'Paul', 'Alysha', 'Mitchell', 'Mike']),
 ('votingOutcome', [4, 5, 5, 0, 5, 1]),
 ('startRoles',
  ['Tanner', 'Insomniac', 'Werewolf', 'Seer', 'Werewolf', 'Robber']),
 ('endRoles',
  ['Robber', 'Insomniac', 'Werewolf', 'Seer', 'Werewolf', 'Tanner']),

In [66]:
# Scratch cell: write the sample game's video clip to an .mp4 in the working directory.
# NOTE(review): `video_segment` and `video` are never assigned in this cell, so it
# only runs if another cell left them in the kernel — confirm or restore the missing steps.

# (field, value) pairs for every key of `datum` except "Dialogue", in dict order.
metadata = [(key, datum[key]) for key in datum.keys() if key != "Dialogue"]

Dialogue = datum["Dialogue"]
# NOTE(review): the description of the Dialogue structure that was meant to follow is missing.


# NOTE(review): assigned but not used anywhere below in this cell.
video_name = "/Volumes/4TB/ONE NIGHT ULTIMATE WEREWOLF #30 ｜ February 17th, 2018 [4u_FJOiM18c].mkv"



# Positional lookups: in the recorded metadata output, index 3 is startTime and
# index 4 is endTime. Neither value is used below in this cell.
start_time = metadata[3][1]  # Get startTime from metadata
end_time = metadata[4][1]    # Get endTime from metadata

# NOTE(review): the timestamp-to-seconds conversion that belonged here is no longer present.



# NOTE(review): the step that created the video segment is no longer present.

# Save video segment with audio
# Index 2 is Game_ID in the recorded metadata output.
output_filename = f"game_{metadata[2][1]}_clip.mp4"  # Using Game_ID from metadata
video_segment.write_videofile(output_filename, audio=True)

# Clean up
video.close()


Moviepy - Building video game_Game2_clip.mp4.
MoviePy - Writing audio in game_Game2_clipTEMP_MPY_wvf_snd.mp3


                                                                      

MoviePy - Done.
Moviepy - Writing video game_Game2_clip.mp4



                                                                   

Moviepy - Done !
Moviepy - video ready game_Game2_clip.mp4


'game_Game2_clip.mp3'

In [82]:
# Print every raw utterance record of the sample game, one per line.
for snippet in Dialogue:
    print(snippet)

{'Rec_Id': 2, 'speaker': 'Justin', 'timestamp': '01:49', 'utterance': 'I am insomniac.', 'annotation': ['Identity Declaration']}
{'Rec_Id': 3, 'speaker': 'Caitlynn', 'timestamp': '01:49', 'utterance': 'I had myself a little nap. I almost didn\'t wake up. I was like sitting there like this. And it was like, "Insomniac.\' I\'m like, "Oh, shit. I got to do something."', 'annotation': ['Defense']}
{'Rec_Id': 4, 'speaker': 'Paul', 'timestamp': '01:56', 'utterance': "What's the opposite of the insomniac.", 'annotation': ['Interrogation']}
{'Rec_Id': 5, 'speaker': 'Caitlynn', 'timestamp': '02:01', 'utterance': 'This is what I was.', 'annotation': ['Evidence']}
{'Rec_Id': 6, 'speaker': 'Mitchell', 'timestamp': '02:04', 'utterance': 'Justin.', 'annotation': ['Accusation']}
{'Rec_Id': 7, 'speaker': 'Justin', 'timestamp': '02:06', 'utterance': 'Me? No.', 'annotation': ['Interrogation', 'Defense']}
{'Rec_Id': 8, 'speaker': 'Mike', 'timestamp': '02:09', 'utterance': 'Touché', 'annotation': ['Defens

In [79]:
# Displays the whole Dialogue list as the cell output.
# NOTE(review): consider showing a slice (e.g. the first few records) to keep the output short.
Dialogue

[{'Rec_Id': 2,
  'speaker': 'Justin',
  'timestamp': '01:49',
  'utterance': 'I am insomniac.',
  'annotation': ['Identity Declaration']},
 {'Rec_Id': 3,
  'speaker': 'Caitlynn',
  'timestamp': '01:49',
  'utterance': 'I had myself a little nap. I almost didn\'t wake up. I was like sitting there like this. And it was like, "Insomniac.\' I\'m like, "Oh, shit. I got to do something."',
  'annotation': ['Defense']},
 {'Rec_Id': 4,
  'speaker': 'Paul',
  'timestamp': '01:56',
  'utterance': "What's the opposite of the insomniac.",
  'annotation': ['Interrogation']},
 {'Rec_Id': 5,
  'speaker': 'Caitlynn',
  'timestamp': '02:01',
  'utterance': 'This is what I was.',
  'annotation': ['Evidence']},
 {'Rec_Id': 6,
  'speaker': 'Mitchell',
  'timestamp': '02:04',
  'utterance': 'Justin.',
  'annotation': ['Accusation']},
 {'Rec_Id': 7,
  'speaker': 'Justin',
  'timestamp': '02:06',
  'utterance': 'Me? No.',
  'annotation': ['Interrogation', 'Defense']},
 {'Rec_Id': 8,
  'speaker': 'Mike',
  't

In [None]:
# NOTE(review): unfinished scratch cell — len() needs an argument and raises TypeError as written.
len()

In [None]:
# NOTE(review): not valid Python as written — presumably meant as a shell command
# (`!yt-dlp ...`); confirm or delete.
yt-dlp wbMOusVMKoU

In [25]:
# for key, value in youtube_urls.items():
#     print(key, value)

In [2]:
import yt_dlp

In [76]:
# Download one video, identified by its YouTube ID, with a YoutubeDL created with no options.
# The recorded log shows the streams merged into an .mkv named after the video title and ID.
yt_dlp.YoutubeDL().download(["-pKbbVe6tBc"])

[youtube] Extracting URL: -pKbbVe6tBc
[youtube] -pKbbVe6tBc: Downloading webpage
[youtube] -pKbbVe6tBc: Downloading tv client config
[youtube] -pKbbVe6tBc: Downloading player e7567ecf
[youtube] -pKbbVe6tBc: Downloading tv player API JSON
[youtube] -pKbbVe6tBc: Downloading ios player API JSON
[youtube] -pKbbVe6tBc: Downloading m3u8 information
[info] -pKbbVe6tBc: Downloading 1 format(s): 136+251
[download] Destination: ONE NIGHT ULTIMATE WEREWOLF #18 ｜ September 30th, 2017 [-pKbbVe6tBc].f136.mp4
[download] 100% of  287.03MiB in 00:00:38 at 7.46MiB/s     
[download] Destination: ONE NIGHT ULTIMATE WEREWOLF #18 ｜ September 30th, 2017 [-pKbbVe6tBc].f251.webm
[download] 100% of   22.79MiB in 00:00:09 at 2.50MiB/s     
[Merger] Merging formats into "ONE NIGHT ULTIMATE WEREWOLF #18 ｜ September 30th, 2017 [-pKbbVe6tBc].mkv"
Deleting original file ONE NIGHT ULTIMATE WEREWOLF #18 ｜ September 30th, 2017 [-pKbbVe6tBc].f251.webm (pass -k to keep)
Deleting original file ONE NIGHT ULTIMATE WEREWOLF #

0

In [19]:
# IPython help lookup left over from exploration.
# NOTE(review): remove before sharing the notebook.
?yt_dlp.Config

[0;31mInit signature:[0m [0myt_dlp[0m[0;34m.[0m[0mConfig[0m[0;34m([0m[0mparser[0m[0;34m,[0m [0mlabel[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      <no docstring>
[0;31mFile:[0m           ~/.pyenv/versions/3.10.10/lib/python3.10/site-packages/yt_dlp/utils/__init__.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     

In [18]:
# Downloader instance created with no options; used by the bulk-download cell.
obj = yt_dlp.YoutubeDL()

In [16]:
# ?yt_dlp.YoutubeDL

In [14]:
# Download every URL listed in `youtube_urls`.
# NOTE(review): `youtube_urls` is not assigned at top level in any visible cell (it is
# only a local inside align_file_names_and_video_names) — confirm where it is loaded.
# The recorded output is the help text for obj.download, not a download log.
obj.download(list(youtube_urls.values()))

[0;31mSignature:[0m [0mobj[0m[0;34m.[0m[0mdownload[0m[0;34m([0m[0murl_list[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Download a given list of URLs.
[0;31mFile:[0m      ~/.pyenv/versions/3.10.10/lib/python3.10/site-packages/yt_dlp/YoutubeDL.py
[0;31mType:[0m      method

In [None]:
"/Volumes/4TB/old"

In [8]:
# !ls /Volumes/4TB/old

In [None]:
# NOTE(review): fragment, not a complete statement — presumably a note about a
# yt-dlp audio-extraction option; confirm or delete.
extract-audio 

[youtube] Extracting URL: _xwTgTCEqWw
[youtube] _xwTgTCEqWw: Downloading webpage
[youtube] _xwTgTCEqWw: Downloading tv client config
[youtube] _xwTgTCEqWw: Downloading player e7567ecf
[youtube] _xwTgTCEqWw: Downloading tv player API JSON
[youtube] _xwTgTCEqWw: Downloading ios player API JSON
[youtube] _xwTgTCEqWw: Downloading m3u8 information


KeyboardInterrupt: 

In [None]:
# NOTE(review): incomplete attribute access (syntax error as written); delete or finish.
yt_dlp.

In [None]:
# NOTE(review): parsed by Python as `yt - dlp` on two undefined names; presumably a
# leftover shell command name — confirm or delete.
yt-dlp