In [1]:
import pickle 
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn as nn
from moviepy.editor import VideoFileClip
from glob import glob
from tqdm import tqdm
from collections import defaultdict
from datasets import Dataset, DatasetDict
from datasets import Audio
from dotenv import load_dotenv
import os

def load_pickle(pickle_file):
    """Deserialize and return the object stored in ``pickle_file``.

    The file is first read with pickle's default string decoding. If that
    raises ``UnicodeDecodeError`` the file is re-opened and decoded with
    ``encoding='latin1'``. Any other failure of the first attempt is
    reported on stdout and re-raised unchanged.

    NOTE: unpickling can execute arbitrary code; only use on trusted files.

    Args:
        pickle_file: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    try:
        with open(pickle_file, 'rb') as handle:
            return pickle.load(handle)
    except UnicodeDecodeError:
        # Second attempt with a byte-preserving text decoding.
        with open(pickle_file, 'rb') as handle:
            return pickle.load(handle, encoding='latin1')
    except Exception as err:
        print('Unable to load data ', pickle_file, ':', err)
        raise

# Load variables from a local .env file into the process environment
# (python-dotenv); the upload cells read HF_TOKEN via os.getenv.
load_dotenv()

True

In [2]:
# Function to extract audio
def extract_audio(mp4_file, output_audio_file):
    """Write the audio track of a video file to ``output_audio_file``.

    Args:
        mp4_file: Path of the input video.
        output_audio_file: Destination path of the audio file; the format
            is whatever ``write_audiofile`` derives from this path.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(mp4_file)
    try:
        audio = video.audio
        if audio is None:
            # Without this check the failure is an opaque AttributeError on None.
            raise ValueError("No audio track found in {}".format(mp4_file))
        audio.write_audiofile(output_audio_file)
    finally:
        # Release the clip's readers; this function is called once per video
        # in a batch loop, so unclosed clips would otherwise accumulate.
        video.close()


def get_wav(
    video_dir="../_data/urfunny/urfunny2_videos",
    audio_dir="../_data/urfunny/urfunny2_audio",
):
    """Extract a ``.wav`` file for every ``.mp4`` file in ``video_dir``.

    Each ``<video_dir>/<name>.mp4`` is written to ``<audio_dir>/<name>.wav``.

    Args:
        video_dir: Directory that is searched (non-recursively) for ``*.mp4``.
        audio_dir: Directory that receives the ``.wav`` files; created if it
            does not exist.
    """
    os.makedirs(audio_dir, exist_ok=True)
    for mp4_path in tqdm(glob(os.path.join(video_dir, "*.mp4"))):
        # Swap only the extension. A blanket str.replace("mp4", "wav") would
        # also rewrite any "mp4" that appears inside a file or directory name.
        stem = os.path.splitext(os.path.basename(mp4_path))[0]
        extract_audio(mp4_path, os.path.join(audio_dir, stem + ".wav"))

In [3]:
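# One-off step: uncomment to extract .wav audio from the .mp4 videos
# (the cells below expect the .wav files to already exist).
# get_wav()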

In [4]:
# Locations of the pickled SDK feature files.
data_folds_file = "../_data/urfunny/sdk_features/data_folds.pkl"
language_file = "../_data/urfunny/sdk_features/language_sdk.pkl"
humor_label_file = "../_data/urfunny/sdk_features/humor_label_sdk.pkl"

# data_folds: fold name -> file ids; language_sdk / humor_label_sdk are
# indexed by file id in the dataset-building cell.
data_folds = load_pickle(data_folds_file)
language_sdk = load_pickle(language_file)
humor_label_sdk = load_pickle(humor_label_file)

In [5]:
def verbalizer(label):
    """Map a numeric humor label to its text form.

    Returns ``"humor"`` when ``label == 1`` and ``"not_humor"`` for every
    other value.
    """
    if label == 1:
        return "humor"
    return "not_humor"

In [6]:
# Column-oriented records per fold: dataset[fold][column] is a list with one
# entry per file id, in the order the ids appear in data_folds[fold].
dataset = {}

for fold, file_ids in data_folds.items():
    columns = defaultdict(list)
    dataset[fold] = columns
    for file_id in file_ids:
        columns["audio"].append("../_data/urfunny/urfunny2_audio/{}.wav".format(file_id))
        columns["label"].append(verbalizer(humor_label_sdk[file_id]))
        utterance = language_sdk[file_id]
        columns["punchline"].append(utterance['punchline_sentence'])
        # Context sentences are flattened into a single string.
        columns["context"].append('. '.join(utterance['context_sentences']))
        columns["file_id"].append(file_id)

In [7]:
# Build one Hugging Face dataset per split and cast the "audio" column (file
# paths) to the Audio feature. `Dataset` here is `datasets.Dataset`: its import
# comes after, and therefore shadows, `torch.utils.data.Dataset`.
audio_dataset_train, audio_dataset_dev, audio_dataset_test = (
    Dataset.from_dict(dataset[split_name]).cast_column("audio", Audio())
    for split_name in ("train", "dev", "test")
)

In [8]:
# Collect the three splits under their split names.
audio_dataset = DatasetDict()
audio_dataset["train"] = audio_dataset_train
audio_dataset["dev"] = audio_dataset_dev
audio_dataset["test"] = audio_dataset_test

In [9]:
# Upload all splits to a private Hub repository; the access token is read
# from the HF_TOKEN environment variable.
audio_dataset.push_to_hub(
    "SALT-NLP/URFunny_humor",
    private=True,
    token=os.getenv("HF_TOKEN"),
)

Map: 100%|██████████| 78/78 [00:00<00:00, 202.17 examples/s], ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.28s/ba]
Map: 100%|██████████| 78/78 [00:00<00:00, 163.81 examples/s]3:48,  2.36s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.41s/ba]
Map: 100%|██████████| 78/78 [00:00<00:00, 264.02 examples/s]3:44,  2.33s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.71ba/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 208.42 examples/s]2:56,  1.86s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.15ba/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 228.70 examples/s]2:53,  1.84s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.31ba/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 237.78 examples/s]2:43,  1.76s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.15s/ba]
Map: 100%|██████████| 78/78 [00:00<00:00, 210.04 examp

CommitInfo(commit_url='https://huggingface.co/datasets/SALT-NLP/URFunny_humor/commit/cc20beeae31bceb5d49d03bfe394ba6ddad48c08', commit_message='Upload dataset (part 00002-of-00003)', commit_description='', oid='cc20beeae31bceb5d49d03bfe394ba6ddad48c08', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
# Second upload of the same DatasetDict, this time to the MichaelR207
# namespace (also private, same HF_TOKEN environment variable).
audio_dataset.push_to_hub(
    "MichaelR207/URFunny_humor",
    private=True,
    token=os.getenv("HF_TOKEN"),
)

Map: 100%|██████████| 78/78 [00:00<00:00, 141.63 examples/s], ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.39ba/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 157.91 examples/s]2:43,  1.68s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.05ba/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 237.80 examples/s]2:54,  1.81s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.78ba/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 205.11 examples/s]2:36,  1.65s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.36ba/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 234.52 examples/s]2:31,  1.62s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.47ba/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 237.52 examples/s]2:29,  1.61s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.39ba/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 226.43 examp

CommitInfo(commit_url='https://huggingface.co/datasets/MichaelR207/URFunny_humor/commit/576f4ad9a615cc543b80167f85e5fb08ab0fb014', commit_message='Upload dataset (part 00002-of-00003)', commit_description='', oid='576f4ad9a615cc543b80167f85e5fb08ab0fb014', pr_url=None, pr_revision=None, pr_num=None)