In [1]:
# Machine-specific input locations: the annotation CSV and the directory that
# holds the per-utterance .mp4 clips. Adjust these when running elsewhere.
annotations_path = '/home/tk/repos/MELD/data/MELD/train_sent_emo.csv'
VIDS_DIR = "/home/tk/datasets/MELD/MELD.Raw/train/train_splits"

import os
from glob import glob
import csv
import shutil

# Read the whole annotation table into memory as a list of rows (each row is a
# list of strings). newline='' is what the csv module asks for, so that line
# breaks embedded inside quoted fields are parsed correctly.
with open(annotations_path, newline='') as f:
    annotations = list(csv.reader(f))

# Collect every utterance clip (.mp4) found directly under VIDS_DIR.
utts = glob(os.path.join(VIDS_DIR, '*.mp4'))

# The part of each path before '_utt' is treated as the dialogue the clip
# belongs to; de-duplicate those prefixes and put them in sorted order.
dialogue_prefixes = {u.split('_utt')[0] for u in utts}
dias = sorted(dialogue_prefixes)

# Guard: every dialogue prefix must still contain the expected directory name.
for d in dias:
    assert "train_splits" in d

# Report the size of the source dataset, followed by a blank line.
print(
    f"there are {len(utts)} vids (utterances) in the original train dataset",
    f"there are {len(dias)} dialouges in the original train dataset",
    "",
    sep="\n",
)

import random

# Fixed seed: with the same (sorted) `dias` list, the same dialogues are
# drawn on every run. Note that `dias` is shuffled in place.
random.seed(0)
random.shuffle(dias)

# Number of dialogues in the small / medium / large dataset variants.
SMALL, MEDIUM, LARGE = 20, 40, 80
dias_reduced = dias[:LARGE]

# Keep every utterance whose dialogue prefix (the path before '_utt') was
# selected. A set lookup on the prefix replaces scanning all selected
# dialogues for each utterance; the resulting list and its order are the same.
selected_dias = set(dias_reduced)
utts_reduced = [utt for utt in utts if utt.split('_utt')[0] in selected_dias]

# Every selected dialogue must contribute at least one utterance, and no
# utterance from an unselected dialogue may slip in.
assert sorted({utt.split('_utt')[0] for utt in utts_reduced}) == sorted(dias_reduced)

print(f"{len(utts_reduced)} utterances in the smaller-dataset")

# Start from an empty output directory so that clips from a previous run
# cannot linger. WARNING: this deletes ./smaller-dataset if it exists.
shutil.rmtree('./smaller-dataset', ignore_errors=True)
os.makedirs('./smaller-dataset')

# File-name stems ("dia<N>") of the selected dialogues, computed once so that
# each annotation row needs only a set lookup. The previous check rebuilt
# str(utts_reduced) for every row and searched the list's repr for a substring.
selected_dia_names = {os.path.basename(dia) for dia in dias_reduced}

# annotations[0] is skipped (presumably the CSV header row). The unpacking
# below doubles as a schema check: a row without exactly 11 columns raises.
for row in annotations[1:]:
    SrNo, Utterance, Speaker, Emotion, Sentiment, Dialogue_ID,\
        Utterance_ID, Season, Episode, StartTime, EndTime = row

    if f"dia{Dialogue_ID}" not in selected_dia_names:
        continue

    vid_full_path_src = os.path.join(VIDS_DIR, f"dia{Dialogue_ID}_utt{Utterance_ID}.mp4")
    vid_full_path_dst = os.path.join('./smaller-dataset/', os.path.basename(vid_full_path_src))

    # Raises FileNotFoundError if an annotated utterance of a selected
    # dialogue has no clip on disk, rather than silently skipping it.
    shutil.copyfile(vid_full_path_src, vid_full_path_dst)

there are 9989 vids (utterances) in the original train dataset
there are 1038 dialouges in the original train dataset

789 utterances in the smaller-dataset


In [2]:
import json


def _utts_of(dialogues, utt_paths):
    """Return the base names of the paths in `utt_paths` that belong to `dialogues`.

    A path belongs to a dialogue when its prefix before '_utt' equals that
    dialogue's prefix. The order of `utt_paths` is preserved.
    """
    wanted = set(dialogues)
    return [os.path.basename(utt) for utt in utt_paths if utt.split('_utt')[0] in wanted]


# Work from what is actually on disk rather than from what was meant to be copied.
actual_utts = sorted(glob('./smaller-dataset/*.mp4'))
print(f"actual number of videos copied: {len(actual_utts)}")

# Dialogue prefixes of the copied clips, in lexicographic order.
# NOTE: this rebinds `dias`, which an earlier cell used for the full dialogue list.
dias = sorted(set([path.split('_utt')[0] for path in actual_utts]))

dias_LARGE = dias[:len(dias)]
dias_MEDIUM = dias_LARGE[:len(dias_LARGE)//2]
dias_SMALL = dias_LARGE[:len(dias_LARGE)//4]

# SMALL / MEDIUM / LARGE come from the sampling cell; the copied data must
# contain exactly LARGE dialogues for the size variants to line up.
assert len(dias_LARGE) == LARGE
assert len(dias_MEDIUM) == MEDIUM
assert len(dias_SMALL) == SMALL

for datasize, datasize_ in zip([SMALL, MEDIUM, LARGE], ['small', 'medium', 'large']):
    # Each variant is a prefix of the sorted dialogue list, so the smaller
    # variants are nested inside the larger ones.
    dias_ = dias[:datasize]

    # Cut points for an 80% / 10% / 10% train / dev / test split by dialogue.
    train, dev, test = int(len(dias_)*0.8), int(len(dias_)*0.9), int(len(dias_)*1.0)

    # All three splits are sliced from this variant's own list `dias_`; the
    # dev and test slices previously indexed the full `dias` list instead.
    dias_train = dias_[:train]
    dias_dev = dias_[train:dev]
    dias_test = dias_[dev:test]

    assert len(dias_train) + len(dias_dev) + len(dias_test) == datasize

    utts_train = _utts_of(dias_train, actual_utts)
    utts_dev = _utts_of(dias_dev, actual_utts)
    utts_test = _utts_of(dias_test, actual_utts)

    # One JSON file per variant, mapping split name -> list of clip file names.
    with open(f"dataset-{datasize_}.json", 'w') as stream:
        to_dump = {'train': utts_train, 'dev': utts_dev, 'test': utts_test}
        json.dump(to_dump, stream)

actual number of videos copied: 789


In [3]:
def _all_utterances(splits):
    """Return the set of every clip name across the train, dev and test lists."""
    return set(splits['train'] + splits['dev'] + splits['test'])


# Reload the split files from disk, keyed by their size label.
datasets = {}
for datasize_ in ('small', 'medium', 'large'):
    with open(f"dataset-{datasize_}.json", 'r') as stream:
        datasets[datasize_] = json.load(stream)

# Nesting check: everything in a smaller variant (train, dev and test alike)
# must appear in the train split of the next larger variant.
for smaller, larger in (('small', 'medium'), ('medium', 'large')):
    assert _all_utterances(datasets[smaller]).issubset(set(datasets[larger]['train']))