MIT License

Copyright (c) 2021 Taiki Miyagawa and Akinori F. Ebihara

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

# Train/Valid/Test Splitting (text files)
The new train/valid/test split lists are saved under DATADIR/HMDB51/labelstvt/.

### HMDB51 Original Naming Rules of Label Texts
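<font color=red>`glob` interprets "[" and "]" as character-class delimiters, so paths containing them are not matched literally. Use "[[]" and "[]]" instead. 
    `path.replace("[", "[[").replace("]", "[]]").replace("[[", "[[]")` does a good job (the standard library's `glob.escape(path)` additionally escapes "*" and "?").</font>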

```
#####################################
## Naming rules in label text file ##
#####################################
There are totally 153 files in this folder,
[action]_test_split[1-3].txt  corresponding to three splits reported in the paper.
The format of each file is
[video_name] [id]
The video is included in the training set if id is 1
The video is included in the testing set if id is 2
The video is not included for training/testing if id is 0
There should be 70 videos with id 1 , 30 videos with id 2 in each txt file.

PROPERTY                                   LABELS (ABBREVIATION)
visible body parts                         head(h), upper body(u), full body (f), lower body(l)
camera motion                              motion (cm), static (nm)
number of people involved in the action    Single (np1), two (np2), three (np3)
camera viewpoint                           Front (fr), back (ba), left(le), right(ri)
video quality                              good (goo), medium (med), ok (bad)


####################################
#########    Templates   ###########
####################################
label file names:
ClassName_test_split[1-3].txt

video names:
VideoName_ClassName_VisibleBodyParts_CameraMotion_NumberOfPeopleInvolvedInTheAction_CameraViewpoint_VideoQuality_Number\
.avi ID


####################################
#### Examples in class "smile" #####
####################################
my_smile_smile_h_cm_np1_fr_goo_0.avi 1
prelinger_LetsPlay1949_smile_h_nm_np1_fr_goo_27.avi 2
prelinger_LetsPlay1949_smile_h_nm_np1_le_goo_25.avi 2
prelinger_LetsPlay1949_smile_u_nm_np1_fr_med_24.avi 0
prelinger_LetsPlay1949_smile_u_nm_np1_ri_med_21.avi 2
prelinger_they_grow_up_so_fast_1_smile_u_nm_np1_fr_med_0.avi 1
show_your_smile_-)_smile_h_nm_np1_fr_med_0.avi 1
showyoursmile_smile_h_nm_np1_fr_goo_0.avi 1
smile_collection_7_smile_h_nm_np1_fr_goo_0.avi 1
smile_collection_7_smile_h_nm_np1_fr_goo_1.avi 1
youtube_smile_response_smile_h_nm_np1_fr_goo_0.avi 1
```

In [1]:
from glob import glob
import os, shutil
from copy import copy, deepcopy
import statistics
import matplotlib.pyplot as plt
import numpy as np

## Preliminaries

In [None]:
# Root directory of the dataset; the placeholder below must be replaced before running.
# The following cells read "{DATADIR}/HMDB51png" (frame directories) and
# "{DATADIR}/HMDB51/labels" (original label files), and write to "{DATADIR}/HMDB51/labelstvt".
DATADIR = "Define this first. E.g., /data/t-miyagawa"
# Selects which "<class>_test_split<splitnum>.txt" files are read and which output lists are written.
splitnum = 1 # Official splitting. 1, 2, or 3.

In [2]:
# Get videodir and numf
# `glob` in this notebook is the function (from glob import glob), so the
# escaping helper has to be imported separately.
from glob import escape as glob_escape

# Frames are looked up as "<DATADIR>/HMDB51png/<class>/<video>/*.png".
# Every literal path component is escaped so that glob metacharacters
# ("[", "*", "?") in DATADIR or in video names are matched literally.
datadir = "{}/HMDB51png".format(DATADIR)

# One sub-directory per class; plain files lying next to the class
# directories are ignored. Each entry keeps a trailing "/".
classdir = sorted(p for p in glob(glob_escape(datadir) + "/*") if os.path.isdir(p))
classdir = [i + "/" for i in classdir]
classnames = [os.path.basename(i[:-1]) for i in classdir]

# Pair every class name with its own directory. (Searching `classdir` for the
# substring "/<class>/" picks the wrong directory whenever a component of
# DATADIR happens to be equal to a class name, e.g. "/data/run/...".)
videodir = {
    name: sorted(glob(glob_escape(path) + "*"))
    for name, path in zip(classnames, classdir)}

# Number of PNG frames of every video, in the same order as videodir[class].
numf = {
    name: [len(glob(glob_escape(_video) + "/*.png")) for _video in videodir[name]]
    for name in classnames}

# Smear the keys: flatten the per-class dicts into single lists,
# walking the classes in the order given by `classnames`.
numf_concat = [n_frames for cls in classnames for n_frames in numf[cls]]
videodir_concat = [path for cls in classnames for path in videodir[cls]]

# Classwise num of frames (sum over all videos of the class)
numf_classwise = [sum(numf[cls]) for cls in classnames]

# Classwise num of videos (clips)
numv_classwise = [len(videodir[cls]) for cls in classnames]
    
# Classwise num of unique videos (groups)
#
# A clip directory such as
#     'DATADIR/HMDB51png/wave/50_FIRST_DATES_wave_u_cm_np1_fr_goo_30'
# is reduced to its last path component
#     '50_FIRST_DATES_wave_u_cm_np1_fr_goo_30'
# and then to the part in front of the last occurrence of the class name
# (dropping the "_" that joins them)
#     '50_FIRST_DATES'
# Clips sharing that prefix are counted as one unique video.
numuv_classwise = []
for k in classnames:
    source_videos = set()
    for clip_path in videodir[k]:
        clip_name = clip_path[clip_path.rfind("/") + 1:]
        source_videos.add(clip_name[:clip_name.rfind(k) - 1])
    numuv_classwise.append(len(source_videos))
    

# """
# Returns:
#     classnames: List. Len = Num of classes. Names of classes in alphabetical order.
#
#     videodir: Dict. Paths to video directories. The values (paths) are in alphabetical order of video names.
#     numf: Dict. Num of frames for each video. The values (integers) are in alphabetical order of video names.
#
#     numf_concat: List. Len = Num of total videos. Order is the same as `videodir_concat`.
#     videodir_concat: List. Len = Num of total videos. Order is the same as `numf_concat`.
#
#     numf_classwise: List. Len = Num of classes. The classwise numbers of frames in alphabetical order of class names.
#     numv_classwise: List. Len = Num of classes. The classwise numbers of videos in alphabetical order of class names.
#     numuv_classwise: List. Len = Num of classes. The classwise numbers of unique videos (groups) in alphabetical order of class names.
# """

## Save [train,valid,test]list[n].txt
Here `n` is `splitnum` (1, 2, or 3), which is set in the Preliminaries cell.

In [3]:
# Define file names
#################################################
# Directory holding the original label files "<class>_test_split<n>.txt".
orglabeldir = "{}/HMDB51/labels".format(DATADIR)
# Output lists of the new train/valid/test split for official split `splitnum`.
newtrtxt = "{}/HMDB51/labelstvt/trainlist0{}.txt".format(DATADIR, splitnum)
newvatxt = "{}/HMDB51/labelstvt/validlist0{}.txt".format(DATADIR, splitnum)
newtetxt = "{}/HMDB51/labelstvt/testlist0{}.txt".format(DATADIR, splitnum)

# Load label text files
#################################################
def _split_label_line(line):
    """Return (video_file_name, split_id) parsed from one label-file line.

    A line looks like '21_wave_u_nm_np1_fr_goo_5.avi 1 \n'. The line is split
    on its last run of whitespace instead of reading a fixed character
    position, so a line that lacks the trailing " \n" (e.g. the last line of
    a file) is still parsed. Lines that do not contain two fields give
    (None, None); the caller ignores them.
    """
    fields = line.rsplit(None, 1)
    if len(fields) != 2:
        return None, None
    return fields[0], fields[1]


trclips = []          # "/VideoName_ClassName_MetaData" of all ID-1 clips
trclips_dc = dict()   # the same, keyed by class name
trgroups = dict()     # "/VideoName_ClassName" of all ID-1 clips, keyed by class name
trgroups_concat = []
teclips = []          # ID-2 counterparts
tegroups = dict()
tegroups_concat = []
num_clips = 0         # number of non-blank label lines (IDs 0, 1 and 2)
for classname in classnames:
    labeltxt = orglabeldir + "/{}_test_split{}.txt".format(classname, splitnum) # classwise

    with open(labeltxt) as f:
        clips = [line for line in f if line.strip()] # blank lines carry no clip
        # e.g., wave_test_split1.txt:
        # ['20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0.avi 2 \n',
        #  '21_wave_u_nm_np1_fr_goo_5.avi 1 \n',
        #  '50_FIRST_DATES_wave_f_cm_np1_fr_med_0.avi 1 \n',
        #  '50_FIRST_DATES_wave_u_cm_np1_fr_goo_30.avi 1 \n',
        #  '50_FIRST_DATES_wave_u_cm_np1_fr_med_1.avi 1 \n',
        #  '50_FIRST_DATES_wave_u_cm_np1_fr_med_36.avi 1 \n', ...'... \n']
    num_clips += len(clips)

    class_marker = "_{}_".format(classname)
    tr_clipnames, tr_groupnames = [], []
    te_clipnames, te_groupnames = [], []
    for clip in clips:
        videofile, split_id = _split_label_line(clip)
        if split_id == "1":    # training set
            clipnames, groupnames = tr_clipnames, tr_groupnames
        elif split_id == "2":  # testing set
            clipnames, groupnames = te_clipnames, te_groupnames
        else:                  # ID 0 (unused clip) or a line without an ID
            continue

        clipnames.append("/" + videofile[:videofile.rfind(".avi")])
            # e.g., ("/VideoName_ClassName_MetaData")
            # '/April_09_brush_hair_u_nm_np1_ba_goo_0'
        groupnames.append("/" + videofile[:videofile.rfind(class_marker)] + "_" + classname)
            # e.g., ("/VideoName_ClassName")
            # '/April_09_brush_hair'
            # "_" + classname is necessary, because there are some videos with the same VideoName in different classes.
            # (There is no duplication of VideoName within each single class, though).

    trclips_dc[classname] = tr_clipnames
    trclips.extend(tr_clipnames)
        # trclips =
        # ['/April_09_brush_hair_u_nm_np1_ba_goo_0',
        #  '/April_09_brush_hair_u_nm_np1_ba_goo_1',
        #  '/April_09_brush_hair_u_nm_np1_ba_goo_2',
        #  '/Aussie_Brunette_Brushing_Hair_II_brush_hair_u_nm_np1_ri_med_3',
        #  '/Aussie_Brunette_Brushing_Hair_II_brush_hair_u_nm_np2_le_goo_0', ...]
    trgroups[classname] = tr_groupnames # There can be duplicate strings in THE list in a key.
        # trgroups["brush_hair"] =
        # ['/April_09_brush_hair',
        #  '/April_09_brush_hair',
        #  '/April_09_brush_hair',
        #  '/Aussie_Brunette_Brushing_Hair_II_brush_hair',
        #  '/Aussie_Brunette_Brushing_Hair_II_brush_hair', ...]
    trgroups_concat.extend(tr_groupnames)

    teclips.extend(te_clipnames)
    tegroups[classname] = te_groupnames
    tegroups_concat.extend(te_groupnames)

# Assert
assert len(trclips) + len(teclips) < num_clips # because ID 0 is removed from train/val/test split.

# Assert: no contamination between tr and te, both for clip names and for
# group (source video) names. Membership is tested against sets so the checks
# are linear instead of quadratic in the number of clips.
_teclips_set = set(teclips)
_trclips_set = set(trclips)
_tegroups_set = set(tegroups_concat)
_trgroups_set = set(trgroups_concat)
for i, v in enumerate(trclips):
    assert v not in _teclips_set, "{}, {}".format(i, v)
for i, v in enumerate(teclips):
    assert v not in _trclips_set, "{}, {}".format(i, v)
for i, v in enumerate(trgroups_concat):
    assert v not in _tegroups_set, "{}, {}".format(i, v)
for i, v in enumerate(tegroups_concat):
    assert v not in _trgroups_set, "{}, {}".format(i, v)
    
# Create index set for unique videos ...
#################################################
# For every class, number the runs of consecutive identical group names:
# the first clip gets 0 and the counter goes up by one each time the group
# name differs from the previous clip's.
trgroupsu_idxset_of_trgroups = dict()
for cls_name, group_names in trgroups.items():
    run_ids = []
    run = -1
    previous = None
    for pos, name in enumerate(group_names):
        if pos == 0 or name != previous:
            run += 1
            previous = name
        run_ids.append(run)

    assert len(run_ids) == len(group_names)
    trgroupsu_idxset_of_trgroups[cls_name] = run_ids

    # e.g.,
    # trgroupsu_idxset_of_trgroups =
    # {'brush_hair': [0, 0, 0, 1, 1, 1, 1, 2, 2, ..., 22, 22, 23, 24, 25, 25], ...
    
# ... and extract unique-group list
#################################################
# np.unique returns the distinct names in sorted order. The assertion compares
# their count with the number of runs found above, i.e. it fails if the clips
# of one group are not listed contiguously in the label file.
trgroupsu = dict()
for cls_name, group_names in trgroups.items():
    distinct = list(np.unique(group_names)) # no duplication
    assert len(distinct) == trgroupsu_idxset_of_trgroups[cls_name][-1] + 1
    trgroupsu[cls_name] = distinct

tegroupsu = {
    cls_name: list(np.unique(group_names)) # no duplication
    for cls_name, group_names in tegroups.items()}
    # Note:
    # trgroups =
    # {'brush_hair':
    #  ['/April_09_brush_hair',
    #   '/April_09_brush_hair',
    #   '/April_09_brush_hair',
    #   '/Aussie_Brunette_Brushing_Hair_II_brush_hair',
    #   '/Aussie_Brunette_Brushing_Hair_II_brush_hair', ...
    #                      is now
    # trgroupsu =
    # {'brush_hair':
    #  ['/April_09_brush_hair',
    #   '/Aussie_Brunette_Brushing_Hair_II_brush_hair', ...

# New train/valid/test split
#################################################
# The official test groups are kept as they are. In every class, the last 10%
# (rounded down) of the official training groups become the validation set and
# the remaining groups stay in the training set.
newtegroupsu = deepcopy(tegroupsu)
newtrgroupsu = []
newvagroupsu = []
newtrgroupsu_idxset_of_trgroups = dict()
newvagroupsu_idxset_of_trgroups = dict()

_assigned_groups = set() # every group already placed in tr or va
for cls in classnames:
    _tmp = trgroupsu[cls]
    numva = int(len(_tmp) * 0.1) # num of validation examples in class `cls`
    assert numva > 0, "class {} has too few training groups ({})".format(cls, len(_tmp))
    for v in _tmp:
        assert v not in _assigned_groups, v # check there's no duplication

    # Clips whose group index is among the last `numva` indices go to validation.
    _idxset = trgroupsu_idxset_of_trgroups[cls]
    _last_tr_idx = _idxset[-1] - numva
    newtrgroupsu_idxset_of_trgroups[cls] = [i for i in _idxset if i <= _last_tr_idx]
    newvagroupsu_idxset_of_trgroups[cls] = [i for i in _idxset if i > _last_tr_idx]
        # e.g.,
        # newtrgroupsu_idxset_of_trgroups =
        # {'brush_hair': [0, 0, 0, 1, 1, 1, 1, 2, 2, ..., 22, 22, 23], ...

    # No group index may be on both sides. The comparison must be made between
    # the two index lists of this class (testing an index against the dicts
    # themselves only compares it with class names and can never fail).
    assert set(newtrgroupsu_idxset_of_trgroups[cls]).isdisjoint(
        newvagroupsu_idxset_of_trgroups[cls]) # no duplication

    _tmptr = _tmp[:- numva]
    _tmpva = _tmp[- numva:]
    assert len(_tmptr) == newtrgroupsu_idxset_of_trgroups[cls][-1] + 1
    newtrgroupsu.extend(_tmptr)
    newvagroupsu.extend(_tmpva)
    _assigned_groups.update(_tmp)
    # e.g.,
    # newtrgroupsu =
    # ['/April_09_brush_hair',
    #  '/Aussie_Brunette_Brushing_Hair_II_brush_hair',
    #  '/Blonde_being_brushed_brush_hair',
    #  '/Brunette_Foxyanya_ultra_silky_long_hair_brushing_hairjob_brush_hair',
    #  '/Brushing_Hair_with_Beth_brush_hair',

# Assert: tr, va and te share no group.
# `newtegroupsu` is a dict keyed by class name, so its group names are
# flattened first; otherwise the te checks would look at class names only.
_newtegroupsu_flat = [g for _groups in newtegroupsu.values() for g in _groups]
_newtr_set = set(newtrgroupsu)
_newva_set = set(newvagroupsu)
_newte_set = set(_newtegroupsu_flat)
for i, v in enumerate(newvagroupsu):
    assert v not in _newtr_set, "{}, {}".format(i, v)
for i, v in enumerate(newtrgroupsu):
    assert v not in _newte_set, "{}, {}".format(i, v)
for i, v in enumerate(_newtegroupsu_flat):
    assert v not in _newva_set, "{}, {}".format(i, v)
    
# Fetch clip numbers
#################################################
# The test clips are taken over unchanged. For every class the official
# training clips (which hold both the new tr and the new va clips) are cut in
# two: the first len(train index set) clips stay in tr, the rest become va.
newteclips = copy(teclips)
newtrclips = []
newvaclips = []

for classname in classnames:
    n_train = len(newtrgroupsu_idxset_of_trgroups[classname])
    clips_of_class = trclips_dc[classname]
    newtrclips += clips_of_class[:n_train]
    newvaclips += clips_of_class[n_train:]


assert 0 < len(newtrclips) < len(trclips)
assert 0 < len(newvaclips) < len(trclips)
assert len(newvaclips) + len(newtrclips) == len(trclips), "Contamination (tr & va) detected!"
_official_trclips = set(trclips)
assert all(v in _official_trclips for v in newvaclips)
assert all(v in _official_trclips for v in newtrclips)
assert len(trclips) == len(_official_trclips) # no clip name occurs twice

# e.g.,
# newtrclips =
# ['/April_09_brush_hair_u_nm_np1_ba_goo_0',
#  '/April_09_brush_hair_u_nm_np1_ba_goo_1',
#  '/April_09_brush_hair_u_nm_np1_ba_goo_2',
#  '/Aussie_Brunette_Brushing_Hair_II_brush_hair_u_nm_np1_ri_med_3',
#  '/Aussie_Brunette_Brushing_Hair_II_brush_hair_u_nm_np2_le_goo_0',
#  '/Aussie_Brunette_Brushing_Hair_II_brush_hair_u_nm_np2_le_goo_1',
#  '/Aussie_Brunette_Brushing_Hair_II_brush_hair_u_nm_np2_le_med_2',
#  '/Blonde_being_brushed_brush_hair_f_nm_np2_ri_med_0',
#  '/Blonde_being_brushed_brush_hair_u_cm_np2_ri_med_1',

In [4]:
# Save train/valid/testlist0<splitnum>.txt
# E.g., "DATADIR/HMDB51/labelstvt/validlist01.txt"
#################################################
# NOTE: this cell is live and overwrites existing list files.
# Each clip is written on its own line, followed by " \n".
for _listpath, _cliplist in ((newtrtxt, newtrclips),
                             (newvatxt, newvaclips),
                             (newtetxt, newteclips)):
    # open(..., "w") does not create missing directories, so make the
    # output directory first.
    _outdir = os.path.dirname(_listpath)
    if _outdir:
        os.makedirs(_outdir, exist_ok=True)
    with open(_listpath, mode="w") as f:
        f.writelines(v + " \n" for v in _cliplist)
        
# Assert: read the three saved lists back and check that no line
# appears in more than one of them.
with open(newtrtxt, mode="r") as f:
    a = f.readlines()
with open(newvatxt, mode="r") as f:
    b = f.readlines()
with open(newtetxt, mode="r") as f:
    c = f.readlines()

_tr_lines, _va_lines, _te_lines = set(a), set(b), set(c)
assert _tr_lines.isdisjoint(_va_lines)
assert _tr_lines.isdisjoint(_te_lines)
assert _va_lines.isdisjoint(_te_lines)