# DVlog dataset
## DVlog dataset V1 (original dataset)

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

seed = 42

In [59]:
# Read the DVlog table (labels, gender, channel ids and transcripts) from the Excel workbook
transcripts_path = "../data/dvlog_transcripts.xlsx"
df_data = pd.read_excel(transcripts_path)
df_data.head()

Unnamed: 0,video_id,label,key,gender,duration,channelId,transcript,transcript annotated
0,0,depression,2s3EFyjUmfs,f,823.31,UCWx_Fqt2AziUyAHVjgUH1PQ,so I wanted to come on here and sit down with ...,[{'text': 'so I wanted to come on here and sit...
1,1,depression,MCaKQvEofrE,f,436.65,UCxxhKhiRPfPe_U78ao3FKLw,hello guys it's me again and i'm going to talk...,"[{'text': 'hello guys', 'start': 9.599, 'durat..."
2,2,depression,6o-DlNR_cIs,m,835.83,UCCXKRG3s-auYMFIAKjeey_Q,welcome back to another video today I'm gonna ...,"[{'text': ""welcome back to another video today..."
3,3,depression,gxZjgt3Gnug,m,420.61,UCuhfaNXfRLFfVVc6dSNoyhA,hi everybody and welcome to and in the clouds ...,[{'text': 'hi everybody and welcome to and in ...
4,4,depression,oc72xdTxJ50,f,444.55,UCbiKXPwk590XYYqqCEbtgoA,"Hey, Hey Me and my crown Me and my hairy hair ...","[{'text': 'Hey, Hey', 'start': 1.04, 'duration..."


In [60]:
# Report the number of videos, the class balance and the number of distinct channels
n_videos = len(df_data)
print(f"size: {n_videos}")

videos_per_label = df_data.groupby("label")["video_id"].count()
print(videos_per_label)

# dropna=False keeps a missing channel id counted as its own value, like unique() does
print(df_data["channelId"].nunique(dropna=False))

size: 961
label
depression    555
normal        406
Name: video_id, dtype: int64
816


In [61]:
# Build one stratification key per row by concatenating label and gender,
# so that both splits below preserve the label x gender proportions.
# https://stackoverflow.com/questions/45516424/sklearn-train-test-split-on-pandas-stratify-by-multiple-columns
df_data["strat"] = df_data["label"].astype(str) + "_" + df_data["gender"].astype(str)

# train, val, test sets with (approximately) a 7:1:2 ratio
# first hold out 30% of the rows; the remaining 70% becomes the train set
df_train, df_test = train_test_split(df_data, test_size=0.3, random_state=seed, stratify=df_data[['strat']])

# split the 30% hold-out into a validation set and a test set
# (test_size=0.66 sends roughly two thirds of the hold-out to the test set)
# NOTE: this split is stratified on label/gender only; channelId is not taken into account here
df_val, df_test = train_test_split(df_test, test_size=0.66, random_state=seed, stratify=df_test[['strat']])
# sizes are printed in the order train, test, val
print(len(df_train), len(df_test), len(df_val))

672 191 98


In [62]:
# Number of training videos per label/gender combination
df_train.groupby(["label", "gender"])["strat"].count()

label       gender
depression  f         261
            m         127
normal      f         186
            m          98
Name: strat, dtype: int64

In [63]:
# Number of test videos per label/gender combination
df_test.groupby(["label", "gender"])["strat"].count()

label       gender
depression  f         74
            m         36
normal      f         53
            m         28
Name: strat, dtype: int64

In [64]:
# Number of validation videos per label/gender combination
df_val.groupby(["label", "gender"])["strat"].count()

label       gender
depression  f         38
            m         19
normal      f         27
            m         14
Name: strat, dtype: int64

In [65]:
# set(df_train['channelId'].values).intersection(set(df_test['channelId'].values))

Build it back into one dvlog csv file so the dataloader has easy access to the data.

In [79]:
# Work on a deep copy so df_data stays untouched, and drop the columns that are not exported
unused_columns = ["transcript", "transcript annotated", "strat", "channelId", "key", "duration"]
df_dvlog = df_data.copy(deep=True).drop(columns=unused_columns)

# Encode the label as an integer: 1 = depression, 0 = normal
is_depression = df_dvlog["label"] == "depression"
df_dvlog["label"] = np.where(is_depression, 1, 0)

# Tag every row with the fold it was assigned to (rows are matched on their index labels)
df_dvlog["dataset"] = None
for fold_name, fold_df in (("train", df_train), ("test", df_test), ("val", df_val)):
    df_dvlog.loc[fold_df.index, "dataset"] = fold_name

df_dvlog.head()

Unnamed: 0,video_id,label,gender,dataset
0,0,1,f,train
1,1,1,f,test
2,2,1,m,train
3,3,1,m,train
4,4,1,f,train


In [80]:
# Write the V1 label/fold table to disk, leaving out the DataFrame index
labels_v1_path = "../dvlog/dataset/dvlog_labels_v1.csv"
df_dvlog.to_csv(labels_v1_path, index=False)

## DVlog dataset V2 (with available videos)


In [23]:
# Read the DVlog table from the Excel workbook again and report its number of rows
transcripts_path = "../data/dvlog_transcripts.xlsx"
df_data = pd.read_excel(transcripts_path)
n_rows = len(df_data)
print(n_rows)
df_data.head()

961


Unnamed: 0,video_id,label,key,gender,duration,channelId,transcript,transcript annotated
0,0,depression,2s3EFyjUmfs,f,823.31,UCWx_Fqt2AziUyAHVjgUH1PQ,so I wanted to come on here and sit down with ...,[{'text': 'so I wanted to come on here and sit...
1,1,depression,MCaKQvEofrE,f,436.65,UCxxhKhiRPfPe_U78ao3FKLw,hello guys it's me again and i'm going to talk...,"[{'text': 'hello guys', 'start': 9.599, 'durat..."
2,2,depression,6o-DlNR_cIs,m,835.83,UCCXKRG3s-auYMFIAKjeey_Q,welcome back to another video today I'm gonna ...,"[{'text': ""welcome back to another video today..."
3,3,depression,gxZjgt3Gnug,m,420.61,UCuhfaNXfRLFfVVc6dSNoyhA,hi everybody and welcome to and in the clouds ...,[{'text': 'hi everybody and welcome to and in ...
4,4,depression,oc72xdTxJ50,f,444.55,UCbiKXPwk590XYYqqCEbtgoA,"Hey, Hey Me and my crown Me and my hairy hair ...","[{'text': 'Hey, Hey', 'start': 1.04, 'duration..."


In [24]:
# Directory holding the downloaded videos. The default is the original local location;
# set the DVLOG_VIDEO_DIR environment variable (e.g. to "../data/dvlog_videos") to point
# the notebook at another directory without editing this cell.
video_dir = os.environ.get("DVLOG_VIDEO_DIR", r"E:\Master\dvlog_videos")

# get the id's of the videos that we managed to download
# File names are expected to start with "<video_id>_". Entries without a purely numeric
# prefix (e.g. OS metadata files) are skipped instead of crashing int().
videos_list = sorted(
    int(prefix)
    for prefix in (entry.split("_")[0] for entry in os.listdir(video_dir))
    if prefix.isdecimal()
)

# number of rows of the table for which a video was downloaded
# (set membership instead of scanning the list for every id, and no hard-coded dataset size)
int(df_data["video_id"].isin(set(videos_list)).sum())

827

In [25]:
# Collect the ids of the videos whose transcript is not marked as unavailable
has_transcript = df_data["transcript"] != "transcript not available anymore"
transcripts_list = df_data.loc[has_transcript, "video_id"]
len(transcripts_list)

796

In [26]:
# Compare the ids of the downloaded videos with the ids that still have a transcript
video_ids = set(videos_list)
transcript_ids = set(transcripts_list)

# ids present in both sets
print(f"Intersection: {len(transcript_ids & video_ids)}")

# ids with a downloaded video but without a transcript
print(f"Difference videos and transcripts: {len(video_ids - transcript_ids)}")

# ids with a transcript but without a downloaded video
print(f"Difference transcripts and videos: {len(transcript_ids - video_ids)}")

Intersection: 783
Difference videos and transcripts: 44
Difference transcripts and videos: 13


### build the alternate dataset

In [27]:
# keep only the rows for which we have the actual video
# Take an explicit copy: the "strat" column is added right below, and assigning a column on a
# filtered slice triggers pandas' SettingWithCopyWarning (silenced by the global warnings filter).
df_data = df_data[df_data["video_id"].isin(videos_list)].copy()

# create a new column that is a concatenation of the values in your other columns and stratify on the new column
# https://stackoverflow.com/questions/45516424/sklearn-train-test-split-on-pandas-stratify-by-multiple-columns
df_data["strat"] = df_data["label"].astype(str) + "_" + df_data["gender"].astype(str)

# train, val, test sets with (approximately) a 7:1:2 ratio
# first hold out 30% of the rows; the remaining 70% becomes the train set
df_train, df_test = train_test_split(df_data, test_size=0.3, random_state=seed, stratify=df_data[['strat']])

# split the 30% hold-out into a validation set and a test set
# (test_size=0.66 sends roughly two thirds of the hold-out to the test set)
df_val, df_test = train_test_split(df_test, test_size=0.66, random_state=seed, stratify=df_test[['strat']])
# sizes are printed in the order train, test, val
print(len(df_train), len(df_test), len(df_val))

578 165 84


In [28]:
# Count the channels that each pair of folds has in common
train_channels = set(df_train["channelId"].values)
val_channels = set(df_val["channelId"].values)
test_channels = set(df_test["channelId"].values)

print(f"Overlap between Train and Val {len(train_channels & val_channels)}")
print(f"Overlap between Train and Test {len(train_channels & test_channels)}")
print(f"Overlap between Val and Test {len(val_channels & test_channels)}")

Overlap between Train and Val 10
Overlap between Train and Test 23
Overlap between Val and Test 8


In [29]:
def remove_duplicates(df1, df2, choices, random_state=None):
    """Remove channel overlap between two folds by swapping rows between them.

    Every row of ``df2`` whose ``channelId`` also occurs in ``df1`` is moved into
    ``df1``. In exchange, one row with the same ``strat`` value is sampled from
    ``choices`` and moved from ``df1`` to ``df2``, so both fold sizes and the
    stratification stay the same.

    Rows are addressed by their index labels, so the index labels must be unique
    across ``df1`` and ``df2``; they do not have to be equal to ``video_id``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Fold that receives the overlapping rows. Needs the columns ``channelId``
        and ``strat``.
    df2 : pandas.DataFrame
        Fold that gives up its overlapping rows. Same columns as ``df1``.
    choices : pandas.DataFrame
        Rows of ``df1`` that may be handed over to ``df2``. The caller is expected
        to pass rows whose channel occurs only once in the whole dataset, so that
        moving them cannot introduce a new overlap.
    random_state : int, optional
        Seed used when sampling the replacement row. When omitted, the
        notebook-level ``seed`` is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1_updated, df2_updated, remaining_choices)``. The inputs are not modified.

    Raises
    ------
    ValueError
        If no row with the required ``strat`` value is left in ``choices``.
    """
    if random_state is None:
        # fall back to the notebook-level seed, as before
        random_state = seed

    orig_df = df1.copy()
    filter_df = df2.copy()
    unique_choices = choices.copy()

    # get the overlapping channels
    overlapping = set(orig_df["channelId"].values).intersection(filter_df["channelId"].values)

    # Iterate in a fixed order: the iteration order of a set of strings changes between
    # Python sessions (hash randomisation), which would make the swaps - and therefore the
    # resulting folds - differ from run to run despite the fixed random_state.
    for overlap in sorted(overlapping, key=str):
        # snapshot of the rows of this channel; filter_df itself is modified inside the loop
        dupl_rows = filter_df[filter_df["channelId"] == overlap]

        # a channel can contribute several rows, each of them needs its own replacement
        for position in range(len(dupl_rows)):
            curr_row = dupl_rows.iloc[position]
            row_index = curr_row.name  # index label of the row that has to move
            strat_value = curr_row["strat"]

            # randomly select a row with the same stratification value to switch with
            candidates = unique_choices[unique_choices["strat"] == strat_value]
            if candidates.empty:
                raise ValueError(
                    f"no swap candidate left for strat value {strat_value!r} "
                    f"(channel {overlap!r}, row {row_index!r})"
                )
            selected = candidates.sample(n=1, random_state=random_state)
            selected_index = selected.index

            # move the selected row from df1 to df2 ...
            filter_df = pd.concat([filter_df, selected], ignore_index=False)
            orig_df = orig_df.drop(index=selected_index)

            # ... and the overlapping row from df2 to df1
            orig_df.loc[row_index] = curr_row
            filter_df = filter_df.drop(index=row_index)

            # the selected row has been used and is no longer available for further swaps
            unique_choices = unique_choices.drop(index=selected_index)

    return orig_df, filter_df, unique_choices


# select the rows whose channel occurs exactly once in the whole (filtered) dataset
uniq_channels = df_data.groupby("channelId").filter(lambda x: len(x) == 1)

# of those, keep the rows that currently sit in the training set; they are the swap candidates
training_uniques = df_train[df_train["channelId"].isin(uniq_channels["channelId"])]

# remove the duplicates between the train and test set
df_train, df_test, new_uniques = remove_duplicates(df_train, df_test, training_uniques)
# remove the duplicates between the train and validation set,
# using only the candidates that were not swapped in the previous step
df_train, df_val, _ = remove_duplicates(df_train, df_val, new_uniques)

# single-video channels that are in the test set after the swaps above
test_uniques = df_test[df_test["channelId"].isin(uniq_channels["channelId"])]
# remove the duplicates between the test and validation set
df_test, df_val, _ = remove_duplicates(df_test, df_val, test_uniques)

# sizes are printed in the order train, test, val
print(len(df_train), len(df_test), len(df_val))

578 165 84


In [30]:
# Re-compute the number of channels shared by each pair of folds
channels_train = set(df_train["channelId"].values)
channels_val = set(df_val["channelId"].values)
channels_test = set(df_test["channelId"].values)

print(f"Overlap between Train and Val {len(channels_train & channels_val)}")
print(f"Overlap between Train and Test {len(channels_train & channels_test)}")
print(f"Overlap between Val and Test {len(channels_val & channels_test)}")

Overlap between Train and Val 0
Overlap between Train and Test 0
Overlap between Val and Test 0


In [31]:
# Start from a deep copy of the filtered table and drop the columns that are not exported
columns_to_drop = ["transcript", "transcript annotated", "strat", "channelId", "key", "duration"]
df_dvlog = df_data.copy(deep=True).drop(columns=columns_to_drop)

# Encode the label as an integer: 1 = depression, 0 = normal
depression_mask = df_dvlog["label"] == "depression"
df_dvlog["label"] = np.where(depression_mask, 1, 0)

# Record the fold of every row (rows are matched on their index labels)
df_dvlog["dataset"] = None
fold_frames = (("train", df_train), ("test", df_test), ("val", df_val))
for fold_label, fold_frame in fold_frames:
    df_dvlog.loc[fold_frame.index, "dataset"] = fold_label

df_dvlog.head()

Unnamed: 0,video_id,label,gender,dataset
0,0,1,f,train
1,1,1,f,test
2,2,1,m,train
3,3,1,m,train
4,4,1,f,test


In [33]:
# Write the V2 label/fold table to disk, leaving out the DataFrame index
labels_v2_path = "../dvlog/dataset/dvlog_labels_v2.csv"
df_dvlog.to_csv(labels_v2_path, index=False)