In [68]:
import warnings
# Ignore all warnings from this point on. Because this runs before the
# remaining imports, warnings emitted while they load are hidden as well.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Passed as random_state to both train_test_split calls below so the split is repeatable.
seed = 42

In [59]:
# Read the DVlog transcript spreadsheet into a DataFrame, then preview
# its first rows as the cell output.
df_data = pd.read_excel(
    "../data/dvlog_transcripts.xlsx",
)
df_data.head()

Unnamed: 0,video_id,label,key,gender,duration,channelId,transcript,transcript annotated
0,0,depression,2s3EFyjUmfs,f,823.31,UCWx_Fqt2AziUyAHVjgUH1PQ,so I wanted to come on here and sit down with ...,[{'text': 'so I wanted to come on here and sit...
1,1,depression,MCaKQvEofrE,f,436.65,UCxxhKhiRPfPe_U78ao3FKLw,hello guys it's me again and i'm going to talk...,"[{'text': 'hello guys', 'start': 9.599, 'durat..."
2,2,depression,6o-DlNR_cIs,m,835.83,UCCXKRG3s-auYMFIAKjeey_Q,welcome back to another video today I'm gonna ...,"[{'text': ""welcome back to another video today..."
3,3,depression,gxZjgt3Gnug,m,420.61,UCuhfaNXfRLFfVVc6dSNoyhA,hi everybody and welcome to and in the clouds ...,[{'text': 'hi everybody and welcome to and in ...
4,4,depression,oc72xdTxJ50,f,444.55,UCbiKXPwk590XYYqqCEbtgoA,"Hey, Hey Me and my crown Me and my hairy hair ...","[{'text': 'Hey, Hey', 'start': 1.04, 'duration..."


In [60]:
# The filter for missing transcripts is currently disabled (commented out),
# so the numbers printed below cover every loaded row.
# df_data = df_data[df_data["transcript"] != "transcript not available anymore"]

# total number of rows
print(f"size: {len(df_data)}")
# number of videos per label
print(df_data.groupby("label")["video_id"].count())

# number of distinct channel ids (a missing value would count as its own entry)
print(df_data["channelId"].nunique(dropna=False))

size: 961
label
depression    555
normal        406
Name: video_id, dtype: int64
816


In [61]:
# Stratify on label and gender jointly by joining both columns into one
# "<label>_<gender>" helper column, following the approach described in
# https://stackoverflow.com/questions/45516424/sklearn-train-test-split-on-pandas-stratify-by-multiple-columns
df_data["strat"] = df_data["label"].astype(str) + "_" + df_data["gender"].astype(str)

# NOTE(review): rows are split per video and channelId is not used here, so
# videos from the same channel may land in different subsets -- confirm intended.

# The target is roughly a 7:1:2 train/val/test ratio, reached in two passes.
# Pass 1: keep 70% of the rows for training and hold out the remaining 30%.
df_train, df_holdout = train_test_split(
    df_data,
    test_size=0.3,
    random_state=seed,
    stratify=df_data[["strat"]],
)

# Pass 2: 66% of the held-out rows form the test set, the rest the validation set.
df_val, df_test = train_test_split(
    df_holdout,
    test_size=0.66,
    random_state=seed,
    stratify=df_holdout[["strat"]],
)
print(len(df_train), len(df_test), len(df_val))

672 191 98


In [62]:
# rows per (label, gender) combination in the training split
df_train.groupby(["label", "gender"])["strat"].count()

label       gender
depression  f         261
            m         127
normal      f         186
            m          98
Name: strat, dtype: int64

In [63]:
# rows per (label, gender) combination in the test split
df_test.groupby(["label", "gender"])["strat"].count()

label       gender
depression  f         74
            m         36
normal      f         53
            m         28
Name: strat, dtype: int64

In [64]:
# rows per (label, gender) combination in the validation split
df_val.groupby(["label", "gender"])["strat"].count()

label       gender
depression  f         38
            m         19
normal      f         27
            m         14
Name: strat, dtype: int64

In [65]:
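# Optional leakage check (currently disabled): channel ids that occur in both the train and test splits.
# set(df_train['channelId'].values).intersection(set(df_test['channelId'].values))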

Merge the three splits back into a single DVlog CSV file, with a `dataset` column marking the split each row belongs to, so the dataloader has easy access to the data.

In [79]:
# Build the split table: df_data without the columns listed below, with a
# numeric label and a "dataset" column naming each row's split.
# drop() returns a new frame, so df_data itself is left unchanged.
df_dvlog = df_data.drop(
    columns=["transcript", "transcript annotated", "strat", "channelId", "key", "duration"]
)

# encode the label: 1 = depression, 0 = every other value (i.e. normal)
df_dvlog["label"] = np.where(df_dvlog["label"] == "depression", 1, 0)

# tag each row with its split, matched on the index labels of the split frames
df_dvlog["dataset"] = None
for split_name, split_frame in (("train", df_train), ("test", df_test), ("val", df_val)):
    df_dvlog.loc[split_frame.index, "dataset"] = split_name

# Guard against stale or mismatched split frames (e.g. cells run out of order):
# every row must have received a split, and the splits must not overlap.
n_unassigned = int(df_dvlog["dataset"].isna().sum())
assert n_unassigned == 0, f"{n_unassigned} rows were not assigned to train/test/val"
assert len(df_train) + len(df_test) + len(df_val) == len(df_dvlog), (
    "train/test/val sizes do not add up to the full dataset"
)

df_dvlog.head()

Unnamed: 0,video_id,label,gender,dataset
0,0,1,f,train
1,1,1,f,test
2,2,1,m,train
3,3,1,m,train
4,4,1,f,train


In [80]:
# Write the split table to disk; index=False leaves the row index out of the file.
df_dvlog.to_csv(
    "../dvlog/dataset/dvlog_dataset.csv",
    index=False,
)