# dataset

In [1]:
import os
import numpy as np
import pandas as pd

from skimage import transform
# import librosa

import torch
from torch.utils.data import Dataset, WeightedRandomSampler

# # local functions
# from dataset.utils import *

In [None]:
class DepressionDataset(Dataset):
    '''Dataset over one split ("train", "validation" or "test") of the corpus.

    The split's label CSV is read from root_dir and cleaned with pre_check.
    Every item is one participant session holding the facial landmarks, the
    gaze direction and the ground-truth labels. A WeightedRandomSampler that
    balances the binary label is exposed as self.sampler.
    '''

    def __init__(self,
                 root_dir,
                 mode,
                 transform=None):
        '''
        Arguments:
            root_dir: string, folder holding the split CSV files and one
                      '<ID>_P' sub-folder per session
            mode: string, one of "train", "validation", "test"
            transform: optional callable applied to each session dict
        '''
        super(DepressionDataset, self).__init__()

        # only train, validation and test splits are allowed
        assert mode in ["train", "validation", "test"], \
            "Argument --mode could only be ['train', 'validation', 'test']"

        self.mode = mode
        self.root_dir = root_dir
        self.transform = transform
        self.train_data_path = os.path.join(self.root_dir, 'train_split_Depression_AVEC2017.csv')
        self.valid_data_path = os.path.join(self.root_dir, 'dev_split_Depression_AVEC2017.csv')
        self.test_data_path = os.path.join(self.root_dir, 'full_test_split.csv')
        # TODO: sentence embeddings (SentenceTransformer 'all-mpnet-base-v2',
        # output dimension 768) are not wired in yet

        # load and clean the label table of the requested split
        # (train: 107 sessions, validation: 35 sessions, test: 47 sessions)
        split_paths = {"train": self.train_data_path,
                       "validation": self.valid_data_path,
                       "test": self.test_data_path}
        self.data_df = self.pre_check(pd.read_csv(split_paths[self.mode]))

        is_test = self.mode == "test"
        if self.mode == "train":
            ####################################################################################################
            # Debugging subset for a laptop: only these five sessions are served.
            # For the full split use: self.data_df['Participant_ID'].to_numpy()
            self.patientIDs = np.array([303, 321, 362, 363, 426])
            # Hand-entered binary labels of the five debug sessions.
            # NOTE(review): confirm they match PHQ8_Binary in the CSV.
            self.phq_binay_gt = np.array([0, 1, 1, 0, 1])
            ####################################################################################################
            # Take the remaining labels from the rows of exactly these IDs so
            # that index i of every ground-truth array describes patientIDs[i].
            labels = self._rows_for(self.patientIDs)
        else:
            labels = self.data_df
            self.patientIDs = labels['Participant_ID'].to_numpy()
            # the test table names its columns without the "8"
            self.phq_binay_gt = labels['PHQ_Binary' if is_test else 'PHQ8_Binary'].to_numpy()

        self.phq_score_gt = labels['PHQ_Score' if is_test else 'PHQ8_Score'].to_numpy()
        self.gender_gt = labels['Gender'].to_numpy()
        if is_test:
            # subscores in test data are not provided, thus we initialize
            # them with 0 to avoid errors in the DataLoader
            self.phq_subscores_gt = np.zeros((self.patientIDs.shape[0], 8))
        else:
            self.phq_subscores_gt = labels.iloc[:, 4:].to_numpy()

        # get sampler: every sample is weighted by 1 / (size of its class).
        # return_inverse maps each label to its class slot, so this also works
        # when a class is absent or labels are not exactly 0..k-1.
        _, class_slot, class_sample_count = np.unique(self.phq_binay_gt,
                                                      return_inverse=True,
                                                      return_counts=True)
        samples_weight = (1. / class_sample_count)[class_slot]
        samples_weight = torch.from_numpy(samples_weight).double()
        self.sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    def _rows_for(self, patient_ids):
        '''
        Select the rows of self.data_df that belong to the given participants
        Argument:
            patient_ids: 1D array of participant IDs
        Return:
            pandas.DataFrame with one row per ID, in the order of patient_ids
        Raise:
            ValueError if an ID does not occur in the label table
        '''
        patient_ids = np.asarray(patient_ids)
        all_ids = self.data_df['Participant_ID'].to_numpy()
        known = np.isin(patient_ids, all_ids)
        if not known.all():
            raise ValueError("Participant IDs {} are missing from the '{}' split"
                             .format(patient_ids[~known], self.mode))
        positions = [int(np.flatnonzero(all_ids == pid)[0]) for pid in patient_ids]
        return self.data_df.iloc[positions]

    def pre_check(self, data):
        '''
        Basic cleaning process to make sure there is no missing value
        and that the sum of the PHQ subscores equals the PHQ score
        Argument:
            data: pandas.DataFrame with the columns ordered as
                  ID, PHQ binary, PHQ score, gender, subscores...
        Return:
            data: cleaned copy of the table with type "int"
        Raise:
            ValueError if (outside test mode) a row's subscores do not
            sum up to its PHQ score
        '''
        # make sure no NaN, Inf, -Inf
        invalid = data.isna() | data.isin([np.inf, -np.inf])
        if invalid.to_numpy().any():
            print('Replacing NaN, Inf, or -Inf ...')
            data = data.replace([np.inf, -np.inf, np.nan], 0).astype('int')
        else:
            data = data.astype('int')

        # compare the sum of the PHQ subscores to the PHQ score
        # (the test table carries no subscores, hence the mode check)
        unequal = data.iloc[:, 4:].sum(axis=1) != data.iloc[:, 2]
        if unequal.any() and self.mode != 'test':
            lines = np.where(unequal)
            raise ValueError(("The sum of each PHQ subscore at line {} "
                              "is unequal to the PHQ score").format(lines[0]))

        # check whether the PHQ binary is correctly converted based on the
        # PHQ score (score > 9 -> 1) and overwrite the rows where it is not
        phq_binary = data.iloc[:, 1].to_numpy()
        phq_score = data.iloc[:, 2].to_numpy()
        phq_converted_binary = np.where(phq_score > 9, 1, 0)
        wrong_rows = np.flatnonzero(phq_converted_binary != phq_binary)
        if wrong_rows.size:
            data.iloc[wrong_rows, 1] = phq_converted_binary[wrong_rows]

        return data

    def __len__(self):
        '''Number of sessions served by this dataset'''
        return len(self.patientIDs)

    def __iter__(self):
        '''Iterate over the participant IDs (not over the loaded sessions)'''
        return iter(self.patientIDs)

    def __getitem__(self, idx):
        '''
        Essential function for creating a dataset in PyTorch, which will
        automatically be called by the DataLoader and loads the extracted
        features of one patient based on the index in self.patientIDs
        Argument:
            idx: int, index of the patient ID in self.patientIDs
        Return:
            session: dict, contains all the extracted features and ground truth of a patient/session
        '''
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # get the patient session path
        session_num = self.patientIDs[idx]
        session_path = os.path.join(self.root_dir, '{}_P'.format(session_num))

        # TODO: if other features are needed, add them below. Audio
        # ('{}_AUDIO.wav') and transcript ('{}_TRANSCRIPT.csv') features
        # are not loaded yet.

        # get key points and gaze direction path
        facial_landmarks_path = os.path.join(session_path, '{}_CLNF_features3D.txt'.format(session_num))
        gaze_direction_path = os.path.join(session_path, '{}_CLNF_gaze.txt'.format(session_num))

        # facial feature
        facial_landmarks = self.load_facial_landmarks(facial_landmarks_path)
        # gaze direction feature (the first four columns are dropped)
        gaze_direction = pd.read_csv(gaze_direction_path).iloc[:, 4:].to_numpy()

        # summary
        session = {'patientID': session_num,
                   'session_path': session_path,
                   'facial_landmarks': facial_landmarks,
                   'gaze_direction': gaze_direction,
                   'phq_score_gt': self.phq_score_gt[idx],
                   'phq_binay_gt': self.phq_binay_gt[idx],
                   'phq_subscores_gt': self.phq_subscores_gt[idx],
                   'gender_gt': self.gender_gt[idx]}

        if self.transform:
            session = self.transform(session)

        return session

    def load_facial_landmarks(self, facial_landmarks_path, preprocess=True):
        '''
        load the facial landmarks and separately rescale
        the x, y and z coordinates
        Arguments:
            facial_landmarks_path: string, path to the 3D facial landmarks file
            preprocess: boolean, whether to normalize the data
        Return:
            landmarks: 2D numpy.ndarray, coordinate (x,y,z) of 68 3D facial points
        '''
        # load the landmarks file (the first four columns are dropped)
        landmarks = pd.read_csv(facial_landmarks_path).iloc[:, 4:].to_numpy()

        if preprocess:
            # rescale x, y, z (columns 0-67, 68-135 and 136-203)
            # NOTE(review): minmax_scaler is not defined or imported in the
            # visible part of this file; presumably it comes from
            # dataset.utils (that import is commented out) - confirm.
            landmarks_x = minmax_scaler(landmarks[:, 0:68])
            landmarks_y = minmax_scaler(landmarks[:, 68:136])
            landmarks_z = minmax_scaler(landmarks[:, 136:204])
            # concatenate together
            landmarks = np.concatenate([landmarks_x, landmarks_y, landmarks_z], axis=1)

        return landmarks
    

class Padding(object):
    ''' zero-pad (or crop) the feature matrices of a session to a fixed size '''

    def __init__(self,
                 landmarks_output_size=(58989, 204), 
                 gaze_output_size=(58989, 12)):
        '''
        Arguments:
            landmarks_output_size: int or tuple, target size of the facial
                                   landmarks matrix
            gaze_output_size: int or tuple, target size of the gaze matrix

        An int fixes only the number of rows and keeps the columns as they
        are; a tuple is the full 2D target size. Keeping the column count and
        only choosing the number of rows is the recommended usage.

        The defaults use 58989 rows. To find the maximum number of rows of
        your data, use the 'find_max_length' function in utils.
        '''
        super(Padding, self).__init__()

        for requested_size in (landmarks_output_size, gaze_output_size):
            assert isinstance(requested_size, (int, tuple))

        self.landmarks_output_size = landmarks_output_size
        self.gaze_output_size = gaze_output_size

    @staticmethod
    def _fit_to_size(matrix, target):
        ''' copy matrix into the top-left corner of a zero array of the target size '''
        n_rows, n_cols = matrix.shape
        if isinstance(target, int):
            # only the height (dim-0) is fixed, the width stays as it is
            canvas = np.zeros((target, n_cols))
            keep_rows, keep_cols = min(n_rows, target), n_cols
        else:
            # both height and width are fixed
            canvas = np.zeros(target)
            keep_rows = min(n_rows, target[0])
            keep_cols = min(n_cols, target[1])
        canvas[:keep_rows, :keep_cols] = matrix[:keep_rows, :keep_cols]
        return canvas

    def __call__(self, session):
        ''' return a new session dict whose two feature matrices are padded '''
        padded_landmarks = self._fit_to_size(session['facial_landmarks'],
                                             self.landmarks_output_size)
        padded_gaze = self._fit_to_size(session['gaze_direction'],
                                        self.gaze_output_size)

        # summary: features are replaced, everything else is passed through
        padded_session = {'patientID': session['patientID'],
                          'session_path': session['session_path'],
                          'facial_landmarks': padded_landmarks,
                          'gaze_direction': padded_gaze}
        for label_key in ('phq_score_gt', 'phq_binay_gt', 'phq_subscores_gt', 'gender_gt'):
            padded_session[label_key] = session[label_key]

        return padded_session

    
# class Rescale(object):
#     """Rescale the image in a sample to a given size.
#     Arguments:
#         output_size:(tuple or int),  Desired output size. If tuple, output is
#             matched to output_size. If int, smaller of image edges is matched
#             to output_size keeping aspect ratio the same.
#     """

#     def __init__(self, output_size=(256, 256)):
#         assert isinstance(output_size, (int, tuple))
#         self.output_size = output_size

#     def __call__(self, session):
#         audio = session['audio']

#         h, w = audio.shape[:2]

#         if isinstance(self.output_size, int):
#             if h > w:
#                 new_h, new_w = self.output_size * h / w, self.output_size
#             else:
#                 new_h, new_w = self.output_size, self.output_size * w / h
#         else:
#             new_h, new_w = self.output_size

#         new_h, new_w = int(new_h), int(new_w)

#         rescaled_audio = transform.resize(audio, (new_h, new_w))

#         # summary
#         rescaled_session = {'patientID': session['patientID'],
#                             'session_path': session['session_path'],
#                             'audio': rescaled_audio,
#                             'phq_score_gt': session['phq_score_gt'],
#                             'phq_binay_gt': session['phq_binay_gt'],
#                             'phq_subscores_gt': session['phq_subscores_gt'],
#                             'gender_gt': session['gender_gt']}

#         return rescaled_session


# class RandomCrop(object):
#     """Crop randomly the image in a sample.
#     Arguments:
#         output_size:(tuple or int), Desired output size. If int, square crop
#             is made.
#     """

#     def __init__(self, output_size=(224, 224)):
#         assert isinstance(output_size, (int, tuple))

#         if isinstance(output_size, int):
#             self.output_size = (output_size, output_size)
#         else:
#             assert len(output_size) == 2
#             self.output_size = output_size

#     def __call__(self, session):
#         audio = session['audio']

#         h, w = audio.shape[:2]
#         new_h, new_w = self.output_size

#         top = np.random.randint(0, h - new_h)
#         left = np.random.randint(0, w - new_w)

#         cropped_audio = audio[top:top + new_h, left:left + new_w]

#         # summary
#         cropped_session = {'patientID': session['patientID'],
#                            'session_path': session['session_path'],
#                            'audio': cropped_audio,
#                            'phq_score_gt': session['phq_score_gt'],
#                            'phq_binay_gt': session['phq_binay_gt'],
#                            'phq_subscores_gt': session['phq_subscores_gt'],
#                            'gender_gt': session['gender_gt']}

#         return cropped_session


class ToTensor(object):
    """Turn the arrays and numeric labels of a session into float tensors.

    'patientID' and 'session_path' are passed through unchanged.
    """

    def __call__(self, session):

        landmarks = session['facial_landmarks']
        gaze = session['gaze_direction']

        def array_to_float(array):
            # ndarray -> FloatTensor
            return torch.from_numpy(array).type(torch.FloatTensor)

        def value_to_float(value):
            # single number -> FloatTensor
            return torch.tensor(value).type(torch.FloatTensor)

        converted_session = {}
        converted_session['patientID'] = session['patientID']
        converted_session['session_path'] = session['session_path']
        converted_session['facial_landmarks'] = array_to_float(landmarks)
        converted_session['gaze_direction'] = array_to_float(gaze)
        converted_session['phq_score_gt'] = value_to_float(session['phq_score_gt'])
        converted_session['phq_binay_gt'] = value_to_float(session['phq_binay_gt'])
        converted_session['phq_subscores_gt'] = array_to_float(session['phq_subscores_gt'])
        converted_session['gender_gt'] = value_to_float(session['gender_gt'])

        return converted_session

In [128]:
# Run the cleaning routine on the loaded label table and display the result.
# NOTE(review): in this file pre_check only exists as a method of
# DepressionDataset; this bare call presumably relied on a stand-alone copy
# defined in the notebook session - confirm.
pre_check(data)

Unnamed: 0,Participant_ID,PHQ8_Binary,PHQ8_Score,Gender,PHQ8_NoInterest,PHQ8_Depressed,PHQ8_Sleep,PHQ8_Tired,PHQ8_Appetite,PHQ8_Failure,PHQ8_Concentrating,PHQ8_Moving
0,303,0,0,0,0,0,0,0,0,0,0,0
1,304,0,6,0,0,1,1,2,2,0,0,0
2,305,0,7,1,0,1,1,2,2,1,0,0
3,310,0,4,1,1,1,0,0,0,1,1,0
4,312,0,2,1,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
102,485,0,2,1,0,1,0,0,0,0,0,1
103,486,0,4,0,1,1,0,1,0,1,0,0
104,487,0,0,0,0,0,0,0,0,0,0,0
105,488,0,0,0,0,0,0,0,0,0,0,0


In [71]:
# Look at the row at position 64; in the recorded output below it is
# participant 409, whose PHQ8_Binary is 0 although PHQ8_Score is 10.
data.iloc[64]

Participant_ID        409
PHQ8_Binary             0
PHQ8_Score             10
Gender                  1
PHQ8_NoInterest         1
PHQ8_Depressed          1
PHQ8_Sleep              2
PHQ8_Tired              3
PHQ8_Appetite           0
PHQ8_Failure            1
PHQ8_Concentrating      2
PHQ8_Moving             0
Name: 64, dtype: int64

In [40]:
# Display the label table as currently held in the notebook session.
data

Unnamed: 0,Participant_ID,PHQ8_Binary,PHQ8_Score,Gender,PHQ8_NoInterest,PHQ8_Depressed,PHQ8_Sleep,PHQ8_Tired,PHQ8_Appetite,PHQ8_Failure,PHQ8_Concentrating,PHQ8_Moving
0,303,0,0,0,0,0,0,0,0,0,0,0
1,304,0,6,0,0,1,1,2,2,0,0,0
2,305,0,7,1,0,1,1,2,2,1,0,0
3,310,0,4,1,1,1,0,0,0,1,1,0
4,312,0,2,1,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
102,485,0,2,1,0,1,0,0,0,0,0,1
103,486,0,4,0,1,1,0,1,0,1,0,0
104,487,0,0,0,0,0,0,0,0,0,0,0
105,488,0,0,0,0,0,0,0,0,0,0,0


In [2]:
# Folder of the local dataset copy; the three split CSVs sit directly inside it.
root_dir = os.path.join('C:/Users/denni/Documents/KIT Studium/Bachelorarbeit', 'DAIC-WOZ Dataset')

# One path per split table: train, validation (dev) and test.
train_data_path, valid_data_path, test_data_path = (
    os.path.join(root_dir, csv_name)
    for csv_name in ('train_split_Depression_AVEC2017.csv',
                     'dev_split_Depression_AVEC2017.csv',
                     'full_test_split.csv')
)

In [39]:
# Read the three split tables (train, validation, test) into DataFrames.
train_data, valid_data, test_data = map(
    pd.read_csv, (train_data_path, valid_data_path, test_data_path))

In [107]:
# Participant IDs of every split as int16 arrays.
train_patient_IDs, valid_patient_IDs, test_patient_IDs = (
    split_table['Participant_ID'].to_numpy().astype(np.int16)
    for split_table in (train_data, valid_data, test_data)
)

# Show how many sessions each split holds.
tuple(split_ids.shape
      for split_ids in (train_patient_IDs, valid_patient_IDs, test_patient_IDs))

((107,), (35,), (47,))

In [108]:
# Merge the IDs of all three splits into one ascending int16 array.
full_patient_IDs = np.concatenate((train_patient_IDs, valid_patient_IDs, test_patient_IDs))
full_patient_IDs.sort()
full_patient_IDs = full_patient_IDs.astype(np.int16)

# Show the merged list together with its shape.
full_patient_IDs, full_patient_IDs.shape

(array([300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312,
        313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325,
        326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338,
        339, 340, 341, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352,
        353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365,
        366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378,
        379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391,
        392, 393, 395, 396, 397, 399, 400, 401, 402, 403, 404, 405, 406,
        407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
        420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432,
        433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445,
        446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458,
        459, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472,
        473, 474, 475, 476, 477, 478, 479, 480, 481

In [92]:
# Check the element type of the merged ID array.
type(full_patient_IDs[0])

numpy.int16

In [81]:
# Position of participant 409 inside the sorted, merged ID array.
np.where(full_patient_IDs==409)

(array([106], dtype=int64),)

In [94]:
# Number of entries in the binary labels derived from the PHQ score.
# NOTE(review): in this file phq_converted_binary is a local of pre_check;
# presumably it was also defined at notebook scope - confirm.
len(phq_converted_binary)

107

In [99]:
# Hand-pasted 0/1 vector used for a spot check.
# NOTE(review): presumably binary labels ordered like the merged patient ID
# list, since index 106 is looked up right after locating participant 409
# at position 106 - confirm.
a = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# only the last expression of the cell is shown in the recorded output
len(a)
a[106]

1

In [111]:
# load audio file
# Reads the pre-computed audio database (one HDF5 file) fully into memory.
import h5py


audio_root = 'D:/DAIC-WOZ_dataset/Audio/logmel_snv_exp'
audio_path = os.path.join(audio_root, 'complete_database.h5')


# open read-only; the slices below copy the data out, so the arrays stay
# usable after the file is closed
with h5py.File(audio_path, 'r') as h5:
    # only column 0 of the 'features' dataset is kept
    features = h5['features'][:, 0]
    labels = h5['class'][:]
    scores = h5['score'][:]
    # the 'folder' dataset is used as the list of patient IDs
    patientIDs = h5['folder'][:]
    index = h5['index'][:]

In [112]:
# Display the patient IDs read from the audio database.
patientIDs

array([300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312,
       313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325,
       326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338,
       339, 340, 341, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352,
       353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365,
       366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378,
       379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391,
       392, 393, 395, 396, 397, 399, 400, 401, 402, 403, 404, 405, 406,
       407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
       420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432,
       433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445,
       446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458,
       459, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472,
       473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 48

In [122]:
# For every training ID, look up its position in the merged ID array and
# collect the positions into one flat array (squeeze drops the leftover
# length-1 axis).
# NOTE(review): assumes each training ID occurs exactly once in
# full_patient_IDs - confirm.
np.hstack([np.where(full_patient_IDs==i) for i in train_patient_IDs]).squeeze()

array([  3,   4,   5,  10,  12,  13,  15,  16,  17,  18,  19,  20,  21,
        22,  24,  25,  26,  27,  28,  30,  33,  36,  38,  39,  40,  41,
        42,  43,  44,  46,  47,  49,  50,  51,  52,  54,  55,  56,  57,
        59,  61,  62,  63,  65,  67,  68,  69,  70,  71,  73,  74,  75,
        78,  79,  82,  84,  85,  90,  91,  92,  95,  97,  98,  99, 106,
       109, 111, 112, 113, 116, 120, 122, 123, 124, 125, 126, 127, 130,
       131, 134, 138, 140, 141, 142, 143, 144, 145, 146, 151, 152, 153,
       154, 156, 159, 160, 164, 167, 169, 170, 171, 174, 175, 181, 182,
       183, 184, 187], dtype=int64)

In [139]:
# Position of participant 303 inside the merged ID array; [0][0] takes the
# first match out of the tuple returned by np.where.
idx_in_whole_dataset = np.where(full_patient_IDs==303)[0][0]
idx_in_whole_dataset

3

In [145]:
# Reshape the flat feature vector of that session into 80 rows.
# NOTE(review): presumably 80 log-mel bands by time frames, judging by the
# 'logmel' folder name - confirm.
features[idx_in_whole_dataset].reshape(80, -1)

array([[ 1.32743   ,  1.3249494 ,  1.2658678 , ...,  1.2685912 ,
         1.392497  ,  1.4017674 ],
       [ 1.107519  ,  1.202885  ,  1.1357735 , ...,  0.80705774,
         0.7334303 ,  0.7568188 ],
       [ 1.2476406 ,  0.8866333 ,  0.44323593, ...,  0.65998703,
         0.6704184 ,  0.39679855],
       ...,
       [-1.5545362 , -1.5219041 , -1.6075579 , ..., -1.7362987 ,
        -1.6579329 , -1.7135811 ],
       [-1.7514281 , -1.7431799 , -1.7791928 , ..., -1.8250825 ,
        -1.8601888 , -1.9085772 ],
       [-1.8333642 , -1.91712   , -1.9631115 , ..., -2.0160012 ,
        -1.976898  , -1.998173  ]], dtype=float32)

In [125]:
# Pick the audio features of all training sessions by their positions in the
# merged ID array.
# NOTE(review): assumes the rows of the audio database are ordered like
# full_patient_IDs - confirm.
features[np.hstack([np.where(full_patient_IDs==i) for i in train_patient_IDs]).squeeze()]

array([array([ 1.32743  ,  1.3249494,  1.2658678, ..., -2.0160012, -1.976898 ,
              -1.998173 ], dtype=float32)                                     ,
       array([ 1.0601646,  1.2481416,  1.3038456, ..., -1.7629714, -1.7911508,
              -1.7723138], dtype=float32)                                     ,
       array([ 1.2560773,  1.0517627,  0.9097564, ..., -1.3292217, -1.3368998,
              -1.4047761], dtype=float32)                                     ,
       array([ 1.5073284,  1.4591094,  1.2482179, ..., -1.1067563, -1.0654975,
              -1.1343845], dtype=float32)                                     ,
       array([ 1.8171296,  1.8722442,  1.730458 , ..., -1.416058 , -1.4015712,
              -1.4170307], dtype=float32)                                     ,
       array([ 0.82127744,  0.882391  ,  0.9713714 , ..., -1.4978169 ,
              -1.4576048 , -1.418537  ], dtype=float32)               ,
       array([ 1.3755808,  1.2391801,  1.3063501, ..., -1.4128

In [142]:
# with h5py.File('complete_database.h5', 'r') as h5:
#     features = h5['features'][:, 0]
#     labels = h5['class'][:]
#     scores = h5['score'][:]
#     folders = h5['folder'][:]
#     genders = h5['gender'][:] 
#     index = h5['index'][:]

# test audio loading

In [141]:
import h5py

In [147]:
class DepressionDataset(Dataset):
    '''Create a training, development, or test dataset and load the
    features of a participant whenever an item is requested.

    Arguments:
        root_dir: string, directory that contains the three split CSV files
        mode: string, one of 'train', 'validation', 'test'
        transform: optional callable applied to each session dict
        audio_root: string, directory that contains 'complete_database.h5';
                    the default is the machine-specific path used so far
    '''

    # Subset of training sessions used while debugging on a laptop.
    # Set to None to use the complete training split.
    DEBUG_TRAIN_PATIENT_IDS = (303, 321, 362, 363, 426)

    def __init__(self,
                 root_dir,
                 mode,
                 transform=None,
                 audio_root='D:/DAIC-WOZ_dataset/Audio/logmel_snv_exp'):
        super(DepressionDataset, self).__init__()

        # only train, validation, test dataset allowed
        assert mode in ["train", "validation", "test"], \
            "Argument --mode could only be ['train', 'validation', 'test']"

        self.mode = mode
        self.root_dir = root_dir
        self.transform = transform
        self.audio_root = audio_root
        self.train_data_path = os.path.join(self.root_dir, 'train_split_Depression_AVEC2017.csv')
        self.valid_data_path = os.path.join(self.root_dir, 'dev_split_Depression_AVEC2017.csv')
        self.test_data_path = os.path.join(self.root_dir, 'full_test_split.csv')

        # Sorted list of every participant ID across the three splits.
        # __getitem__ uses the position of an ID in this list as the row
        # index into the HDF5 feature database.
        # NOTE(review): this assumes the database rows are ordered by
        # ascending participant ID - confirm against the extraction script.
        split_paths = (self.train_data_path, self.valid_data_path, self.test_data_path)
        self.full_patient_IDs = np.sort(np.hstack(
            [pd.read_csv(path)['Participant_ID'].to_numpy() for path in split_paths]))

        # load training data # 107 sessions
        if self.mode == "train":
            # pre-checking and cleaning the data
            self.data_df = self.pre_check(pd.read_csv(self.train_data_path))
            # Restrict the table itself to the debugging subset, so that
            # patient IDs, ground truth and sampler weights stay aligned.
            if self.DEBUG_TRAIN_PATIENT_IDS is not None:
                keep = self.data_df['Participant_ID'].isin(list(self.DEBUG_TRAIN_PATIENT_IDS))
                self.data_df = self.data_df[keep].reset_index(drop=True)
            self._store_ground_truth('PHQ8_Binary', 'PHQ8_Score')
            self.phq_subscores_gt = self.data_df.iloc[:, 4:].to_numpy()

        # load development data # 35 sessions
        elif self.mode == "validation":
            # pre-checking and cleaning the data
            self.data_df = self.pre_check(pd.read_csv(self.valid_data_path))
            self._store_ground_truth('PHQ8_Binary', 'PHQ8_Score')
            self.phq_subscores_gt = self.data_df.iloc[:, 4:].to_numpy()

        # load test data # 47 sessions
        else:
            # pre-checking and cleaning the data
            self.data_df = self.pre_check(pd.read_csv(self.test_data_path))
            self._store_ground_truth('PHQ_Binary', 'PHQ_Score')
            # subscores in test data are not provided, thus we initialize it with 0 to avoid error for DataLoader
            self.phq_subscores_gt = np.zeros((self.patientIDs.shape[0], 8))

        # get sampler: every sample is weighted by the inverse frequency of
        # its class. return_inverse maps each label to its class position,
        # which also works for float labels or when a class is missing.
        target = self.phq_binay_gt
        _, class_index, class_sample_count = np.unique(target,
                                                       return_inverse=True,
                                                       return_counts=True)
        weight = 1. / class_sample_count
        samples_weight = weight[class_index.reshape(-1)]
        samples_weight = torch.from_numpy(samples_weight).double()
        self.sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    def _store_ground_truth(self, binary_column, score_column):
        '''Copy the ID, PHQ and gender columns of self.data_df into arrays.'''
        self.patientIDs = self.data_df['Participant_ID'].to_numpy()
        self.phq_binay_gt = self.data_df[binary_column].to_numpy()
        self.phq_score_gt = self.data_df[score_column].to_numpy()
        self.gender_gt = self.data_df['Gender'].to_numpy()

    def pre_check(self, data):
        '''
        Basic cleaning process to make sure no missing value
        and that the sum of each PHQ subscore equals to PHQ score
        Argument:
            data: pandas.DataFrame with the participant ID in column 0,
                  the PHQ binary label in column 1, the PHQ score in
                  column 2 and the PHQ subscores from column 4 onwards
        Return:
            data: pandas.DataFrame, the cleaned table
        Raise:
            ValueError: if (outside of test mode) the subscores of a row
                        do not sum up to its PHQ score
        '''
        # make sure no NaN, Inf, -Inf
        if data.isin([np.nan, np.inf, -np.inf]).any(axis=1).sum():
            print('Replacing NaN, Inf, or -Inf ...')
            data = data.replace([np.inf, -np.inf, np.nan], 0)

        # compare the sum of each PHQ subscore to PHQ score
        unequal = data.iloc[:, 4:].sum(axis=1) != data.iloc[:, 2]
        if unequal.any() and self.mode != 'test':
            lines = np.where(unequal)
            raise ValueError(("The sum of each PHQ subscore at line {} "
                              "is unequal to the PHQ score").format(lines[0]))

        # check whether the PHQ binary is correctly converted based on PHQ score
        phq_binary = data.iloc[:, 1].to_numpy()
        phq_score = data.iloc[:, 2].to_numpy()
        phq_converted_binary = np.where(phq_score > 9, 1, 0)
        # positions of the rows whose stored label disagrees with the score
        mismatch = np.flatnonzero(phq_converted_binary != phq_binary)
        if mismatch.size:
            data.iloc[mismatch, 1] = phq_converted_binary[mismatch]

        return data

    def __len__(self):
        return len(self.patientIDs)

    def __iter__(self):
        return iter(self.patientIDs)

    def __getitem__(self, idx):
        '''
        Essential function for creating dataset in PyTorch, which will automatically be
        called in Dataloader and load all the extracted features of the patient in the Batch
        based on the index of self.patientIDs
        Argument:
            idx: int, index of the patient ID in self.patientIDs
        Return:
            session: dict, contains all the extracted features and ground truth of a patient/session
        '''
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # get the patient session path
        session_num = self.patientIDs[idx]
        session_path = os.path.join(self.root_dir, '{}_P'.format(session_num))

        # TODO: if other features (facial landmarks, gaze direction, text
        #       embeddings) are needed, load them here and add them to the
        #       session dict below.

        # load the pre-extracted audio feature of this session
        audio_path = os.path.join(self.audio_root, 'complete_database.h5')
        # row of this participant inside the complete database
        idx_in_whole_dataset = int(np.where(self.full_patient_IDs == self.patientIDs[idx])[0][0])
        with h5py.File(audio_path, 'r') as h5:
            # read only the requested entry instead of the whole column
            audio_feature = h5['features'][idx_in_whole_dataset, 0]
        # stored flat; restore to 80 rows and transpose so that the 80
        # values of a frame lie along the last axis
        audio = np.transpose(audio_feature.reshape(80, -1))

        # summary
        session = {'patientID': session_num,
                   'session_path': session_path,
                   'audio': audio,
                   'phq_score_gt': self.phq_score_gt[idx],
                   'phq_binay_gt': self.phq_binay_gt[idx],
                   'phq_subscores_gt': self.phq_subscores_gt[idx],
                   'gender_gt': self.gender_gt[idx]}

        if self.transform:
            session = self.transform(session)

        return session

    def load_audio(self, audio_path,
                   spectro_type='mel_spectrogram',
                   frame_size = 2048,
                   hop_size = 533,
                   sample_rate = 16000,
                   num_mel_bands = 80,
                   preprocess=True):
        '''
        Standard method of loading audio and extracting audio features
        with Short-Time Fourier Transform by utilizing librosa library

        NOTE(review): this method needs `librosa` and `audio_minmax_scaler`
        in the module namespace; both imports are currently commented out
        at the top of the notebook.

        Arguments:
            audio_path: string, absolute path to audio file
            spectro_type: string, 'spectrogram' or 'mel_spectrogram'
            frame_size: int, FFT window size (n_fft)
            hop_size: int, hop length between two frames
            sample_rate: int, sampling rate used for loading the audio
            num_mel_bands: int, number of Mel bands
            preprocess: boolean, whether normalize the data
        Return:
            audio_feature: 2D numpy.ndarray, extracted audio feature (spectra) in dB
            audio_parameters: dict, the parameters used for the extraction
        '''
        # only spectrogram and mel_spectrogram are allow
        assert spectro_type in ['spectrogram', 'mel_spectrogram'],\
            "Argument --spectro could only be ['spectrogram', 'mel_spectrogram']"

        # parameter setting for Short-Time Fourier Transform
        audio_parameters = {'spectro_type': spectro_type,
                            'sample_rate': sample_rate,
                            'frame_size': frame_size,
                            'hop_size': hop_size,
                            'num_mel_bands': num_mel_bands}

        # load audio file with librosa
        # sampled_values: audio sampled values of time series
        # sr: sampling rate of audio (16kHz according to the document)
        sampled_values, sr = librosa.load(audio_path, sr=audio_parameters['sample_rate'])

        # extracting features
        if spectro_type == 'spectrogram':
            # use Short-Time Fourier Transform, return complex-valued matrix STFT coefficients
            extracted_values = librosa.stft(sampled_values,
                                            n_fft=audio_parameters['frame_size'],
                                            hop_length=audio_parameters['hop_size'])
            # calculating the spectrogram
            extracted_values = np.abs(extracted_values) ** 2

        elif spectro_type == 'mel_spectrogram':
            # get the Mel filter banks
            self.filter_banks = librosa.filters.mel(n_fft=audio_parameters['frame_size'],
                                                    sr=audio_parameters['sample_rate'],
                                                    n_mels=audio_parameters['num_mel_bands'])
            # extract the mel spectrogram; the signal is passed by keyword
            # because newer librosa releases no longer accept it positionally
            extracted_values = librosa.feature.melspectrogram(y=sampled_values,
                                                              sr=audio_parameters['sample_rate'],
                                                              n_fft=audio_parameters['frame_size'],
                                                              hop_length=audio_parameters['hop_size'],
                                                              n_mels=audio_parameters['num_mel_bands'])

        else:
            # still reachable when assertions are disabled (python -O)
            raise ValueError("The given value of spectro_type is not supported\n"
                             "'spectro_type' could only be 'spectrogram' or 'mel_spectrogram'")

        # convert amplitude to DBs
        # transpose the result so that the rows corresponds to time and column to frequence
        audio_feature = np.transpose(librosa.power_to_db(extracted_values))

        if preprocess:
            audio_feature = audio_minmax_scaler(audio_feature)

        return audio_feature, audio_parameters
    

class Padding(object):
    ''' zero-pad (or truncate) the audio matrix of a session to a fixed size '''

    def __init__(self, audio_output_size=(58989, 80)):
        '''
        Argument:
            audio_output_size: int or tuple.
                An int is the desired number of rows, the number of
                columns is kept as it is.
                A tuple is the desired 2D array size (rows, columns).

        It is recommended to keep the number of columns unchanged and
        only fix the number of rows. The default is (58989, 80).
        '''
        super(Padding, self).__init__()
        assert isinstance(audio_output_size, (int, tuple))
        self.audio_output_size = audio_output_size

    def __call__(self, session):
        source = session['audio']
        rows, cols = source.shape
        target = self.audio_output_size

        if isinstance(target, int):
            # fixed number of rows, original width
            canvas = np.zeros((target, cols))
            keep_rows, keep_cols = min(rows, target), cols
        else:
            # fixed number of rows and columns
            keep_rows = min(rows, target[0])
            keep_cols = min(cols, target[1])
            canvas = np.zeros(target)

        # copy the part of the audio that fits, the rest stays zero
        canvas[:keep_rows, :keep_cols] = source[:keep_rows, :keep_cols]

        # summary
        session_keys = ('patientID', 'session_path', 'audio', 'phq_score_gt',
                        'phq_binay_gt', 'phq_subscores_gt', 'gender_gt')
        padded_session = {key: session[key] for key in session_keys}
        padded_session['audio'] = canvas

        return padded_session

    
class Rescale(object):
    """Resize the audio matrix of a session to a given size.
    Arguments:
        output_size:(tuple or int), Desired output size. A tuple is used
            directly as (rows, columns). With an int the smaller edge is
            matched to output_size and the aspect ratio is kept.
    """

    def __init__(self, output_size=(256, 256)):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, session):
        audio = session['audio']
        height, width = audio.shape[:2]
        size = self.output_size

        if not isinstance(size, int):
            # explicit (rows, columns) target
            target_h, target_w = size
        elif height > width:
            # width is the smaller edge
            target_h, target_w = size * height / width, size
        else:
            # height is the smaller (or equal) edge
            target_h, target_w = size, size * width / height

        resized = transform.resize(audio, (int(target_h), int(target_w)))

        # summary
        session_keys = ('patientID', 'session_path', 'audio', 'phq_score_gt',
                        'phq_binay_gt', 'phq_subscores_gt', 'gender_gt')
        rescaled_session = {key: session[key] for key in session_keys}
        rescaled_session['audio'] = resized

        return rescaled_session


class RandomCrop(object):
    """Crop a random window out of the audio matrix of a session.
    Arguments:
        output_size:(tuple or int), Desired output size. If int, square crop
            is made.
    """

    def __init__(self, output_size=(224, 224)):
        assert isinstance(output_size, (int, tuple))

        if isinstance(output_size, int):
            self.output_size = (output_size, output_size)
        else:
            assert len(output_size) == 2
            self.output_size = output_size

    def __call__(self, session):
        '''
        Argument:
            session: dict, must contain a 2D array under the key 'audio'
        Return:
            cropped_session: dict, same entries with 'audio' replaced by the crop
        Raise:
            ValueError: if the requested crop is larger than the audio matrix
        '''
        audio = session['audio']

        h, w = audio.shape[:2]
        new_h, new_w = self.output_size

        if new_h > h or new_w > w:
            raise ValueError("Crop size {} is larger than the audio size {}"
                             .format((new_h, new_w), (h, w)))

        # randint excludes the upper bound, so "+ 1" keeps the last valid
        # offset selectable and allows a crop as large as the input
        top = np.random.randint(0, h - new_h + 1)
        left = np.random.randint(0, w - new_w + 1)

        cropped_audio = audio[top:top + new_h, left:left + new_w]

        # summary
        cropped_session = {'patientID': session['patientID'],
                           'session_path': session['session_path'],
                           'audio': cropped_audio,
                           'phq_score_gt': session['phq_score_gt'],
                           'phq_binay_gt': session['phq_binay_gt'],
                           'phq_subscores_gt': session['phq_subscores_gt'],
                           'gender_gt': session['gender_gt']}

        return cropped_session


class ToTensor(object):
    """Turn the numeric entries of a session into float tensors.

    'audio' and 'phq_subscores_gt' are expected to be ndarrays, the
    remaining ground-truth entries scalars. 'patientID' and
    'session_path' are passed through untouched.
    """

    def __call__(self, session):
        def as_float(tensor):
            return tensor.type(torch.FloatTensor)

        converted_session = {'patientID': session['patientID'],
                             'session_path': session['session_path']}
        converted_session['audio'] = as_float(torch.from_numpy(session['audio']))
        for key in ('phq_score_gt', 'phq_binay_gt'):
            converted_session[key] = as_float(torch.tensor(session[key]))
        converted_session['phq_subscores_gt'] = as_float(torch.from_numpy(session['phq_subscores_gt']))
        converted_session['gender_gt'] = as_float(torch.tensor(session['gender_gt']))

        return converted_session

In [148]:
if __name__ == '__main__':
    import sys
    from torch.utils.data import DataLoader
    from torchvision import transforms

    # sys.path.append('C:/Users/denni/Documents/KIT Studium/Bachelorarbeit')
    # local project folder that holds the 'DAIC-WOZ Dataset' directory
    root_dir = 'C:/Users/denni/Documents/KIT Studium/Bachelorarbeit'
    dataset_dir = os.path.join(root_dir, 'DAIC-WOZ Dataset')

    # test 3: try to load the dataset with DataLoader
    # every session is padded to a common size and then converted to tensors
    pipeline = transforms.Compose([Padding((49109, 80)), ToTensor()])
    transformed_dataset = DepressionDataset(dataset_dir, 'train', transform=pipeline)

    # create dataloader
    loader_options = {'batch_size': 2, 'shuffle': False, 'num_workers': 0}
    dataloader = DataLoader(transformed_dataset, **loader_options)

    # iterate through batches and report the audio shape of each one
    for i_batch, sample_batched in enumerate(dataloader):
        print('Batch number: ', i_batch, ', audio: ', sample_batched['audio'].size())
        print('=================================')
        # keep the second batch around for later inspection
        if i_batch == 1:
            test_batch = sample_batched


Batch number:  0 , audio:  torch.Size([2, 49109, 80])
Batch number:  1 , audio:  torch.Size([2, 49109, 80])
Batch number:  2 , audio:  torch.Size([1, 49109, 80])
