In [None]:
# Colab setup: upgrade the Hugging Face `datasets` package quietly (-q).
# NOTE(review): the version is unpinned, so a later re-run may install a release that no
# longer provides every name imported below (e.g. `load_metric`) — consider pinning.
!pip install --upgrade datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ibis-framework 7.1.0 requires pyarrow<15,>=2, but you have pyarrow 15.0.0 which is incompatible.[0m[31m
[0m

In [None]:
# Colab setup: install Hugging Face `transformers` quietly (-q); the version is unpinned.
!pip install transformers -q

In [None]:
import os
import gc
import math
import librosa
import numpy as np
import pandas as pd
import librosa as lb
import matplotlib.pyplot as plt

from scipy import signal
from scipy.fft import fftshift

from collections import defaultdict
from itertools import islice
from typing import Any
from sklearn.model_selection import StratifiedShuffleSplit

import torch
import torchaudio

from IPython import display
from torch import nn
from torch.utils.data import DataLoader, Dataset, Subset
from tqdm.auto import trange
from tqdm import tqdm
tqdm.pandas()

In [None]:
import re
import seaborn as sns
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import (
    load_dataset,
    load_metric,
    load_from_disk,
    Audio,
    concatenate_datasets
)
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2Tokenizer,
    Wav2Vec2Config,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2CTCTokenizer,
    TrainingArguments
)

In [None]:
# Target sampling rate (Hz); recordings with a different rate are resampled to it.
SAMPLE_RATE = 16000
# Directory that file names from the `audio` column are joined onto when loading.
AUDIO_DIR = '/content/psychiatric_disorders_data/wav files'

# Tab-separated files the train / test / validation splits are written to.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Laboratory/data/train.csv'
TEST_CSV_PATH = '/content/drive/MyDrive/Laboratory/data/test.csv'
VAL_CSV_PATH = '/content/drive/MyDrive/Laboratory/data/val.csv'

# Hugging Face checkpoint the Wav2Vec2 feature extractor is loaded from.
MODEL_NAME = 'facebook/wav2vec2-large-xlsr-53'

### 1. Open data

In [None]:
from zipfile import ZipFile

# Archive with the raw dataset on the mounted Google Drive.
path = '/content/drive/MyDrive/Laboratory/psychiatric_disorders_data.zip'
# Every later cell reads the data from /content/psychiatric_disorders_data, so extract there
# explicitly instead of relying on the kernel's current working directory.
extract_root = '/content'

# Skip the (slow) extraction when the data folder is already present.
if not os.path.isdir(os.path.join(extract_root, 'psychiatric_disorders_data')):
    with ZipFile(path, 'r') as archive:
        archive.extractall(extract_root)

In [None]:
# Load the participant metadata spreadsheet from the extracted archive.
df = pd.read_excel('/content/psychiatric_disorders_data/PsychiatricDiscourse_participant.data.xlsx')

In [None]:
df.head()

Unnamed: 0,ID,group,diagnosis,sex,age,education.level,education.years,depression.symptoms,dep.scale,thought.disorder.symptoms,td.scales
0,PD-001,patient,schizotypal.disorder,female,19.0,secondary,11,1,HDRS,0,SAPS
1,PD-002,patient,bipolar.affective.disorder,female,26.0,higher,17,1,HDRS,0,SAPS
2,PD-003,patient,schizotypal.disorder,female,17.0,secondary,10,0,HDRS,0,SAPS
3,PD-004,patient,borderline.personality.disorder,female,16.0,secondary,9,1,HDRS,0,SAPS
4,PD-005,patient,schizotypal.disorder,female,19.0,secondary,11,0,HDRS,0,SAPS


In [None]:
df.shape

(346, 11)

In [None]:
# Boolean masks over the two symptom indicator columns.
without_thought_disorder = df['thought.disorder.symptoms'] == 0.
without_depression = df['depression.symptoms'] == 0.

# Participants with depression symptoms but no thought-disorder symptoms.
depression_only = df.loc[without_thought_disorder & ~without_depression]

# Participants with neither kind of symptoms.
control_group = df.loc[without_depression & without_thought_disorder]

# Only these two groups are used from here on.
df = pd.concat([depression_only, control_group])

In [None]:
# Binarise the depression column: 0 stays 0, every other value becomes 1.
df['depression.symptoms'] = df['depression.symptoms'].map(lambda score: 0 if score == 0 else 1)

In [None]:
DRIVE_PATH = '/content/psychiatric_disorders_data/wav files/'


def get_patient_audio(row, data_folder=DRIVE_PATH, return_uncomplete=False):
    """Return the names of all files in `data_folder` whose name contains `row.ID`.

    :param row: object with an `ID` attribute (e.g. a DataFrame row)
    :param data_folder: directory that is listed for matching files
    :param return_uncomplete: accepted for compatibility; not used by the function
    :returns: list of matching file names (names only, not full paths)
    """
    patient_id = row.ID
    return [name for name in os.listdir(data_folder) if patient_id in name]

# Collect, for every participant, the list of recording file names that contain their ID.
df['audio'] = df.apply(get_patient_audio, axis=1)

# exclude patients with num of recordings other than 3
# .copy() detaches the filtered frame from the original, so the column assignments in the
# following cells operate on an independent DataFrame and do not trigger SettingWithCopyWarning.
df = df[df.audio.apply(len) == 3].copy()

In [None]:
# Topic keywords (as they appear in file names) grouped by task domain.
task_mapping = {
    'narrative': ['sportsman', 'adventure', 'winterday'],
    'story': ['present', 'trip', 'party'],
    'instruction': ['chair', 'table', 'bench'],
}


def get_domain_audio(row, domain):
    """Pick the recording from `row.audio` that belongs to the given task domain.

    A file belongs to a domain when its name contains one of the domain's topic keywords.
    Matches are ordered by keyword first, then by position in `row.audio`; when several
    files match they are printed and the first one is returned.

    :returns: the first matching file name, or None when nothing matches
    """
    matches = [
        file_name
        for topic in task_mapping[domain]
        for file_name in row.audio
        if topic in file_name
    ]

    # More than one hit is unexpected; surface it for manual inspection.
    if len(matches) > 1:
        print(matches)

    if not matches:
        return None
    return matches[0]

# Add one column per task domain holding that row's matching file name (or None).
for domain in task_mapping:
    df.loc[:, domain] = df.apply(get_domain_audio, axis=1, domain=domain)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, domain] = df.apply(get_domain_audio, axis=1, domain=domain)


In [None]:
df.head(2)

Unnamed: 0,ID,group,diagnosis,sex,age,education.level,education.years,depression.symptoms,dep.scale,thought.disorder.symptoms,td.scales,audio,narrative,story,instruction
0,PD-001,patient,schizotypal.disorder,female,19.0,secondary,11,1,HDRS,0,SAPS,"[PD-001-pers-1-present.wav, PD-001-instr-1-cha...",PD-001-pic-1-sportsman.wav,PD-001-pers-1-present.wav,PD-001-instr-1-chair.wav
1,PD-002,patient,bipolar.affective.disorder,female,26.0,higher,17,1,HDRS,0,SAPS,"[PD-002-pic-1-adventure.wav, PD-002-pers-1-pre...",PD-002-pic-1-adventure.wav,PD-002-pers-1-present.wav,PD-002-instr-1-chair.wav


### Data split

In [None]:
# 80/20 split of the rows of `df`, stratified on the binary depression label.
train_sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=42)
train_index, test_index = next(train_sss.split(df, df['depression.symptoms']))
train_df = df.iloc[train_index]
test_df = df.iloc[test_index]

# Halve the held-out 20% (again stratified) into validation and test parts.
test_sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, train_size=0.5, random_state=42)
test_index, val_index = next(test_sss.split(test_df, test_df['depression.symptoms']))
# val_df has to be taken first: both index arrays refer to the not-yet-reduced test_df.
val_df = test_df.iloc[val_index]
test_df = test_df.iloc[test_index]

In [None]:
print('train size:', train_df.shape)
print('val size:', val_df.shape)
print('test size:', test_df.shape)

train size: (242, 15)
val size: (30, 15)
test size: (30, 15)


In [None]:
# Turn each row's list of recordings into one row per audio file and keep only
# the ID, the binary label and the file name.
columns = ['ID', 'depression.symptoms', 'audio']
train_df = train_df.explode('audio')[columns]
val_df = val_df.explode('audio')[columns]
test_df = test_df.explode('audio')[columns]

In [None]:
# Persist the three splits as tab-separated UTF-8 files without the index column.
train_df.to_csv(TRAIN_CSV_PATH, sep="\t", encoding="utf-8", index=False)
test_df.to_csv(TEST_CSV_PATH, sep="\t", encoding="utf-8", index=False)
val_df.to_csv(VAL_CSV_PATH, sep="\t", encoding="utf-8", index=False)

Get data files

In [None]:
# Map split names to the split files and load them with the generic `csv` builder,
# using tab as the delimiter (matching how the files were written).
data_files = {
    "train": TRAIN_CSV_PATH,
    "validation": VAL_CSV_PATH,
    "test": TEST_CSV_PATH
}
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Convenience handles for the three splits.
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Printing a split shows its feature names and number of rows.
print(train_dataset)
print(val_dataset)
print(test_dataset)

Dataset({
    features: ['ID', 'depression.symptoms', 'audio'],
    num_rows: 726
})
Dataset({
    features: ['ID', 'depression.symptoms', 'audio'],
    num_rows: 90
})
Dataset({
    features: ['ID', 'depression.symptoms', 'audio'],
    num_rows: 90
})


In [None]:
def cut_recordings(audio, to_cut: int = 5, stride: int = 0, to_pad: bool = True,
                   sample_rate: Optional[int] = None):
    '''
    Split a 1-D signal into fixed-length, possibly overlapping frames.

    :param audio: np.array vector
    :param to_cut: number of seconds of each chunk
    :param stride: step in seconds between the starts of consecutive chunks;
        0 (the default) means non-overlapping chunks, i.e. a step of `to_cut`
    :param to_pad: if True, zero-pad the end of the signal so that the last (partial)
        chunk is kept and at least one chunk is returned; if False, drop the trailing
        samples that do not fill a whole chunk
    :param sample_rate: samples per second; defaults to the module-level SAMPLE_RATE
    :returns: numpy array of chunks with shape (num_chunks, to_cut * sample_rate)
    :raises ValueError: if the chunk length or the step is not positive
    '''
    if sample_rate is None:
        sample_rate = SAMPLE_RATE

    # Work in whole samples so that frame arithmetic is exact integer arithmetic.
    segment_length = int(round(to_cut * sample_rate))
    # A zero step would mean dividing by zero below; treat it as "no overlap".
    stride_length = int(round(stride * sample_rate)) if stride else segment_length
    if segment_length <= 0 or stride_length <= 0:
        raise ValueError('to_cut and stride must describe a positive number of samples')

    audio = np.asarray(audio)
    audio_length = audio.shape[0]
    if audio_length == 0:
        return np.empty((0, segment_length), dtype=audio.dtype)

    if to_pad:
        # Samples beyond the first full frame; clamped so short signals yield one frame.
        overshoot = max(audio_length - segment_length, 0)
        numframes = -(-overshoot // stride_length) + 1  # ceil division + first frame
        pad_length = (numframes - 1) * stride_length + segment_length - audio_length
        audio = np.pad(audio, (0, pad_length), constant_values=0)
    else:
        if audio_length < segment_length:
            # Not even one complete frame fits.
            return np.empty((0, segment_length), dtype=audio.dtype)
        numframes = (audio_length - segment_length) // stride_length + 1

    # Row i holds the sample indices of frame i: start_i + [0, segment_length).
    starts = np.arange(numframes) * stride_length
    indices = starts[:, None] + np.arange(segment_length)[None, :]

    return audio[indices]

def get_audio_samples(audiopath, frame_config):
    '''
    Load one recording from AUDIO_DIR, bring it to SAMPLE_RATE, trim it and cut it into frames.

    :param audiopath: file name relative to AUDIO_DIR
    :param frame_config: {to_cut: int(sec), stride: int(sec), to_pad: bool}
    :returns: the frames produced by cut_recordings for the trimmed signal
    '''
    full_path = os.path.join(AUDIO_DIR, audiopath)

    # Read at the file's own rate first; resample only when it differs from the target.
    native_sr = lb.get_samplerate(full_path)
    waveform, _ = lb.load(full_path, sr=native_sr)
    if native_sr != SAMPLE_RATE:
        waveform = lb.resample(waveform, orig_sr=native_sr, target_sr=SAMPLE_RATE)

    # Trim leading/trailing silence using librosa's trim with a 40 dB threshold.
    trimmed, _ = lb.effects.trim(waveform, top_db=40)

    return cut_recordings(trimmed, **frame_config)

In [None]:
# Feature extractor loaded from the MODEL_NAME checkpoint; it is called on the framed audio.
processor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
# Dataset column holding the audio file name.
INPUT_COLUMN = 'audio'
# Dataset column holding the binary label.
OUTPUT_COLUMN = 'depression.symptoms'

def preprocess_stacked_speech_files(batch):
    """
    Process batch of audio files into windows of window_length with stride_length
    and return input values as well as metadata for the batch.

    Every file is cut into 5-second windows with a 2-second step (zero-padded at the end)
    and each window becomes one output row.

    :param batch: dict of columns as passed by a batched `Dataset.map`; must contain
        INPUT_COLUMN (audio file names) and OUTPUT_COLUMN (labels)
    :returns: dict with three equally long lists:
        "input_values" - one processed window per entry,
        "labels" - the label of the file the window came from,
        f"{INPUT_COLUMN}_ID" - the file name the window came from
    """

    frame_config = {
        'to_cut': 5,
        'stride': 2,
        'to_pad': True
    }
    file_names = list(batch[INPUT_COLUMN])
    labels = list(batch[OUTPUT_COLUMN])
    speech_list = [get_audio_samples(path, frame_config) for path in file_names]

    id_column = f'{INPUT_COLUMN}_ID'
    out = {"input_values": [], "labels": [], id_column: []}

    for windows, label, file_name in zip(speech_list, labels, file_names):
        n_windows = len(windows)
        if n_windows == 0:
            # Nothing to emit for this file; avoids calling the processor on an empty batch.
            continue

        # `windows` is a stack of frames, which the processor treats as a batch of clips;
        # only "input_values" is kept (the attention mask is dropped).
        processed = processor(windows, sampling_rate=SAMPLE_RATE)
        out["input_values"].extend(processed["input_values"])

        # Repeat label and file name once per window so that all output columns have the
        # same length, which a batched map requires, and windows stay traceable to files.
        out["labels"].extend([label] * n_windows)
        out[id_column].extend([file_name] * n_windows)

    return out

In [None]:
# Window every split. The original columns are dropped because the mapped function returns
# a different number of rows than it receives; each split removes its *own* columns.
train = train_dataset.map(
    preprocess_stacked_speech_files,
    batched=True,
    batch_size=16,
    remove_columns=train_dataset.column_names,
)

val = val_dataset.map(
    preprocess_stacked_speech_files,
    batched=True,
    batch_size=16,
    remove_columns=val_dataset.column_names,
)

test = test_dataset.map(
    preprocess_stacked_speech_files,
    batched=True,
    batch_size=4,
    remove_columns=test_dataset.column_names,
)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [None]:
# NOTE(review): this rebinds `train`, replacing the dataset produced by the `.map(...)` call,
# with a dataset loaded from Drive. No `save_to_disk` call is visible in the cells shown here —
# confirm where this folder is written and that it matches the current preprocessing.
TRAIN_DATASET_PATH = '/content/drive/MyDrive/Laboratory/data/train'
train = load_from_disk(TRAIN_DATASET_PATH)

Loading dataset from disk:   0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
# Convert the processed validation split to a pandas DataFrame.
val_pd = val.to_pandas()

In [None]:
val_pd.head()

Unnamed: 0,input_values,labels,audio_ID
0,"[[0.01072557, 0.012946154, 0.010644085, 0.0180...",0,PN-216-pers-1-trip.wav
1,"[[0.029042706, 0.022691382, 0.014911257, 0.030...",0,PN-216-instr-1-chair.wav
2,"[[-0.01367839, -0.014848133, -0.029146224, -0....",0,PN-216-pic-1-winterday.wav
3,"[[-0.045608684, -0.10905161, -0.11416071, -0.1...",1,PN-164-pic-1-winterday.wav
4,"[[0.045743175, 0.025785526, 0.045982063, 0.043...",1,PN-164-instr-1-chair.wav


In [None]:
# Store the processed validation frame on Drive as a pickle file.
val_pd.to_pickle('/content/drive/MyDrive/Laboratory/data/val.pickle')

In [None]:
# Same export for the processed test split: convert to pandas, then pickle to Drive.
test_pd = test.to_pandas()
test_pd.to_pickle('/content/drive/MyDrive/Laboratory/data/test.pickle')