In [1]:
from pydub import AudioSegment
from pathlib import Path
import tensorflow as tf
from glob import glob
import pandas as pd
import numpy as np
import math
import os


## Create df with all classes, speaker and files
### Parameters:

- **path**:  
  The path of the original audio files. Each folder within this path represents a dialect, and subfolders within each dialect folder should contain audio files from different speakers.  
  
- **path_aug**:  
  The path of the augmented audio files. Each folder within this path represents a dialect, and subfolders within each dialect folder should contain audio files from different speakers.  

- **name_aug**:  
  Suffix of the output file name: the resulting DataFrame is stored as './All_Files_' + name_aug + '.pkl'. If it is an empty string, the files under **path_aug** are not scanned.
  
- **s_ending**:  
  Suffix of the file name (without the '.wav' extension) that marks a standard-language recording.
  
- **d_ending**:  
  Suffix of the file name (without the '.wav' extension) that marks a dialect recording.

- **t_ending**:  
  Suffix of the file name (without the '.wav' extension) that marks a test recording.

### Returns:

Saves a DataFrame with the following columns:

- _'dialect'_: Represents the class (dialect) of the audio.
- _'speaker'_: Represents the speaker or place associated with the audio.
- _'file\_name'_: Represents the name of the audio file.
- _'length'_: Represents the duration of the audio file in milliseconds (the `len()` of the pydub `AudioSegment`).
- _'file\_path'_: Represents the path of the audio file.
- _'augmented'_: `'True'` if it is an augmented file, otherwise `'False'` (stored as strings).
- _'type'_: Represents the type (standard, dialect or test) of a file.


The DataFrame is saved to './All_Files_' + name_aug + '.pkl'.


In [2]:
def create_speaker_DF(path, path_aug, name_aug, s_ending, d_ending, t_ending):
    """Build the file index for the original (and optionally augmented) audio and pickle it.

    Parameters
    ----------
    path : str
        Root folder of the original audio files.
    path_aug : str
        Root folder of the augmented audio files. Only scanned when
        ``name_aug`` is not the empty string.
    name_aug : str
        Suffix of the output file name. An empty string means "no augmented
        data".
    s_ending, d_ending, t_ending : str
        File-stem suffixes forwarded to ``sub`` to label each file as
        'standard', 'dialect' or 'test'.

    Returns
    -------
    pandas.DataFrame
        The index that was written to ``'./All_Files_' + name_aug + '.pkl'``.
    """
    columns = ['dialect', 'speaker', 'file_name', 'length', 'file_path', 'augmented', 'type']
    df = pd.DataFrame(columns=columns)

    # The 'augmented' flag is stored as the strings 'False' / 'True'.
    df = sub(df, path, 'False', '', s_ending, d_ending, t_ending)
    if name_aug != '':
        df = sub(df, path_aug, 'True', name_aug, s_ending, d_ending, t_ending)

    # Include name_aug in the file name so an augmented run does not overwrite
    # the index of the original files. With name_aug == '' the path is still
    # './All_Files_.pkl'.
    df.to_pickle('./All_Files_' + name_aug + '.pkl')

    return df


In [3]:
def sub(df, path, aug, name_aug, s_ending, d_ending, t_ending):
    """Return ``df`` extended by one row per ``.wav`` file found under ``path``.

    The folder layout that is scanned is ``path/<dialect>/<speaker>/<file>.wav``.
    Entries at the dialect or speaker level that are not directories are
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose seven columns are, in order: 'dialect', 'speaker',
        'file_name', 'length', 'file_path', 'augmented', 'type'.
    path : str
        Root folder to scan.
    aug
        Value written unchanged into the 'augmented' column of every new row.
    name_aug : str
        Not used by this function; kept so existing calls keep working.
    s_ending, d_ending, t_ending : str
        Suffixes compared against the file stem (name without extension), in
        this order, to set 'type' to 'standard', 'dialect' or 'test'. Files
        matching none of them get ``None``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no audio file was found, otherwise a new frame
        holding the rows of ``df`` followed by the new rows (index renumbered
        from 0). The input frame is not modified.
    """
    # os.path.join keeps the patterns valid on every platform (the separator
    # is not hard-coded), and only directories can be dialects or speakers.
    dialect_dirs = [entry for entry in glob(os.path.join(path, '*')) if os.path.isdir(entry)]

    speaker_dirs = []
    for dialect_dir in dialect_dirs:
        speaker_dirs.extend(
            entry for entry in glob(os.path.join(dialect_dir, '*')) if os.path.isdir(entry)
        )

    audio_files = []
    for speaker_dir in speaker_dirs:
        audio_files.extend(tf.io.gfile.glob(os.path.join(speaker_dir, '*.wav')))

    records = []
    for audio in audio_files:
        audio_path = Path(audio)

        # len() of a pydub AudioSegment is its duration in milliseconds.
        duration = len(AudioSegment.from_file(audio, "wav"))

        base_name = audio_path.stem
        if base_name.endswith(s_ending):
            audio_type = 'standard'
        elif base_name.endswith(d_ending):
            audio_type = 'dialect'
        elif base_name.endswith(t_ending):
            audio_type = 'test'
        else:
            audio_type = None

        # .../<dialect>/<speaker>/<file>.wav
        records.append([
            audio_path.parent.parent.name,
            audio_path.parent.name,
            audio_path.name,
            duration,
            audio,
            aug,
            audio_type,
        ])

    if not records:
        return df

    # Build all new rows at once instead of enlarging the frame row by row.
    new_rows = pd.DataFrame(records, columns=df.columns)
    if df.empty:
        return new_rows
    return pd.concat([df, new_rows], ignore_index=True)