In [None]:
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
import numpy as np
import librosa
import random
import time


In [None]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it up front.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))
# Configure every detected GPU; iterating (instead of indexing [0]) also keeps
# this cell from raising IndexError on a CPU-only machine.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)


## Calculate embeddings and save the DataFrame
### Parameters:

- **model_path**:  
  The path to the model used for extracting embeddings.

- **length**:  
  The desired length of each audio segment in seconds.

- **batch_size**:  
  The batch size used for processing audio segments.
  
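- **name_aug**:  
  Identifier of the augmentation run. If non-empty, only files marked as augmented are processed and the string becomes part of the output file name; if empty, only non-augmented files are processed. Ignored when `test_only` is True.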
  
- **test_only**:  
  True if the run is only for making predictions.
  
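- **label**:  
  The speaker label of the speaker under test when `test_only` is True; otherwise the dialect label whose training files are processed.

- **step_duration** (optional, default 1):  
  Offset in seconds between the starts of consecutive segments. Only used when `test_only` is True; in training mode the segments do not overlap.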

### Returns:

- **df_learn**:  
  DataFrame containing the following columns:
  - _'dialect'_: Represents the dialect class of the audio.
  - _'file_name'_: Represents the name of the audio file.
  - _'trillsson'_: Contains the embeddings calculated by the pre-trained model.
  - _'file_path'_: Represents the path of the audio file.
  - _'speaker'_: Represents the speaker associated with the audio.
  - _'samples_begin'_: Indicates the starting sample index of each segment.
  - _'samples_end'_: Indicates the last sample index (inclusive) of each segment.
  - _'class'_: Class (Type) of Audio File.
  
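  The DataFrame is saved as a pickle and as a semicolon-separated CSV file: './Data_test.pkl' / './Data_test.csv' when `test_only` is True, './Data_<name_aug>_aug.pkl' / './Data_<name_aug>_aug.csv' when `name_aug` is non-empty, and './Data_.pkl' / './Data_.csv' otherwise.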


In [None]:
_SAMPLE_RATE = 16000  # Hz; every audio file is resampled to this rate on load

_OUTPUT_COLUMNS = ['dialect', 'file_name', 'trillsson', 'file_path',
                   'speaker', 'samples_begin', 'samples_end', 'class']


def _is_selected(row, test_only, label, name_aug):
    """Return True if this row of the file index should be segmented."""
    if test_only:
        wanted_augmented = 'False'
    else:
        if row['dialect'] != label:
            return False
        # A non-empty augmentation name selects the augmented files only.
        wanted_augmented = 'True' if name_aug != '' else 'False'
    # The 'augmented' column is compared against the strings 'True'/'False'.
    return bool(row['length'] >= 0 and row['augmented'] == wanted_augmented)


def create_data(model_path, length, batch_size, name_aug, test_only, label, step_duration=1):
    """Cut audio files into fixed-length segments, embed them and save the result.

    The file index is read from './All_Files_.pkl'. Matching files are loaded
    at 16 kHz, split into segments of ``length`` seconds and passed to the
    model in batches; the value under the model output key 'embedding' is
    stored per segment.

    Args:
        model_path: Path/handle passed to ``hub.load`` for the embedding model.
        length: Segment length in seconds.
        batch_size: Number of segments passed to the model per call.
        name_aug: If non-empty (and ``test_only`` is false), only rows whose
            'augmented' value is 'True' are used and the string becomes part
            of the output file name; if empty, only 'False' rows are used.
        test_only: If true, use the non-augmented rows of type 'test' whose
            speaker equals ``label`` and cut overlapping segments.
        label: Speaker label (``test_only``) or dialect label (training).
        step_duration: Offset in seconds between consecutive segment starts.
            Only used when ``test_only`` is true; training segments are
            non-overlapping.

    Returns:
        DataFrame with the columns 'dialect', 'file_name', 'trillsson',
        'file_path', 'speaker', 'samples_begin', 'samples_end' (inclusive)
        and 'class', one row per segment. It is also written to
        './Data_test', './Data_<name_aug>_aug' or './Data_' with the
        extensions '.pkl' and '.csv' (';'-separated).

    Raises:
        ValueError: If the segment length, the batch size or (in test mode)
            the step is not positive.
    """
    # Validate before any I/O: a non-positive batch size would loop forever
    # and a zero-length segment or step cannot be iterated.
    segment_len = int(length * _SAMPLE_RATE)
    if segment_len <= 0:
        raise ValueError('length must yield at least one sample per segment')
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    if test_only:
        step = int(step_duration * _SAMPLE_RATE)
        if step <= 0:
            raise ValueError('step_duration must yield a step of at least one sample')
    else:
        # Training segments are laid out back to back.
        step = segment_len

    start_total = time.time()

    # NOTE: read_pickle executes pickled code; only use a trusted index file.
    df = pd.read_pickle('./All_Files_.pkl')

    model = hub.load(model_path)
    model.trainable = False

    if test_only:
        df = df[(df['speaker'] == label) & (df['type'] == 'test')]
    else:
        df = df[df['type'].isin(['standard', 'dialect'])]

    # Collect rows in a plain list and build the frame once; growing a
    # DataFrame row by row copies it on every insert.
    rows = []
    audio_samples = []
    for _, row in df.iterrows():
        if not _is_selected(row, test_only, label, name_aug):
            continue
        file_path = row['file_path']
        audio, _ = librosa.load(file_path, sr=_SAMPLE_RATE, dtype=np.float32)
        # Only full-length segments are kept; a trailing remainder is dropped.
        for begin in range(0, len(audio) - segment_len + 1, step):
            audio_samples.append(audio[begin:begin + segment_len])
            rows.append([row['dialect'], row['file_name'], [], file_path,
                         row['speaker'], begin, begin + segment_len - 1,
                         row['type']])

    # dtype=object keeps the column dtypes of the former row-wise construction.
    df_learn = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS, dtype=object)

    # shuffle indices of df_learn, so that embedding calculation is not sequential
    df_size = len(df_learn)
    indices = list(range(df_size))
    random.shuffle(indices)
    embeddings_list = [None] * df_size

    embed_seconds = 0.0
    print('total to calcuate: ' + str(df_size))
    for batch_start in range(0, df_size, batch_size):
        batch_indices = indices[batch_start:batch_start + batch_size]
        audios = [audio_samples[ind] for ind in batch_indices]

        # calculate actual embeddings (only the model call is timed)
        start = time.time()
        embeddings = model(audios)['embedding']
        embed_seconds += time.time() - start

        # Write each embedding back to the position of its (unshuffled) row.
        for ind, emb in zip(batch_indices, embeddings.numpy().tolist()):
            embeddings_list[ind] = emb
        # Clamp so the last, possibly shorter batch does not over-report.
        print('actual calculated: ' + str(min(batch_start + batch_size, df_size)))

    df_learn['trillsson'] = embeddings_list
    print('Time for extracting Features:', embed_seconds)

    # save embeddings as pkl and csv
    if test_only:
        out_stem = './Data_test'
    elif name_aug != '':
        out_stem = './Data_' + name_aug + '_aug'
    else:
        out_stem = './Data_'
    df_learn.to_pickle(out_stem + '.pkl')
    df_learn.to_csv(out_stem + '.csv', sep=';')

    print('Time for extracting Features in total:', time.time() - start_total)

    return df_learn
