# Load Libraries

In [9]:
import warnings
warnings.filterwarnings('ignore')

import os
import time
import joblib
import json
import csv
import pathlib
import librosa
import librosa.display
import scipy.stats
from scipy.stats import skew, kurtosis
from scipy.signal import hilbert

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from PIL import Image
from matplotlib import pyplot
from collections import Counter
from pydub import AudioSegment # sudo apt install ffmpeg
from pprint import pprint
from scipy.signal import spectrogram

%matplotlib inline

from concurrent.futures import ThreadPoolExecutor

# Load data

In [10]:
# Read the combined metadata CSV into a DataFrame. The rendered output of this
# cell shows the columns dataset, filepath, filename, age, gender, label,
# status and prob.
df_all = pd.read_csv('Results/Data/data_all.csv')
# Bare expression as the last line so the notebook renders the frame.
df_all

Unnamed: 0,dataset,filepath,filename,age,gender,label,status,prob
0,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_counting-normal,28.0,male,0,healthy,
1,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-o,28.0,male,0,healthy,
2,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-a,28.0,male,0,healthy,
3,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-e,28.0,male,0,healthy,
4,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_breathing-shallow,28.0,male,0,healthy,
...,...,...,...,...,...,...,...,...
72335,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-8.wav,37.0,male,1,negative,
72336,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-9.wav,37.0,male,1,negative,
72337,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-1.wav,24.0,female,1,negative,
72338,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-0.wav,24.0,female,1,negative,


In [11]:
# Names of the metadata columns that lead every exported feature row.
# The order is significant: it is the header order of the saved feature table.
columns = [
    # Provenance of the recording
    'dataset',
    'filepath',
    'filename',
    # Subject / annotation metadata
    'age',
    'gender',
    'label',
    'status',
    # Audio properties of the file and of the individual segment
    'duration',
    'duration_segment',
    'sample_frequency',
    'segment_shape',
]

print(columns)

['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'status', 'duration', 'duration_segment', 'sample_frequency', 'segment_shape']


In [12]:
def split_audio(
        y,  # Signal
        sr, # Sample frequency
        segment_length=10.0, # Segment length in seconds
        overlap=0 # Fraction of a segment shared with the next one (0 = no overlap)
    ):
    """Cut a signal into consecutive windows of ``segment_length`` seconds.

    Parameters
    ----------
    y : sequence
        Signal samples; anything supporting ``len`` and slicing.
    sr : int or float
        Sample frequency, used to convert ``segment_length`` to samples.
    segment_length : float, default 10.0
        Window length in seconds.
    overlap : float, default 0
        Fraction of each window that is repeated at the start of the next
        one; the hop between window starts is
        ``int(segment_samples * (1 - overlap))``.

    Returns
    -------
    list
        Slices of ``y`` in order. Trailing slices may be shorter than a full
        window; they are returned as-is (no padding happens here).

    Raises
    ------
    ValueError
        If the window length or the hop truncates to fewer than one sample.
        Without this check the start index would never advance past the end
        of the signal and the function would loop forever.
    """
    # Number of samples per segment
    segment_samples = int(segment_length * sr)
    if segment_samples <= 0:
        raise ValueError(
            f'segment_length * sr must be at least one sample, got {segment_samples}'
        )

    # Hop between the starts of two consecutive segments
    step_size = int(segment_samples * (1 - overlap))
    if step_size <= 0:
        raise ValueError(
            f'overlap={overlap} leaves a step of {step_size} samples; '
            'the step must be at least one sample'
        )

    # One slice per start position; slicing clips the last windows at len(y)
    return [
        y[start:start + segment_samples]
        for start in range(0, len(y), step_size)
    ]

In [19]:
def pad_array(array, target_length):
    """Zero-pad ``array`` at the end until it holds ``target_length`` items.

    Parameters
    ----------
    array : array-like
        Input samples. The ``(0, n)`` pad width is written for a 1-D input.
    target_length : int or float
        Desired minimum length. It is truncated to an integer first, because
        ``np.pad`` only accepts integer pad widths and callers may compute the
        target as ``seconds * sample_rate`` with a float number of seconds.

    Returns
    -------
    numpy.ndarray
        The padded array. Inputs already at or beyond ``target_length`` are
        returned with no padding added (they are never truncated).
    """
    missing = int(target_length) - len(array)
    return np.pad(array, (0, max(0, missing)), 'constant')


# Function to process each row and extract features
def process_row(i, df_all, segment_length, overlap,
                n_mfcc=13, hop_length=1024, n_fft=1024):
    """Load one audio file and return one MFCC feature row per segment.

    Parameters
    ----------
    i : int
        Position of the row in ``df_all`` (0-based, as produced by
        ``range(len(df_all))``).
    df_all : pandas.DataFrame
        Metadata table; the columns ``filepath``, ``dataset``, ``filename``,
        ``label``, ``age``, ``gender`` and ``status`` are read.
    segment_length : int or float
        Segment length in seconds, forwarded to ``split_audio``.
    overlap : float
        Overlap fraction, forwarded to ``split_audio``.
    n_mfcc, hop_length, n_fft : int
        Settings forwarded to ``librosa.feature.mfcc``. The defaults are the
        values this notebook uses (13 / 1024 / 1024); 20 / 512 / 2048 was the
        alternative configuration noted by the author.

    Returns
    -------
    list of list
        One row per segment: the metadata values, the file duration, the
        segment duration, the sample rate and the MFCC matrix shape, followed
        by the flattened MFCC values. Empty when the file has zero duration.

    Notes
    -----
    Errors from loading or feature extraction propagate to the caller.
    ``librosa.load`` is called without ``sr``, so librosa's default target
    sample rate applies.
    """
    # Positional lookup: callers pass a 0-based position, so do not depend on
    # the frame's index labels.
    filepath = df_all['filepath'].iloc[i] # Audio path
    dataset = df_all['dataset'].iloc[i] # Dataset name
    filename = df_all['filename'].iloc[i]

    label = df_all['label'].iloc[i]
    age = df_all['age'].iloc[i]
    gender = df_all['gender'].iloc[i]
    status = df_all['status'].iloc[i]

    y, sr = librosa.load(filepath, mono=True)
    duration = librosa.get_duration(y=y, sr=sr)

    results = []
    if duration == 0:
        # Nothing to segment
        return results

    # Whole number of samples per segment; np.pad rejects float pad widths,
    # which a float segment_length would otherwise produce.
    segment_samples = int(segment_length * sr)

    for segment in split_audio(y, sr, segment_length=segment_length, overlap=overlap):
        # Zero-pad short (trailing) segments so every row has the same width
        segment = pad_array(segment, segment_samples)

        duration_segment = librosa.get_duration(y=segment, sr=sr)

        mfccs = librosa.feature.mfcc(
            y=segment,
            sr=sr,
            n_mfcc=n_mfcc,
            hop_length=hop_length,
            n_fft=n_fft
        )

        segment_shape = mfccs.shape
        segment_output = list(mfccs.flatten())

        results.append([
            dataset, filepath, filename,
            age, gender, label, status, duration,
            duration_segment,
            sr, segment_shape,
        ] + segment_output)

    return results

In [20]:
# Scratch arithmetic: flattened feature count for 13 MFCCs x 216 frames
# (presumably the 10 s segment / hop_length=1024 configuration — TODO confirm).
13*216

2808

In [21]:
# Scratch arithmetic: flattened feature count for 20 MFCCs x 431 frames
# (presumably the 10 s segment / hop_length=512 configuration — TODO confirm).
20*431

8620

In [22]:
# Export one MFCC feature CSV per dataset.
list_dataset_name = [
    'coswara',
    'coughvid',
    'esc50',
    'fsdkaggle',
    'virufy',
    ]

segment_length=1 # Segment length in seconds
overlap=0 # No overlap between segments, to avoid overfitting when doing kfold

for dataset_name in list_dataset_name:
    print(f'{dataset_name}')

    path_save = f'Results/CNN_Features/MFCC/data_{dataset_name}_features_{segment_length}s_{overlap}.csv'
    # Create the output folder before the long extraction so a missing
    # directory cannot fail the save after all files were processed.
    os.makedirs(os.path.dirname(path_save), exist_ok=True)

    # NOTE(review): this rebinds the notebook-level name `df_all` on every
    # iteration; the name is kept unchanged in case other cells read it.
    df_all = pd.read_csv(f'Results/Data/data_summary_{dataset_name}.csv')

    results_all = []
    for i in tqdm(range(len(df_all))):
        results_all.extend(process_row(i, df_all, segment_length, overlap))

    # Derive the number of feature columns from the rows themselves instead of
    # a per-segment-length lookup, so any segment length or MFCC setting works.
    # If no rows were produced, only the metadata columns are written.
    n_features = len(results_all[0]) - len(columns) if results_all else 0
    columns_additional = list(range(n_features))

    results_all = pd.DataFrame(results_all, columns=columns + columns_additional)
    results_all.to_csv(path_save, index=False)

coswara


  0%|          | 0/24712 [00:00<?, ?it/s]

coughvid


  0%|          | 0/34434 [00:00<?, ?it/s]

esc50


  0%|          | 0/2000 [00:00<?, ?it/s]

fsdkaggle


  0%|          | 0/11073 [00:00<?, ?it/s]

virufy


  0%|          | 0/121 [00:00<?, ?it/s]