# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import json
import csv

import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import Counter
from pprint import pprint

from functions.functions_features import split_audio, mean_variance_normalize, process_row, pad_array, process_CNN_row

# Load data

In [2]:
# Read the combined data table from disk and display it as the cell output.
data_all_path = 'Results/Data/data_all.csv'
df_all = pd.read_csv(data_all_path)
df_all

Unnamed: 0,dataset,filepath,filename,age,gender,label,status,prob
0,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_cough-shallow,28.0,male,1,healthy,
1,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_cough-heavy,28.0,male,1,healthy,
2,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_breathing-shallow,28.0,male,0,healthy,
3,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-a,28.0,male,0,healthy,
4,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-o,28.0,male,0,healthy,
...,...,...,...,...,...,...,...,...
72335,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-4.wav,37.0,male,1,negative,
72336,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-1.wav,37.0,male,1,negative,
72337,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-5.wav,24.0,female,1,negative,
72338,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-0.wav,24.0,female,1,negative,


In [3]:
# Names of the non-feature columns, in the order they appear in every output row.
# They are later prepended to the numbered feature columns when the results
# DataFrame is assembled.
columns = (
    # where the recording comes from
    ['dataset', 'filepath', 'filename']
    # subject / annotation fields
    + ['age', 'gender', 'label', 'status']
    # per-recording and per-segment measurements
    + ['duration', 'duration_segment', 'sample_frequency', 'mean_amplitude', 'segment_shape']
)

print(columns)

['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'status', 'duration', 'duration_segment', 'sample_frequency', 'mean_amplitude', 'segment_shape']


In [4]:
# Datasets to process; each name selects a `data_summary_<name>.csv` file and
# is embedded in the name of the feature file written for it.
list_dataset_name = ['coswara', 'coughvid', 'esc50', 'fsdkaggle', 'virufy']

# Segment overlap handed to process_CNN_row. Kept at 0 to avoid overfitting
# when doing k-fold.
overlap = 0

In [5]:
# Build the CNN feature tables: for every segment length and every dataset,
# run process_CNN_row over a shuffled, label-capped subsample of the dataset's
# summary CSV and cache the result as one CSV per
# (dataset, segment_length, overlap) combination.
# Uses `columns`, `list_dataset_name` and `overlap` defined in earlier cells.

FEATURES_DIR = 'Results/Features_CNN'
os.makedirs(FEATURES_DIR, exist_ok=True)

# Fixed seed so the shuffled subsample is the same on every run (and the same
# rows are used for every segment length of a given dataset).
SAMPLE_SEED = 42
# Upper bound on the number of rows kept per `label` value.
MAX_ROWS_PER_LABEL = 1000

# segment_length -> multiplier; each output row carries multiplier * 128
# numbered feature columns after the metadata columns.
# NOTE(review): presumably multiplier x 128 is the flattened per-segment
# feature map produced by process_CNN_row -- confirm in functions_features.
dimension_dictionary = {
    1: 22,
    5: 27,
    10: 27,
}

for segment_length in [1, 5, 10]:
    # Loop-invariant per segment length: the numbered feature column names.
    n_features = dimension_dictionary[segment_length] * 128
    columns_additional = list(range(n_features))

    for dataset_name in list_dataset_name:
        print(f'\n{dataset_name} {segment_length}')

        path_save = f'{FEATURES_DIR}/data_{dataset_name}_features_{segment_length}s_{overlap}.csv'

        # Check the cache first so an already-processed combination costs
        # nothing (no CSV read, no shuffle).
        if os.path.exists(path_save):
            print(f'{path_save} already exists, skipping')
            continue

        # A dedicated name is used here so the `df_all` frame loaded and
        # displayed earlier in the notebook is not silently replaced.
        df_dataset = (
            pd.read_csv(f'Results/Data/data_summary_{dataset_name}.csv')
            .sample(frac=1, random_state=SAMPLE_SEED)  # reproducible shuffle
            .groupby('label')
            .head(MAX_ROWS_PER_LABEL)
            .reset_index(drop=True)
        )

        results_all = []
        # process_CNN_row can report None as the shape for a row, so remember
        # the last real value instead of whatever the final row returned.
        observed_shape = None
        for i in tqdm(range(len(df_dataset))):
            results, output_shape = process_CNN_row(i, df_dataset, segment_length, overlap)
            results_all.extend(results)
            if output_shape is not None:
                observed_shape = output_shape

        print(observed_shape)
        # Surface a disagreement between what the extractor reported and the
        # hard-coded table; print() is used because the notebook silences
        # the warnings module.
        if observed_shape is not None and observed_shape != n_features:
            print(
                f'WARNING: process_CNN_row reported {observed_shape} features '
                f'but {n_features} feature columns are configured for '
                f'segment_length={segment_length}'
            )

        features_df = pd.DataFrame(results_all, columns=columns + columns_additional)
        features_df.to_csv(path_save, index=False)
        print(features_df.shape)

coswara 1


100%|██████████| 2000/2000 [01:44<00:00, 19.10it/s]


2816
(17759, 2828)
coughvid 1


100%|██████████| 2000/2000 [01:40<00:00, 19.97it/s]


2816
(16693, 2828)
esc50 1


100%|██████████| 1040/1040 [00:29<00:00, 35.14it/s]


2816
(5200, 2828)
fsdkaggle 1


100%|██████████| 1273/1273 [00:46<00:00, 27.23it/s]


2816
(8486, 2828)
virufy 1


100%|██████████| 121/121 [00:01<00:00, 84.38it/s]


2816
(242, 2828)
coswara 5


100%|██████████| 2000/2000 [00:40<00:00, 48.88it/s]


3456
(4252, 3468)
coughvid 5


100%|██████████| 2000/2000 [00:25<00:00, 78.33it/s] 


3456
(3710, 3468)
esc50 5


100%|██████████| 1040/1040 [00:05<00:00, 176.63it/s]


3456
(1040, 3468)
fsdkaggle 5


100%|██████████| 1273/1273 [00:11<00:00, 108.61it/s]


3456
(2368, 3468)
virufy 5


100%|██████████| 121/121 [00:00<00:00, 199.03it/s]


3456
(121, 3468)
coswara 10


100%|██████████| 2000/2000 [00:19<00:00, 103.48it/s]


None
(2583, 3468)
coughvid 10


100%|██████████| 2000/2000 [00:20<00:00, 96.27it/s] 


3456
(2133, 3468)
esc50 10


100%|██████████| 1040/1040 [00:12<00:00, 82.99it/s]


3456
(1040, 3468)
fsdkaggle 10


100%|██████████| 1273/1273 [00:17<00:00, 70.90it/s]


3456
(1625, 3468)
virufy 10


100%|██████████| 121/121 [00:01<00:00, 107.71it/s]


3456
(121, 3468)
