In [1]:
#!pip install tqdm

# 장애음성 중에서 뇌졸증(구음장애) classification
#### CI : 뇌졸중 O       (뇌경색, 뇌출혈)
#### HI : 뇌졸중 X       (루게릭, 청각장애)


In [2]:
import enum
import glob, librosa, random
import numpy as np
import pandas as pd
from typing import Union
import json
import os

from tqdm import tqdm, tqdm_notebook

import sys
path = os.path.abspath('../../')
print(path)
sys.path.append(path)
# sys.path.append('./utils')
from CommonUtil import *

/home/gil/gcubme_ai3/Workspace/SB_MUN


In [3]:
# BASE_PATH = "C:\kwoncy\\nlp\dysarthria2\data\**"
BASE_PATH = os.path.abspath('./data/origin2')
SAVE_PATH = os.path.abspath('./data/preproc2')

SAMPLE_RATE = 16000
WINDOW_SIZE = 64000
length = 5000

save_train =os.path.join(SAVE_PATH, 'train') 
save_validation = os.path.join(SAVE_PATH, 'validation')
save_test = os.path.join(SAVE_PATH, 'test')

os.makedirs(save_train, exist_ok=True)
os.makedirs(save_validation, exist_ok=True)
os.makedirs(save_test, exist_ok=True)

print(BASE_PATH)
print(SAVE_PATH)

/home/gil/gcubme_ai3/Workspace/SB_MUN/9_NIA_46_dysarthria/1_source_abstract/data/origin2
/home/gil/gcubme_ai3/Workspace/SB_MUN/9_NIA_46_dysarthria/1_source_abstract/data/preproc2


In [4]:
total_dir = f'{BASE_PATH}/*/*/*_SCO_*.wav'
total_list = sorted(glob.glob(total_dir))
print(len(total_list))


25000


In [5]:
trai_dir = f'{BASE_PATH}/train/*/*_SCO_*.wav'
vali_dir = f'{BASE_PATH}/validation/*/*_SCO_*.wav'
test_dir = f'{BASE_PATH}/test/*/*_SCO_*.wav'


trai_list = sorted(glob.glob(trai_dir))
vali_list = sorted(glob.glob(vali_dir))
test_list = sorted(glob.glob(test_dir))

print(len(trai_list))
print(len(vali_list))
print(len(test_list))


20000
2500
2500


In [6]:
def normalization(list):
    list = np.array(list)
    #df = pd.DataFrame(list)
    
    max_list = 1
    min_list = -1
    
    list = list + abs(min_list)
    
    max_list = 2
    min_list = 0
    
    list_norm = (list - min_list) / (max_list - min_list)
    
    return list_norm

In [7]:
class Patient:
    def __init__(self, wav_files, save_path:'str') -> None:
        self.save_path = save_path
        self.wav_files:'list[str]' = wav_files
        self.wav_files_length = len(self.wav_files)
#         print("self.wav_files's length: ", len(self.wav_files))
        
        self.original_wavs = None
        self.temp_wavs = None
        self.y_data_list = None
        # self.load_wavs()
    
    def load_wav(self, wav_file:str) -> np.ndarray:
        arr, sr =  librosa.load(wav_file, sr = SAMPLE_RATE)
        return arr
    
    def load_json(self, wav_path:str):
        dir = os.path.dirname(wav_path)
        filename = os.path.basename(wav_path).split('.wav')[0]
       
        json_path = f'{dir}/{filename}.json'
       
        with open(json_path, 'r') as json_file:
            json_data = json.load(json_file)
           # print(json_data['speaker']['classification'])
        
        classification = json_data['speaker']['classification']    
        if classification == 'CI' or classification =='IH':
            y = 1.0
        else:
            y = 0.0
        return y
    
    def load_json_info(self, wav_path:str):
        dir = os.path.dirname(wav_path)
        filename = os.path.basename(wav_path).split('.wav')[0]
       
        json_path = f'{dir}/{filename}.json'
       
        with open(json_path, 'r') as json_file:
            json_data = json.load(json_file)
           # print(json_data['speaker']['classification'])
        return json_data
    
    def load_wavs(self, wav_files:'list[str]'=None, n:int=None) -> 'Patient':
        ## load wav files. if input is None, then loads self.wav_files which is indicate the files made at the time this class being instance.
        ## this function will set self.loaded_wavs using input list of wav_files
        if wav_files == None:
            wav_files = self.wav_files
            
        length_wav_files = len(wav_files)
        self.original_wavs = []
        self.y_data_list = []
        
        for a in (range(len(wav_files))):
            wav_path = wav_files[a]
            wav_file = self.load_wav(wav_path)
            classification = self.load_json(wav_path)
            self.original_wavs.append(wav_file)
            self.y_data_list.append(classification)
#         print(f'successful --> {len(self.original_wavs)}, {len(self.y_data_list)}')
        
        return self
    
#============================================================================================================ 
    def padded_wav(self, audio, classification):
        if len(audio) <= WINDOW_SIZE:
            data_count = 1
            X = np.zeros((data_count, WINDOW_SIZE, ), dtype=np.float32)
            X[0, :len(audio)] = audio[:]
        elif len(audio) > WINDOW_SIZE:
            data_count = int(len(audio) // WINDOW_SIZE + 1)
            X = np.zeros((data_count, WINDOW_SIZE, ), dtype=np.float32)
            data_len = int(len(audio) / data_count)
            for a in range(data_count):
                X[a,:data_len] = audio[a*data_len:(a+1)*data_len]
        
        if classification == 0.0:
            Y = np.zeros((data_count, ))
        elif classification == 1.0:
            Y = np.ones((data_count, ))
        
        return X, Y, data_count
    
    
    def save_nparray(self, nparray:np.ndarray=None, save_name:str='') -> 'Patient':
        data_x = []
        data_y = []
        
        data_count_list = []
        for a in (range(len(self.original_wavs))):
            audio = self.original_wavs[a]
            classification = self.y_data_list[a]
            X, Y, data_count = self.padded_wav(audio, classification)
            data_x.extend(X)
            data_y.extend(Y)
            data_count_list.append(data_count)
            
        save_path_X = f"{self.save_path}/{save_name}_X.npy"
        save_path_Y = f"{self.save_path}/{save_name}_Y.npy"
        save_path_CSV = f"{self.save_path}/{save_name}.csv"
        
        data_x = normalization(data_x)
        data_y = np.array(data_y)
#         print(data_x.shape)
#         print(data_y.shape)
        
        
        np.save(save_path_X, data_x)
        np.save(save_path_Y, data_y)
        
#         print(f'successful --> {data_x.shape}, {data_y.shape}')
        
        df = pd.DataFrame({"wav_files": self.wav_files, "y_data": self.y_data_list, "data_count":data_count_list})
        df.to_csv(save_path_CSV, index=None, encoding="utf-8-sig")
        return self


# 3. 일괄 데이터셋 생성하기

In [8]:
def save_preproc(save_dir, data_list):
    
    for a in tqdm_notebook(range(len(data_list))):

        wav_files = [data_list[a]]
        save_name = os.path.basename(wav_files[0]).split('.wav')[0]
    
        data_path = f'{save_dir}/{save_name}'
            
        patient = Patient(wav_files, save_dir).load_wavs()
        patient.save_nparray(save_name=save_name)

In [9]:
save_preproc(save_train, trai_list)
save_preproc(save_validation, vali_list)
save_preproc(save_test, test_list)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for a in tqdm_notebook(range(len(data_list))):


  0%|          | 0/20000 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for a in tqdm_notebook(range(len(data_list))):


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

# data info

In [10]:
print(len(total_list), total_list[0])

25000 /home/gil/gcubme_ai3/Workspace/SB_MUN/9_NIA_46_dysarthria/1_source_abstract/data/origin2/test/HC0008/HC0008_SCO_A_1_029.wav


In [11]:
patient_list = [os.path.basename(path).split('_')[0] for path in total_list]
patient_list = np.unique(patient_list)
print(len(patient_list), patient_list[0])


1000 HC0001


In [12]:
patient_total_list = []
ckeck_list = []
for i  in tqdm_notebook(range(len(total_list))):
    wav_path = total_list[i]
    patient_id = os.path.basename(wav_path).split('_')[0]
    
    if not patient_id in ckeck_list:
        patient_total_list.append(wav_path)
        ckeck_list.append(patient_id)
print(len(patient_total_list), patient_total_list[0])


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i  in tqdm_notebook(range(len(total_list))):


  0%|          | 0/25000 [00:00<?, ?it/s]

1000 /home/gil/gcubme_ai3/Workspace/SB_MUN/9_NIA_46_dysarthria/1_source_abstract/data/origin2/test/HC0008/HC0008_SCO_A_1_029.wav


In [13]:
def save_preproc(save_dir, data_list):
    
    json_list = []
    for a in tqdm_notebook(range(len(data_list))):
        wav_path = data_list[a]
        wav_files = [wav_path]
        save_name = os.path.basename(wav_files[0]).split('.wav')[0]
    
        data_path = f'{save_dir}/{save_name}'
            
        patient_json = Patient(wav_files, save_dir).load_json_info(wav_path)
        json_list.append(patient_json)
    print(f'total_json : {len(json_list)}')
    return json_list

In [14]:
patient_total_json_list = save_preproc('./test', patient_total_list)
total_json_list = save_preproc('./test', total_list)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for a in tqdm_notebook(range(len(data_list))):


  0%|          | 0/1000 [00:00<?, ?it/s]

total_json : 1000


  0%|          | 0/25000 [00:00<?, ?it/s]

total_json : 25000


In [15]:
def return_data_info(json_list):
    category_list = []
    gender_list = []
    age_list = []
    education_list = []
    hospital_list = []
    classification_list = []
    intelligibility_list = []
    degree_list = []
    diagnostics_list = []
    education_list = []
    recordingTime_list = []
    for i  in tqdm_notebook(range(len(json_list))):
        category = json_list[i]['dataset']['category']
        gender = json_list[i]['speaker']['gender']
        age = json_list[i]['speaker']['age']
        education = json_list[i]['speaker']['education']
        hospital = json_list[i]['speaker']['hospital']
        classification = json_list[i]['speaker']['classification']
        intelligibility = json_list[i]['speaker']['intelligibility']
        degree = json_list[i]['speaker']['degree']
        diagnostics = json_list[i]['speaker']['diagnostics']
        recordingTime = json_list[i]['dataset']['recordingTime']
        
        category_list.append(category)
        gender_list.append(gender)
        age_list.append(age)
        education_list.append(education)
        hospital_list.append(hospital)
        classification_list.append(classification)
        intelligibility_list.append(intelligibility)
        degree_list.append(degree)
        diagnostics_list.append(diagnostics)
        recordingTime_list.append(recordingTime)
    data_info = {
        'category_list':category_list,
        'gender_list':gender_list,
        'age_list':age_list,
        'education_list':education_list,
        'hospital_list':hospital_list,
        'classification_list':classification_list,
        'intelligibility_list':intelligibility_list,
        'degree_list':degree_list,
        'diagnostics_list':diagnostics_list,
        'education_list':education_list,
        'recordingTime_list':recordingTime_list
    }
    return data_info

In [16]:
patient_data_info = return_data_info(patient_total_json_list)
data_info = return_data_info(total_json_list)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i  in tqdm_notebook(range(len(json_list))):


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [17]:
def check_data(info_list, key):
    data_list = info_list[key]
    data_index = np.unique(data_list)
    data_cnt = []
    for idx in data_index:
        data_cnt.append(data_list.count(idx))
    total_cnt = np.sum(data_cnt)

    print(total_cnt, data_index)
    for i, idx in enumerate(data_index):
        print(f'{idx}\t: {data_cnt[i]} ({data_cnt[i]/total_cnt*100:.2f}%)')
        

# category_list

In [18]:
check_data(data_info, 'category_list')

25000 ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M']
A	: 2841 (11.36%)
B	: 2490 (9.96%)
C	: 2097 (8.39%)
D	: 2371 (9.48%)
E	: 1865 (7.46%)
F	: 1528 (6.11%)
G	: 762 (3.05%)
H	: 2260 (9.04%)
I	: 776 (3.10%)
J	: 2147 (8.59%)
K	: 2110 (8.44%)
L	: 1849 (7.40%)
M	: 1904 (7.62%)


# Gender

In [19]:
check_data(patient_data_info, 'gender_list')

1000 ['F' 'M']
F	: 523 (52.30%)
M	: 477 (47.70%)


# Age

In [20]:
check_data(patient_data_info, 'age_list')

1000 ['10~20대' '30~40대' '50~60대' '70대 이상']
10~20대	: 325 (32.50%)
30~40대	: 300 (30.00%)
50~60대	: 236 (23.60%)
70대 이상	: 139 (13.90%)


# education_list

In [117]:
check_data(patient_data_info, 'education_list')

1000 ['E1' 'E2' 'E3' 'E4' 'E5' 'E6']
E1	: 99 (9.90%)
E2	: 82 (8.20%)
E3	: 348 (34.80%)
E4	: 424 (42.40%)
E5	: 43 (4.30%)
E6	: 4 (0.40%)


# hospital_list

In [142]:
check_data(data_info, 'hospital_list')

25000 ['HC' 'HH' 'HK' 'HL' 'HM' 'HS']
HC	: 3025 (12.10%)
HH	: 1150 (4.60%)
HK	: 475 (1.90%)
HL	: 15875 (63.50%)
HM	: 450 (1.80%)
HS	: 4025 (16.10%)


# classification_list

In [114]:
check_data(data_info, 'classification_list')

25000 ['ALS' 'CI' 'HI' 'IH']
ALS	: 25 (0.10%)
CI	: 9025 (36.10%)
HI	: 15875 (63.50%)
IH	: 75 (0.30%)


# intelligibility_list

In [113]:
check_data(data_info, 'intelligibility_list')

25000 ['SIR1' 'SIR2' 'SIR3' 'SIR4' 'SIR5']
SIR1	: 1475 (5.90%)
SIR2	: 1960 (7.84%)
SIR3	: 3745 (14.98%)
SIR4	: 8290 (33.16%)
SIR5	: 9530 (38.12%)


# degree_list

In [120]:
check_data(data_info, 'degree_list')

25000 ['MILD' 'MODERATE' 'MODERATELY' 'PROFOUND' 'SEVERE']
MILD	: 7874 (31.50%)
MODERATE	: 2613 (10.45%)
MODERATELY	: 1525 (6.10%)
PROFOUND	: 8775 (35.10%)
SEVERE	: 4213 (16.85%)


# diagnostics_list

In [132]:
diagnostics_list = patient_data_info['diagnostics_list']
diagnostics_list = [int(date) for date in diagnostics_list if date != None]
print(np.min(diagnostics_list))
print(np.max(diagnostics_list))
# print(sorted(diagnostics_list))


19891220
20221224


# recordingTime_list

In [143]:
recordingTime_list = patient_data_info['recordingTime_list']
print(round(np.min(recordingTime_list),2))
print(round(np.max(recordingTime_list),2))
# print(sorted(diagnostics_list))
print(round(np.mean(recordingTime_list),2))
print(round(np.std(recordingTime_list),2))


1.12
18.43
5.24
2.17


In [84]:
data_info.keys()

dict_keys(['gender_list', 'age_list', 'education_list', 'hospital_list', 'classification_list', 'intelligibility_list', 'degree_list', 'diagnostics_list', 'recordingTime_list'])

In [27]:
total_json_list[0]

{'dataset': {'filename': 'HC0008_SCO_A_1_029',
  'speakerID': 'HC0008',
  'sentenceType': 'SCO',
  'category': 'A',
  'numberOfRecordings': '1',
  'sentenceID': '029',
  'recordingSystem': 'CON',
  'recordingQuality': '16000Hz',
  'recordingDate': '20220901',
  'recordingTime': 6.202,
  'recordingDevice': 'SM'},
 'speaker': {'gender': 'M',
  'age': '50~60대',
  'education': 'E3',
  'hospital': 'HC',
  'classification': 'CI',
  'intelligibility': 'SIR5',
  'degree': 'MILD',
  'diagnostics': '20220901',
  'device': None,
  'deviceUsedAge': None,
  'comunicationTool': None,
  'rehabilitation': None,
  'hearingLoss': None},
 'annotations': {'script': '코클리어 엔 세븐 블루투스 연결 방법 알려 줘.',
  'form': '코클리어 엔 (쎄븐)/(세븐) 블루투스 연결 방법 알려 줘.',
  'pronunciationForm': '코클리어 엔 쎄븐 블루투스 연결 방법 알려 줘.',
  'spellingForm': '코클리어 엔 세븐 블루투스 연결 방법 알려 줘.',
  'start': 0.0,
  'end': 6.202}}