In [1]:
import numpy as np 
import pandas as pd 
import pickle

from sklearn.preprocessing import LabelEncoder 
from sklearn.multioutput import MultiOutputClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import soundfile as sf
import librosa 



# **SOME DEFAULT FUNCTIONS**


In [2]:
def convert_probability_to_1_0_score(y_pred_proba, threshold):
    b, m, n = np.array(y_pred_proba).shape
    y_pred = np.zeros((m, n))
    for i in range(m):
        for j in range(n):
            if y_pred_proba[j][i][1] >= threshold:
                y_pred[i][j] = 1
    return y_pred

In [3]:
def convert_categorical_data_to_numerical(df, categorical_columns):
    for column in categorical_columns:
        number = LabelEncoder()
        df[column] = number.fit_transform(df[column].astype('str')).astype(float)
        le_name_mapping = dict(zip(number.classes_, number.transform(number.classes_)))
        print(column)
        print(le_name_mapping)

In [4]:
def convert_audio_to_array(data, sample_rate):
    stft = np.abs(librosa.stft(data)) 
    mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=128),axis=1) 
    melspectrogram = np.mean(librosa.feature.melspectrogram(S=stft, sr=sample_rate),axis=1)
    return np.concatenate((mfccs,melspectrogram))

# **GLOBAL VARIABLES**

In [5]:
PATH_TO_AUDIO_FILES = '/kaggle/input/notebookd311de3272/processed_audio_files/'
ADDITIONAL_COLUMNS = ['adult_bmi', 'sex', 'age']
CATEGORICAL_COLUMNS = ['mode', 'location', 'sex']
ADDITIONAL_FEATURES = ['adult_bmi', 'mode', 'location', 'sex', 'age']
PATH_TO_AUDIO_TRAIN_DF = '/kaggle/input/notebookd311de3272/files_info.csv'
PATH_TO_PATIENT_INFO_DF = '/kaggle/input/notebookd311de3272/train_data.csv'
TEST_SIZE = 0.15
MAX_ITERATIONS = 200
THRESHOLD = 0.15
FILE_NAME_TO_SAVE_MODEL = "crackles_wheezels_model.pkl"

# **READING DATASETS**

In [6]:
audio_train_df = pd.read_csv(PATH_TO_AUDIO_TRAIN_DF)
audio_train_df

Unnamed: 0.1,Unnamed: 0,start,end,crackles,weezels,pid,mode,location,equipment,filename,filename_new
0,0,0.036,2.436,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_0.wav
1,1,2.436,5.250,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_1.wav
2,2,5.250,8.422,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_2.wav
3,3,8.422,11.222,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_3.wav
4,4,11.222,13.807,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_4.wav
...,...,...,...,...,...,...,...,...,...,...,...
6893,6893,11.928,17.049,0,0,162,mc,Pr,AKGC417L,162_2b4_Pr_mc_AKGC417L,162_2b4_Pr_mc_AKGC417L_2.wav
6894,6894,17.049,18.729,0,0,162,mc,Pr,AKGC417L,162_2b4_Pr_mc_AKGC417L,162_2b4_Pr_mc_AKGC417L_3.wav
6895,6895,2.145,7.788,0,0,200,mc,Ar,AKGC417L,200_2p3_Ar_mc_AKGC417L,200_2p3_Ar_mc_AKGC417L_0.wav
6896,6896,7.788,13.881,0,0,200,mc,Ar,AKGC417L,200_2p3_Ar_mc_AKGC417L,200_2p3_Ar_mc_AKGC417L_1.wav


In [7]:
patient_info_df = pd.read_csv(PATH_TO_PATIENT_INFO_DF)
patient_info_df

Unnamed: 0.1,Unnamed: 0,pid,is_Al,Al_crackles,Al_wheezes,is_Ar,Ar_crackles,Ar_wheezes,is_Pl,Pl_crackles,...,Lr_wheezes,is_Tc,Tc_crackles,Tc_wheezes,age,sex,adult_bmi,child_weight,child_height,disease
0,0,101,True,0.0,0.0,False,0.0,0.000000,False,0.000000,...,0.0,False,0.0,0.0,3.00,F,0.00,19.0,99.0,URTI
1,1,102,False,0.0,0.0,True,0.0,0.000000,False,0.000000,...,0.0,False,0.0,0.0,0.75,F,0.00,9.8,73.0,Healthy
2,2,103,False,0.0,0.0,True,0.0,0.666667,False,0.000000,...,0.0,False,0.0,0.0,70.00,F,33.00,0.0,0.0,Asthma
3,3,104,True,0.0,0.0,True,0.0,0.714286,True,0.000000,...,0.0,False,0.0,0.0,70.00,F,28.47,0.0,0.0,COPD
4,4,105,False,0.0,0.0,False,0.0,0.000000,False,0.000000,...,0.0,True,0.0,0.0,7.00,F,0.00,32.0,135.0,URTI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,121,222,False,0.0,0.0,True,0.0,0.000000,False,0.000000,...,0.0,False,0.0,0.0,60.00,M,0.00,0.0,0.0,COPD
122,122,223,True,0.0,0.0,True,0.0,1.000000,True,0.000000,...,0.0,False,0.0,0.0,0.00,0,0.00,0.0,0.0,COPD
123,123,224,True,0.0,0.0,False,0.0,0.000000,False,0.000000,...,0.0,True,0.0,0.0,10.00,F,0.00,32.3,143.0,Healthy
124,124,225,False,0.0,0.0,False,0.0,0.000000,True,0.000000,...,0.0,False,0.0,0.0,0.83,M,0.00,7.8,74.0,Healthy


# **JOINING TWO DATAFRAMES ON PATIENT ID AND CONVERTING CATEGORICAL VALUES**

In [8]:
train_on_audio_df = audio_train_df.join(patient_info_df.set_index('pid')[ADDITIONAL_COLUMNS], on='pid')
convert_categorical_data_to_numerical(train_on_audio_df, CATEGORICAL_COLUMNS)
train_on_audio_df

mode
{'mc': 0, 'sc': 1}
location
{'Al': 0, 'Ar': 1, 'Ll': 2, 'Lr': 3, 'Pl': 4, 'Pr': 5, 'Tc': 6}
sex
{'0': 0, 'F': 1, 'M': 2}


Unnamed: 0.1,Unnamed: 0,start,end,crackles,weezels,pid,mode,location,equipment,filename,filename_new,adult_bmi,sex,age
0,0,0.036,2.436,0,0,168,1.0,0.0,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_0.wav,17.35,1.0,19.0
1,1,2.436,5.250,0,0,168,1.0,0.0,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_1.wav,17.35,1.0,19.0
2,2,5.250,8.422,0,0,168,1.0,0.0,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_2.wav,17.35,1.0,19.0
3,3,8.422,11.222,0,0,168,1.0,0.0,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_3.wav,17.35,1.0,19.0
4,4,11.222,13.807,0,0,168,1.0,0.0,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_4.wav,17.35,1.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6893,6893,11.928,17.049,0,0,162,0.0,5.0,AKGC417L,162_2b4_Pr_mc_AKGC417L,162_2b4_Pr_mc_AKGC417L_2.wav,24.90,1.0,67.0
6894,6894,17.049,18.729,0,0,162,0.0,5.0,AKGC417L,162_2b4_Pr_mc_AKGC417L,162_2b4_Pr_mc_AKGC417L_3.wav,24.90,1.0,67.0
6895,6895,2.145,7.788,0,0,200,0.0,1.0,AKGC417L,200_2p3_Ar_mc_AKGC417L,200_2p3_Ar_mc_AKGC417L_0.wav,27.80,1.0,72.0
6896,6896,7.788,13.881,0,0,200,0.0,1.0,AKGC417L,200_2p3_Ar_mc_AKGC417L,200_2p3_Ar_mc_AKGC417L_1.wav,27.80,1.0,72.0


# **PREPROCESSING AUDIO DATA AND SPLITTING INTO INPUT DATA AND LABELS**

In [9]:
labels = []
input_data = []

for idx,row in train_on_audio_df.iterrows():
    array_from_audio = convert_audio_to_array(*sf.read(f'{PATH_TO_AUDIO_FILES}{row.filename_new}'))
    input_data.append(np.append(array_from_audio, row[ADDITIONAL_FEATURES])) 
    labels.append(list((row['crackles'], row['weezels'])))

# **SPLITTING DATA INTO TRAIN AND TEST**

In [10]:
X_train, X_test, y_train, y_test = train_test_split(input_data, labels, test_size=TEST_SIZE, random_state=42)

# **TRAINING MULTI OUTPUT HIST GRADIENT BOOSTING**

In [11]:
clf = MultiOutputClassifier(HistGradientBoostingClassifier(max_iter=MAX_ITERATIONS, max_leaf_nodes=None)).fit(np.array(X_train), np.array(y_train))

# **CONVERTING PROBABILITY TO 0 AND 1 SCORES**

In [12]:
y_pred = convert_probability_to_1_0_score(clf.predict_proba(np.array(X_test)), THRESHOLD)

# **BUILDING MULTI LABEL CONFUSION MATRIX**

In [13]:
cf_matrix = multilabel_confusion_matrix(np.array(y_pred), np.array(y_test))
print(cf_matrix)

[[[608  61]
  [ 71 295]]

 [[813  59]
  [ 27 136]]]


# **CALCULATING AVERAGE ACCURACY**

In [14]:
average_accuracy = []
for i in cf_matrix:
    average_accuracy.append((i[1][1]+i[0][0])/(i[1][1]+i[0][0]+i[0][1]+i[1][0]))

print(sum(average_accuracy)/len(average_accuracy))

0.8946859903381643


# **CALCULATING CLASSIFICATION REPORT**

In [15]:
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8060    0.8287    0.8172       356
           1     0.8344    0.6974    0.7598       195

   micro avg     0.8147    0.7822    0.7981       551
   macro avg     0.8202    0.7630    0.7885       551
weighted avg     0.8160    0.7822    0.7969       551
 samples avg     0.3589    0.3565    0.3536       551



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **MODEL SAVING**

In [16]:
with open(FILE_NAME_TO_SAVE_MODEL, 'wb') as file:
    pickle.dump(clf, file)