This notebook to keep a benchmarking result of Table 7 on Covid-19 and Abnormal Detection task, include Unsupervised(Isolation Forest, XGBOD) at last this notebook.

+ Use Coswara dataset ```dataset_type='Coswara'```:
+ Use FRILL pretrain model to extract Feature ```PRETRAIN="FRILL"```
+ Use SVM method to classify
+ Set seed 1111 ```seed=1111```

# Library

In [1]:
import warnings
from glob import glob

warnings.filterwarnings("ignore")

import os, time, math, random, cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile, pickle, h5py, joblib, json
import multiprocessing

import librosa
import opensmile
import xgboost as xgb
# import tensorflow_hub as hub

from math import pi
from tqdm import tqdm
from pathlib import Path
from functools import partial
from scipy.fftpack import fft, hilbert
from scipy.spatial.distance import cdist
from scipy.spatial import cKDTree
from sklearn.svm import LinearSVC
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, auc, precision_recall_curve, roc_curve, average_precision_score

SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    


# Configuration

In [2]:
#sample rate
SR = 44100
#100 ms
FRAME_LEN = int(SR / 10)
#50% overlap, meaning 5ms hop length
HOP = int(FRAME_LEN / 2)
#the MFCC dimension
MFCC_dim = 13
PRETRAIN = 'FRILL'
codebook_size = 1000

fold_num = 5
seed = 1111

dataset_type = 'Coswara'

if dataset_type == 'SoundDr':
    PERIOD = 15
    SR = 48000
    DIR_DATA = "./sounddr_data/"

    df = pd.read_csv(DIR_DATA + 'data.csv')
    df['file_path'] = df['file_cough'] + '.wav'

    df['label_symptom'] = (df['symptoms_status_choice'].map(str) != "['No']").astype(int)
    df['label_abnormal'] = ((df['symptoms_status_choice'].map(str) != "['No']") | (df['cov19_status_choice'] != 'never')).astype(int)
    df['label_covid'] = (df['cov19_status_choice'] != 'never').astype(int)
elif dataset_type == 'CoughVid':
    PERIOD = 10
    SR = 22050

    DIR_DATA = './coughvid_data/'

    VidData   = pd.read_csv(os.path.join(DIR_DATA, 'public_dataset/metadata_compiled.csv'), header=0)
    VidData   = VidData.loc[VidData['cough_detected'] >= 0.9][['uuid','fever_muscle_pain','respiratory_condition','status', 'quality_1', 'age', 'gender']]
    VidData.dropna(subset=['uuid','fever_muscle_pain','respiratory_condition','status'], inplace=True)
    VidData = VidData[(VidData['quality_1'] != 'no_cough') & (VidData['quality_1'] != 'poor')]
    VidData = VidData[(VidData['status'] != 'symptomatic') & (VidData['status'].notna())]
    VidData['label_covid'] = (VidData['status'] == 'COVID-19').astype(int)

    extradata = VidData.loc[VidData['status']=='COVID-19']
    notradata = VidData.loc[VidData['status']!='COVID-19']

    df = pd.concat([extradata, notradata], ignore_index= True)
    df['file_path'] = df['uuid'].apply(lambda x: 'public_dataset/' + x + '.webm')
    def g(x):
        for i in x:
            if i is True:
                return 1
        return 0
    df['label_abnormal'] = df[['fever_muscle_pain', 'respiratory_condition', 'label_covid']].apply(lambda x: g(x), axis=1)
else:
    PERIOD = 5
    SR = 44100
    DIR_DATA = "./coswara_data/"

    join_by = pd.read_csv(os.path.join(DIR_DATA, 'combined_data.csv'))
    df_list = []
    for each in os.listdir(DIR_DATA):
        for path in tqdm(glob(DIR_DATA + each + '/*/cough-shallow.wav')):
            temp = pd.DataFrame(columns=['id', 'DIR'])
            temp['id'] = [path.split('/')[-2]]
            temp['DIR'] = [path]
            temp = pd.merge(left=temp,right=join_by,on='id',how='inner')

            temp['label_cough'] = (temp['cough'] == True).astype(int)

            temp['file_path'] = each + '/' + temp['id'] + '/cough-shallow.wav'
            temp['label_covid'] = temp['covid_status'].apply(lambda x: 1 if x == 'positive_mild' or x =='positive_moderate' or x == 'COVID-19' else 0)
            df_list.append(temp)
    df = pd.concat(df_list)
    def g(x):
        for i in x:
            if i is True:
                return 1
        return 0
    df['label_abnormal'] = df[['st', 'bd', 'cld', 'pneumonia', 'others_resp', 'asthma', 'label_covid']].apply(lambda x: g(x), axis=1)

target_col = 'label_abnormal'
OUTPUT_DIR = DIR_DATA + 'output/'
os.makedirs(OUTPUT_DIR, exist_ok = True)

df.tail()

100%|██████████| 32/32 [00:00<00:00, 161.02it/s]
100%|██████████| 64/64 [00:00<00:00, 161.89it/s]
0it [00:00, ?it/s]
100%|██████████| 81/81 [00:00<00:00, 161.78it/s]
100%|██████████| 196/196 [00:01<00:00, 152.76it/s]
100%|██████████| 66/66 [00:00<00:00, 164.74it/s]
100%|██████████| 20/20 [00:00<00:00, 152.50it/s]
100%|██████████| 23/23 [00:00<00:00, 161.40it/s]
100%|██████████| 168/168 [00:01<00:00, 163.79it/s]
100%|██████████| 82/82 [00:00<00:00, 157.81it/s]
100%|██████████| 18/18 [00:00<00:00, 149.02it/s]
100%|██████████| 16/16 [00:00<00:00, 153.84it/s]
100%|██████████| 32/32 [00:00<00:00, 157.66it/s]
100%|██████████| 54/54 [00:00<00:00, 163.61it/s]
100%|██████████| 17/17 [00:00<00:00, 162.67it/s]
100%|██████████| 56/56 [00:00<00:00, 161.37it/s]
100%|██████████| 42/42 [00:00<00:00, 156.63it/s]
100%|██████████| 37/37 [00:00<00:00, 159.09it/s]
100%|██████████| 19/19 [00:00<00:00, 157.03it/s]
0it [00:00, ?it/s]
100%|██████████| 42/42 [00:00<00:00, 143.92it/s]
100%|██████████| 76/76 [00:

Unnamed: 0,id,DIR,a,covid_status,record_date,ep,g,l_c,l_l,l_s,...,st,ihd,asthma,others_preexist,cld,pneumonia,label_cough,file_path,label_covid,label_abnormal
0,d7w3B2YcJ3TLx58ryhiASEtwaAu1,./coswara_data/20210930/d7w3B2YcJ3TLx58ryhiASE...,72,positive_moderate,2021-09-29,y,male,India,Coimbatore,Tamil Nadu,...,True,,,,,,1,20210930/d7w3B2YcJ3TLx58ryhiASEtwaAu1/cough-sh...,1,1
0,JBt2sizAUqdzxaoHx2jCMl9ky1H2,./coswara_data/20210930/JBt2sizAUqdzxaoHx2jCMl...,73,healthy,2021-09-23,n,female,India,Coimbatore,Tamil Nadu,...,,,,,,,0,20210930/JBt2sizAUqdzxaoHx2jCMl9ky1H2/cough-sh...,0,0
0,0Js6ZUZQ9NUnu568Fh7B6mZ1R8o1,./coswara_data/20210930/0Js6ZUZQ9NUnu568Fh7B6m...,50,positive_moderate,2021-09-24,y,female,India,Coimbatore,Tamil Nadu,...,,,,,,,0,20210930/0Js6ZUZQ9NUnu568Fh7B6mZ1R8o1/cough-sh...,1,0
0,wRIACcPu2dS0xiHj6O7O4eJXKT53,./coswara_data/20210930/wRIACcPu2dS0xiHj6O7O4e...,31,positive_mild,2021-09-23,y,female,India,Coimbatore,Tamil Nadu,...,True,,,,,,1,20210930/wRIACcPu2dS0xiHj6O7O4eJXKT53/cough-sh...,1,1
0,8Ul16g4L9nP9lcp16BKh1X6cfhb2,./coswara_data/20210930/8Ul16g4L9nP9lcp16BKh1X...,46,positive_moderate,2021-09-25,y,female,India,Coimbatore,Tamil Nadu,...,,,,,,,0,20210930/8Ul16g4L9nP9lcp16BKh1X6cfhb2/cough-sh...,1,0


In [3]:
def get_duration(filename, mono=True, res_type="kaiser_fast"):
    duration = 0
    sr = SR
    try:
        y, sr = librosa.load(filename, sr=None, mono=mono, res_type=res_type)
        duration = librosa.get_duration(y=y, sr=sr)
    except:
        print('Error file:' + filename)
    return duration, sr

In [13]:
df[target_col].value_counts()

0    1841
1     391
Name: label_abnormal, dtype: int64

# Utils

In [14]:
def crop_or_pad(y, length):
    if len(y) < length:
        y = np.concatenate([y, np.zeros(length-len(y))])
    elif len(y) > length:
        cut = random.randint(0, len(y) - length)
        y = y[cut:cut+length]
    return y

def merge_feature(list_features):
    """
      Merge numpy array features
      Args:
        - list_features: list of numpy array features
                         :type: a list of numpy arrays 
      Returns:
        - features: the concatenate numpy array along axis=1
                    :type: a numpy array                 
    """      
    features = np.concatenate(list_features, axis=1)
    features = np.nan_to_num(features)
    features = np.clip(features, -np.finfo(np.float32).max, np.finfo(np.float32).max)
    return features

In [15]:
def compute_metrics(cfs_matrix):
    """
      Calculate common metrics based on the confusion matrix
      Args:
        - cfs_matrix: a sklearn confusion matrix 
                      :type: a ndarray of shape (n_classes, n_classes)
      Returns:
        - precision: the precision of the prediction
                     :type: float  
        - recall: the recall of the prediction
                  :type: float  
        - f1: the f1-score of the prediction
              :type: float                       
    """     
    precision = cfs_matrix[1,1] / (cfs_matrix[1,1] + cfs_matrix[0,1])
    recall = cfs_matrix[1,1] / (cfs_matrix[1,1] + cfs_matrix[1,0])
    f1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1

# Extract Audio Features

In [16]:
if not os.path.exists(OUTPUT_DIR + PRETRAIN + '.pickle'):
    import tensorflow.compat.v2 as tf
    tf.enable_v2_behavior()
    import tensorflow_hub as hub

    frill_nofrontend_model = hub.load('https://tfhub.dev/google/nonsemantic-speech-benchmark/frill-nofrontend/1')

    def stabilized_log(data, additive_offset, floor):
      """TF version of mfcc_mel.StabilizedLog."""
      return tf.math.log(tf.math.maximum(data, floor) + additive_offset)


    def log_mel_spectrogram(data,
                            audio_sample_rate,
                            num_mel_bins=64,
                            log_additive_offset=0.001,
                            log_floor=1e-12,
                            window_length_secs=0.025,
                            hop_length_secs=0.010,
                            fft_length=None):
        """TF version of mfcc_mel.LogMelSpectrogram."""
        window_length_samples = int(round(audio_sample_rate * window_length_secs))
        hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
        if not fft_length:
            fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))

        spectrogram = tf.abs(
            tf.signal.stft(
                tf.cast(data, tf.dtypes.float64),
                frame_length=window_length_samples,
                frame_step=hop_length_samples,
                fft_length=fft_length,
                window_fn=tf.signal.hann_window,
            )
        )

        to_mel = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=num_mel_bins,
            num_spectrogram_bins=fft_length // 2 + 1,
            sample_rate=audio_sample_rate,
            lower_edge_hertz=125.0,
            upper_edge_hertz=7500.0,
            dtype=tf.dtypes.float64
        )

        mel = spectrogram @ to_mel
        log_mel = stabilized_log(mel, log_additive_offset, log_floor)
        return log_mel

    def compute_frontend_features(samples, sr, frame_hop, n_required=16000, num_mel_bins=64, frame_width=96):
        if samples.dtype == np.int16:
            samples = tf.cast(samples, np.float32) / np.iinfo(np.int16).max
        if samples.dtype == np.float64:
            samples = tf.cast(samples, np.float32)
        assert samples.dtype == np.float32, samples.dtype
        n = tf.size(samples)
        samples = tf.cond(
            n < n_required,
            lambda: tf.pad(samples, [(0, n_required - n)]),
            lambda: samples
        )
        mel = log_mel_spectrogram(samples, sr, num_mel_bins=num_mel_bins)
        mel = tf.signal.frame(mel, frame_length=frame_width, frame_step=frame_hop, axis=0)
        return mel

    def make_nonsemantic_frill_nofrontend_feat(filename):
        try:
            waveform, _ = librosa.load(os.path.join(DIR_DATA, filename), sr=16000, mono=True, res_type="kaiser_fast")
            if 2048 > waveform.shape[-1]:
                print('File length < 2048')
                return None, filename
            frontend_feats = tf.expand_dims(compute_frontend_features(waveform, 16000, frame_hop=17), axis=-1).numpy().astype(np.float32)
            assert frontend_feats.shape[1:] == (96, 64, 1)

            embeddings = frill_nofrontend_model(frontend_feats)['embedding']
            mean_emb = embeddings.numpy().mean(axis=0)
            std_emb = embeddings.numpy().std(axis=0)
        except Exception as e:
            print('Error: ' + str(e))
            return None, filename
        return np.concatenate((mean_emb, std_emb)), filename

# Extract Features

In [17]:
def get_features_of_list_audio(df):
    X_features = []
    df['error'] = 0
    # extract train data features
#     pool = multiprocessing.Pool(multiprocessing.cpu_count())
#     with tqdm(total=len(df.values)) as t:
#         for feature, filename in pool.imap(make_nonsemantic_trill_feat, df['file_path'].values):
#             if feature is None:
#                 df['error'][df['file_path'] == filename] = 1
#             else:
#                 X_features.append(feature)
#             t.update(1)
#     pool.close()
#     pool.join()

    for idx, r in tqdm(df.iterrows(), total=len(df)):
        if PRETRAIN == 'TRILL':
            feature, filename = make_nonsemantic_trill_feat(r['file_path'])
        elif PRETRAIN == 'OpenSmileBoAW':
            feature, filename = make_opensmileboaw_feat(r['file_path'])
        elif PRETRAIN == 'OpenSmile':
            feature, filename = make_opensmile_feat(r['file_path'])
        elif PRETRAIN == 'DeepSpectrum':
            feature, filename = make_deepspect_feat(r['file_path'])
        else:
            feature, filename = make_nonsemantic_frill_nofrontend_feat(r['file_path'])
        
        if feature is None:
            df['error'][df['file_path'] == filename] = 1
        else:
            X_features.append(feature)
    return np.array(X_features), df

In [18]:
if not os.path.exists(OUTPUT_DIR + PRETRAIN + '.pickle'):
    X_features, df = get_features_of_list_audio(df)
    df.to_csv(os.path.join(OUTPUT_DIR, 'data.csv'), index=False)
    pickle.dump({
        'X_trill_features': X_features
    }, open(OUTPUT_DIR + PRETRAIN + '.pickle', "wb" ))
else:
    df = pd.read_csv(os.path.join(OUTPUT_DIR, 'data.csv'))
    f = pickle.load(open(OUTPUT_DIR + PRETRAIN + '.pickle', "rb" ))
    X_features = f['X_trill_features']
df = df[df['error'] == 0].reset_index(drop=True)

# if PRETRAIN == 'OpenSmile':
scaler = StandardScaler()
X = scaler.fit_transform(merge_feature([X_features]))
# else:
# X = merge_feature([X_features])
print(f"Data feature shape: {X.shape, len(df)}")

Data feature shape: ((2201, 4096), 2201)


# Functions

In [19]:
def evaluate(ensem_preds, targets):
    """
      Evaluate the prediction by providing metrics & also the best threshold (to get the highest f1-score)
      Ex: AUC, Accurary, Precision, Recall, F1-Score.
      Then print these metrics
      Args:
        - ensem_preds: predictions for ids 
                       :type: a numpy array
        - targets: the actual results of ids 
                   :type: a numpy array                 
      Returns:
        - None                  
    """     
    best_th = 0
    best_score = 0

    for th in np.arange(0.0, 0.6, 0.01):
        pred = (ensem_preds > th).astype(int)
        score = f1_score(targets, pred)
        if score > best_score:
            best_th = th
            best_score = score

    print(f"\nAUC score: {roc_auc_score(targets, ensem_preds):12.4f}")
    print(f"Best threshold {best_th:12.4f}")

    preds = (ensem_preds > best_th).astype(int)

    cm1 = confusion_matrix(targets, preds)
    print('\nConfusion Matrix : \n', cm1)
    precision, recall, f1 = compute_metrics(cm1)
    
    print('\n=============')
    print (f'Precision    : {precision:12.4f}')
    
    print(f'Recall : {recall:12.4f}')
    
    print(f'F1 Score : {f1:12.4f}')
    
    total1=sum(sum(cm1))

    print('\n=============')
    accuracy1=(cm1[0,0]+cm1[1,1])/total1
    print (f'Accuracy    : {accuracy1:12.4f}')

def get_model(pos_scale, c=1):
    model = LinearSVC(C=c, class_weight='balanced', random_state=seed)
    return model

# Train COVID-19

In [20]:
 if not os.path.exists(OUTPUT_DIR + 'df_label_covid_5fold.csv'):
    folds = df.copy()
    Fold = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    for n, (train_index, val_index) in enumerate(Fold.split(folds, folds['label_covid'])):
        folds.loc[val_index, 'fold'] = int(n)
    folds['fold'] = folds['fold'].astype(int)
    folds.to_csv(OUTPUT_DIR + 'df_label_covid_5fold.csv', index=False)
else:
    folds = pd.read_csv(OUTPUT_DIR + 'df_label_covid_5fold.csv')

In [21]:
y = folds['label_covid']
pos_scale = (y == 0).sum() / (y == 1).sum()
print(pos_scale)
targets = []
preds = []
aucs = []

for fold in range(5):
    train_idx = folds['fold'] != fold
    valid_idx = folds['fold'] == fold
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_val = X[valid_idx]
    y_val = y[valid_idx]

    targets.append(y_val)
    model = get_model(pos_scale=pos_scale)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    preds.append(pred)
    auc = roc_auc_score(y_val, pred)
    aucs.append(auc)
    print(auc)
    del model

targets = np.concatenate(targets)
preds = np.concatenate(preds)

print("(!) cv5 AUC ", np.mean(aucs), np.std(aucs))
evaluate(preds, targets)

4.822751322751323
0.6382119682768566
0.6547031963470321
0.6974429223744292
0.628903990746096
0.6365673799884326
(!) cv5 AUC  0.6511658915465692 0.024619459793158903

AUC score:       0.6510
Best threshold       0.0000

Confusion Matrix : 
 [[1467  356]
 [ 190  188]]

Precision    :       0.3456
Recall :       0.4974
F1 Score :       0.4078

Accuracy    :       0.7519


# Train abnormal

In [22]:
 if not os.path.exists(OUTPUT_DIR + 'df_label_abnormal_5fold.csv'):
    folds = df.copy()
    Fold = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    for n, (train_index, val_index) in enumerate(Fold.split(folds, folds['label_abnormal'])):
        folds.loc[val_index, 'fold'] = int(n)
    folds['fold'] = folds['fold'].astype(int)
    folds.to_csv(OUTPUT_DIR + 'df_label_abnormal_5fold.csv', index=False)
else:
    folds = pd.read_csv(OUTPUT_DIR + 'df_label_abnormal_5fold.csv')

In [23]:
y = folds['label_abnormal']
pos_scale = (y == 0).sum() / (y == 1).sum()
print(pos_scale)
targets = []
preds = []
aucs = []

def get_model(pos_scale):
    model = xgb.XGBClassifier(
        max_depth=7,
        scale_pos_weight=pos_scale,
        learning_rate=0.3,
        n_estimators=200,
        subsample=1,
        colsample_bytree=1,
        nthread=-1,
        seed=seed,
        eval_metric='logloss'
    )
    return model

for fold in range(5):
    train_idx = folds['fold'] != fold
    valid_idx = folds['fold'] == fold
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_val = X[valid_idx]
    y_val = y[valid_idx]

    targets.append(y_val)
    model = get_model(pos_scale=pos_scale)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    preds.append(pred)
    auc = roc_auc_score(y_val, pred)
    aucs.append(auc)
    print(auc)
    del model

targets = np.concatenate(targets)
preds = np.concatenate(preds)

print("(!) cv5 AUC ", np.mean(aucs), np.std(aucs))
evaluate(preds, targets)

4.658097686375322
0.5915448188175461
0.5905155450609996
0.5887519478679699
0.5768168295792605
0.5667587476979743
(!) cv5 AUC  0.58287757780475 0.009638642926106993

AUC score:       0.5829
Best threshold       0.0000

Confusion Matrix : 
 [[1763   49]
 [ 314   75]]

Precision    :       0.6048
Recall :       0.1928
F1 Score :       0.2924

Accuracy    :       0.8351


## Test other model

### IsolationForest

In [26]:
y = folds['label_abnormal']
pos_scale = (y == 0).sum() / (y == 1).sum()
print(pos_scale)

def get_model(pos_scale):
    model = IsolationForest(n_estimators=500, max_samples='auto', contamination=0.1, n_jobs=-1, random_state=seed) 
    return model

4.658097686375322


In [27]:
targets = []
preds = []
aucs = []

for fold in range(5):
    train_idx = folds['fold'] != fold
    valid_idx = folds['fold'] == fold
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_val = X[valid_idx]
    y_val = y[valid_idx]

    targets.append(y_val)
    model = get_model(pos_scale)
    model.fit(X_train)

    scores = (-1.0) * model.decision_function(X_val)
    pred = scores.flatten()
    preds.append(pred)
    auc = roc_auc_score(y_val, pred)
    aucs.append(auc)
    print(auc)
    del model

targets = np.concatenate(targets)
preds = np.concatenate(preds)

print("(!) cv5 AUC ", np.mean(aucs), np.std(aucs))
evaluate(preds, targets)

0.4778907960726142
0.5172265750778148
0.49247414647967136
0.5215682107947301
0.47565165037540724
(!) cv5 AUC  0.4969622757600475 0.019256693703546882

AUC score:       0.4958
Best threshold       0.0000

Confusion Matrix : 
 [[1637  175]
 [ 337   52]]

Precision    :       0.2291
Recall :       0.1337
F1 Score :       0.1688

Accuracy    :       0.7674


### XGBOD

In [28]:
y = folds['label_abnormal']
pos_scale = (y == 0).sum() / (y == 1).sum()
print(pos_scale)

from pyod.models.xgbod import XGBOD

def get_model(pos_scale):
    # Model candidate
    model = XGBOD(max_depth=7,
        scale_pos_weight=pos_scale,
        learning_rate=0.3,
        n_estimators=200,
        subsample=1,
        colsample_bytree=1,
        nthread=-1,
        seed=seed,
        eval_metric='logloss')
    return model

4.658097686375322


In [29]:
targets = []
preds = []
aucs = []

for fold in range(5):
    train_idx = folds['fold'] != fold
    valid_idx = folds['fold'] == fold
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_val = X[valid_idx]
    y_val = y[valid_idx]

    targets.append(y_val)
    model = get_model(pos_scale)
    model.fit(X_train, y_train)

    pred = model.decision_function(X_val)  # predict raw outlier scores on test
    auc = roc_auc_score(y_val, pred)
    preds.append(pred)
    aucs.append(auc)
    print(auc)
    del model

targets = np.concatenate(targets)
preds = np.concatenate(preds)

print("(!) cv5 AUC ", np.mean(aucs), np.std(aucs))
evaluate(preds, targets)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.6851380942290033
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.6471324818432257
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.6778403456580252
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are 

### IForest

In [30]:
y = folds['label_abnormal']
pos_scale = (y == 0).sum() / (y == 1).sum()
print(pos_scale)

from pyod.models.iforest import IForest

def get_model(pos_scale):
    model = IForest(n_estimators=500, max_samples='auto', contamination=0.1, n_jobs=-1, random_state=seed) 
    return model

4.658097686375322


In [31]:
targets = []
preds = []
aucs = []

for fold in range(5):
    train_idx = folds['fold'] != fold
    valid_idx = folds['fold'] == fold
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_val = X[valid_idx]
    y_val = y[valid_idx]

    targets.append(y_val)
    
    if PRETRAIN == 'OpenSmileBoAW':
        from sklearn.cluster import MiniBatchKMeans

        p = int(X_train.shape[1]/130)
        X_train1 = X_train[:, :(p-1)*65].reshape(-1, p-1, 65)
        X_train2 = X_train[:, (p-1)*65:].reshape(-1, (p+1), 65)
        X_val1 = X_val[:, :(p-1)*65].reshape(-1, (p-1), 65)
        X_val2 = X_val[:, (p-1)*65:].reshape(-1, (p+1), 65)

        t = time.time()
        vocab1 = MiniBatchKMeans(n_clusters=codebook_size).fit(X_train1.reshape(-1, 65))
        codebook1 = vocab1.cluster_centers_
        print("Kmean =", time.time()-t)
        X_train1 = generate_boaw(X_train1, codebook1)
        X_val1 = generate_boaw(X_val1, codebook1)
        
        vocab2 = MiniBatchKMeans(n_clusters=codebook_size).fit(X_train2.reshape(-1, 65))
        codebook2 = vocab2.cluster_centers_
        X_train2 = generate_boaw(X_train2, codebook2)
        X_val2 = generate_boaw(X_val2, codebook2)

        X_train = np.nan_to_num(np.concatenate([X_train1, X_train2], axis=1))  # [num_sample, codebook_size*2]
        X_val = np.nan_to_num(np.concatenate([X_val1, X_val2], axis=1))

    model = get_model(pos_scale)
    model.fit(X_train)

    pred = model.decision_function(X_val)  # predict raw outlier scores on test
    auc = roc_auc_score(y_val, pred)
    preds.append(pred)
    aucs.append(auc)
    print(auc)
    del model

targets = np.concatenate(targets)
preds = np.concatenate(preds)

print("(!) cv5 AUC ", np.mean(aucs), np.std(aucs))
evaluate(preds, targets)

0.4778907960726142
0.5172265750778148
0.49247414647967136
0.5215682107947301
0.47565165037540724
(!) cv5 AUC  0.4969622757600475 0.019256693703546882

AUC score:       0.4958
Best threshold       0.0000

Confusion Matrix : 
 [[1637  175]
 [ 337   52]]

Precision    :       0.2291
Recall :       0.1337
F1 Score :       0.1688

Accuracy    :       0.7674


# Save notebook

In [32]:
!cp "SoundDr_cough.ipynb" "$OUTPUT_DIR/Coswara_cough_frill_svm_1111.ipynb"