In [1]:
import librosa 
import numpy as np
import os
import warnings
warnings.simplefilter("ignore", UserWarning)
import pandas as pd

import pickle
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import plot_confusion_matrix,ConfusionMatrixDisplay

import numpy as np

# import librosa
# import soundfile as sf
import os
import warnings
warnings.filterwarnings('ignore')

# import librosa.display
# import sox
# from surfboard.sound import Waveform

# import surfboard
from sklearn.metrics import matthews_corrcoef


In [2]:
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping
from tensorflow.keras.layers import BatchNormalization,Dropout,Conv1D,Add,MaxPooling1D,Lambda,TimeDistributed,Dense,Activation,GlobalMaxPooling1D
def _bn_relu(layer, dropout=0, **params):
    '''
    Apply Batch Normalization followed by the configured activation and,
    optionally, Dropout.

    Parameters
    ----------
    layer : Keras tensor to transform.
    dropout : dropout rate; Dropout is only added when this is > 0.
    **params : must contain "conv_activation" (name of the activation).

    Returns
    -------
    The transformed Keras tensor.
    '''
    layer = BatchNormalization()(layer)
    layer = Activation(params["conv_activation"])(layer)

    if dropout > 0:
        # Use the rate that was actually requested by the caller. Previously the
        # argument only gated this branch and params["conv_dropout"] was used
        # as the rate, silently ignoring an explicit value (and raising
        # KeyError when that key was absent).
        layer = Dropout(dropout)(layer)

    return layer

def add_conv_weight(
        layer,
        filter_length,
        num_filters,
        subsample_length=1,
        **params):
    '''
    Apply a 'same'-padded Conv1D to `layer`.

    filter_length is the kernel size, num_filters the number of output
    channels and subsample_length the stride. The kernel initializer is
    read from params["conv_init"].
    '''
    conv = Conv1D(
        filters=num_filters,
        kernel_size=filter_length,
        strides=subsample_length,
        padding='same',
        kernel_initializer=params["conv_init"])
    return conv(layer)


def add_conv_layers(layer, **params):
    '''
    Build a plain (non-residual) stack: one Conv1D + BN/activation pair per
    entry of params["conv_subsample_lengths"], each entry being that
    convolution's stride. Every convolution uses
    params["conv_num_filters_start"] filters.
    '''
    for stride in params["conv_subsample_lengths"]:
        conv_out = add_conv_weight(
            layer,
            params["conv_filter_length"],
            params["conv_num_filters_start"],
            subsample_length=stride,
            **params)
        layer = _bn_relu(conv_out, **params)
    return layer

def resnet_block(
        layer,
        num_filters,
        subsample_length,
        block_index,
        **params):
    '''
    Build one pre-activation residual block and return its output tensor.

    Parameters
    ----------
    layer : input Keras tensor (batch, steps, channels).
    num_filters : number of filters of every convolution in the block.
    subsample_length : stride of the first convolution and pool size of the
        shortcut max-pooling, so both branches are down-sampled equally.
    block_index : position of the block in the network; decides whether the
        shortcut is channel-padded and whether the leading BN/activation is
        skipped.
    **params : reads "conv_increase_channels_at", "conv_num_skip",
        "conv_dropout" and "conv_filter_length" (plus whatever _bn_relu and
        add_conv_weight read).
    '''

    def zeropad(x):
        '''
        Double the channel axis (axis 2) of x by appending zeros of the same shape.
        Used on the shortcut when the number of filters doubles, so that the
        two branches can be added.
        '''
        y = tf.zeros_like(x)
        return tf.concat([x, y], axis=2)

    def zeropad_output_shape(input_shape):
        '''
        Output shape of `zeropad`: same as the rank-3 input shape with the
        third dimension doubled.
        '''
        shape = list(input_shape)
        assert len(shape) == 3
        shape[2] *= 2
        return tuple(shape)

    # Skip connection, down-sampled like the convolutional branch.
    shortcut = MaxPooling1D(pool_size=subsample_length)(layer)
    # Every "conv_increase_channels_at"-th block (except block 0) doubles the
    # filters, so the shortcut must be padded for the channel counts to match.
    zero_pad = (block_index % params["conv_increase_channels_at"]) == 0 \
        and block_index > 0
    # Truthiness test instead of `is True`: an identity check is False for
    # truthy non-bool values (e.g. numpy.bool_), which would silently skip the
    # padding and make the Add() below fail on mismatched channels.
    if zero_pad:
        shortcut = Lambda(zeropad, output_shape=zeropad_output_shape)(shortcut)
    for i in range(params["conv_num_skip"]):
        # The very first convolution of the network is not preceded by
        # BN/activation here.
        if not (block_index == 0 and i == 0):
            layer = _bn_relu(
                layer,
                dropout=params["conv_dropout"] if i > 0 else 0,
                **params)
        # Only the first convolution of the block is strided.
        layer = add_conv_weight(
            layer,
            params["conv_filter_length"],
            num_filters,
            subsample_length if i == 0 else 1,
            **params)
    layer = Add()([shortcut, layer])
    return layer

def get_num_filters_at_index(index, num_start_filters, **params):
    '''
    Number of convolution filters for the block at position `index`.

    The count starts at `num_start_filters` and doubles once for every
    params["conv_increase_channels_at"] blocks.
    '''
    doublings = int(index / params["conv_increase_channels_at"])
    return (2 ** doublings) * num_start_filters

def add_resnet_layers(layer, **params):
    '''
    Add the residual trunk of the network.

    The trunk is an initial stride-1 convolution with BN/activation, then one
    residual block per entry of params["conv_subsample_lengths"] (the entry
    is the block's stride), then a final BN/activation. The first and last
    layers are special-cased because the blocks are pre-activation.
    '''
    stem = add_conv_weight(
        layer,
        params["conv_filter_length"],
        params["conv_num_filters_start"],
        subsample_length=1,
        **params)
    layer = _bn_relu(stem, **params)

    for block_index, stride in enumerate(params["conv_subsample_lengths"]):
        block_filters = get_num_filters_at_index(
            block_index, params["conv_num_filters_start"], **params)
        layer = resnet_block(
            layer,
            block_filters,
            stride,
            block_index,
            **params)

    return _bn_relu(layer, **params)
    
def add_output_layer(layer, **params):
    '''
    Add the classification head.

    The time axis is collapsed with GlobalMaxPooling1D (layer named 'feats'),
    then a Dense layer with params["num_categories"] units and a sigmoid
    activation produce one prediction per input sequence.
    '''
    pooled = GlobalMaxPooling1D(name = 'feats')(layer)
    logits = Dense(params["num_categories"])(pooled)
    return Activation('sigmoid')(logits)

def add_compile(model, **params):
    '''
    Compile `model` with the Adam optimizer, binary cross-entropy loss and
    the accuracy metric, and return it.

    params["learning_rate"] (optional) sets Adam's learning rate; when it is
    missing or None the Keras default of 0.001 is used, which matches the
    previous behaviour of optimizer='adam'.
    '''
    learning_rate = params.get("learning_rate")
    if learning_rate is None:
        learning_rate = 0.001  # Keras' default for Adam

    # Previously the configured learning rate was silently ignored.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

def build_network(**params):
    '''
    Assemble the full model from the given parameters.

    Reads params['input_shape'] for the input tensor, uses the plain
    convolutional stack when params['is_regular_conv'] is truthy (default:
    the residual stack), appends the output head, and compiles the model
    unless params['compile'] is falsy (default: compile).
    '''
    inputs = tf.keras.Input(shape=params['input_shape'],
                            dtype='float32',
                            name='inputs')

    if params.get('is_regular_conv', False):
        trunk_builder = add_conv_layers
    else:
        trunk_builder = add_resnet_layers
    features = trunk_builder(inputs, **params)

    output = add_output_layer(features, **params)
    model = tf.keras.Model(inputs=[inputs], outputs=[output])

    if not params.get("compile", True):
        return model
    return add_compile(model, **params)


2023-02-08 22:01:39.034967: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-08 22:01:39.613903: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-08 22:01:39.613939: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-08 22:01:41.231335: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [4]:

# Hyper-parameters of the 1-D residual network (one residual block per entry
# of conv_subsample_lengths; the entry is that block's stride).
params = dict(
    conv_subsample_lengths=[1, 2] * 8,
    conv_filter_length=16,
    conv_num_filters_start=32,
    conv_init="he_normal",
    conv_activation="relu",
    conv_dropout=0.2,
    conv_num_skip=2,
    conv_increase_channels_at=4,
    learning_rate=0.001,
    input_shape=[22016, 1],
    num_categories=1,
    compile=False,  # the model is only used for inference here
)

# Build the architecture, then restore the trained weights into it.
model = build_network(**params)
model.load_weights('CNN_Weights/cnn_model3_cough_vs_non_cough_segmented_22016_65k_files_orig_train_included_v2_final.h5')

In [143]:
# path =  'D:\AI4LYF\Covid\Latest Work\cohort_3_segmented\cohort_3_segmented\pos_seg'
path_pos = '/home/bigpenguin/projects/project_covid/gathrd_data/cohort3/cohort_3_segmented_mp3/pos_seg/'
# Sorted full paths of every entry in the positive folder, skipping notebook
# artefacts (any name containing 'ipynb').
pos_files = np.array(sorted(
    os.path.join(path_pos, entry)
    for entry in os.listdir(path_pos)
    if 'ipynb' not in entry
))
len(pos_files)

2561

In [144]:
# path =  'D:\AI4LYF\Covid\Latest Work\cohort_3_segmented\cohort_3_segmented\\neg_seg'
# path='/home/bigpenguin/projects/surfboard_works/data_for_cough_denoising/coughs/segmented/'
path_neg = '/home/bigpenguin/projects/project_covid/gathrd_data/cohort3/cohort_3_segmented_mp3/neg_seg/'
# Sorted full paths of every entry in the negative folder, skipping notebook
# artefacts (any name containing 'ipynb').
neg_files = np.array(sorted(
    os.path.join(path_neg, entry)
    for entry in os.listdir(path_neg)
    if 'ipynb' not in entry
))
len(neg_files)

8059

In [145]:
# Positive paths first, then negative paths, as one flat list.
all_files = list(pos_files) + list(neg_files)

In [146]:
# Sanity check: total number of segment files (positives + negatives).
len(all_files)

10620

In [147]:
# Load every positive segment at 22050 Hz, peak-normalise it and force it to
# exactly 22016 samples (the network's input length).
X_s = []
pos_count = 0
neg_count = 0
for file in pos_files:
    y, sr = librosa.load(file, sr=22050)
    y = librosa.util.normalize(y)

    if len(y) > 22016:
        # Too long: report the file and keep only the first 22016 samples.
        print(file)
        print(len(y))
        y = y[:22016]
    else:
        # Short (or exact) clip: zero-pad at the end.
        pad_num = 22016 - len(y)
        y = np.pad(y, (0, pad_num), mode='constant')

    X_s.append(y)


/home/bigpenguin/projects/project_covid/gathrd_data/cohort3/cohort_3_segmented_mp3/pos_seg/pos-0512-019-cough-m-59-5.mp3
22284


In [148]:
# Same preprocessing as for the positive segments: load at 22050 Hz,
# peak-normalise, then truncate or zero-pad to exactly 22016 samples.
X_s_neg = []
for file in neg_files:
    y, sr = librosa.load(file, sr=22050)
    y = librosa.util.normalize(y)

    if len(y) > 22016:
        y = y[:22016]
    else:
        pad_num = 22016 - len(y)
        y = np.pad(y, (0, pad_num), mode='constant')

    X_s_neg.append(y)

In [149]:
# Stack the per-file waveforms into 2-D arrays (n_files, n_samples).
X_s, X_s_neg = np.array(X_s), np.array(X_s_neg)
(X_s.shape, X_s_neg.shape)

((2561, 22016), (8059, 22016))

In [150]:
# Score the positive-folder segments with the first model and binarise the
# sigmoid output at 0.97.
y_pred_proba = model.predict(X_s)
y_pred = np.where(y_pred_proba > 0.97, 1, 0)

# Positions of the segments whose score did NOT exceed the threshold.
indices_pos = [idx for idx, label in enumerate(y_pred) if label[0] == 0]
len(indices_pos)



688

In [151]:
# y_pred_proba=model.predict(X_s_neg[0:50000])
# y_pred_proba1=model.predict(X_s_neg[50000:100000])
# y_pred_proba2=model.predict(X_s_neg[100000:])

# len(y_pred_proba),len(y_pred_proba1),len(y_pred_proba2)
# ypp=np.array([*y_pred_proba.reshape(-1),*y_pred_proba1.reshape(-1),*y_pred_proba2.reshape(-1)])

In [152]:
# Score the negative-folder segments with the first model and binarise the
# sigmoid output at 0.97.
y_pred_proba = model.predict(X_s_neg)
y_pred = np.where(y_pred_proba > 0.97, 1, 0)

# Positions of the segments whose score did NOT exceed the threshold.
indices_neg = [idx for idx, label in enumerate(y_pred) if label[0] == 0]
len(indices_neg)



2365

In [153]:

# Hyper-parameters of the second network: same architecture as the first
# model, only the checkpoint restored below differs.
params = dict(
    conv_subsample_lengths=[1, 2] * 8,
    conv_filter_length=16,
    conv_num_filters_start=32,
    conv_init="he_normal",
    conv_activation="relu",
    conv_dropout=0.2,
    conv_num_skip=2,
    conv_increase_channels_at=4,
    learning_rate=0.001,
    input_shape=[22016, 1],
    num_categories=1,
    compile=False,  # the model is only used for inference here
)

# Build the architecture, then restore the trained weights into it.
model_1 = build_network(**params)
model_1.load_weights('/home/bigpenguin/projects/surfboard_works/CNN_Weights/cnn_model3_cough_vs_non_cough_segmented_22016_orig_train_included_final.h5')


In [154]:
# Score the positive-folder segments with the second model and binarise the
# sigmoid output at 0.97.
y_pred_proba = model_1.predict(X_s)
y_pred = np.where(y_pred_proba > 0.97, 1, 0)

# Positions of the segments whose score did NOT exceed the threshold.
indices_pos_1 = [idx for idx, label in enumerate(y_pred) if label[0] == 0]
len(indices_pos_1)



296

In [155]:
# y_pred_proba_1=model_1.predict(X_s_neg[0:50000])
# y_pred_proba1_1=model_1.predict(X_s_neg[50000:100000])
# y_pred_proba2_1=model_1.predict(X_s_neg[100000:])

# len(y_pred_proba_1),len(y_pred_proba1_1),len(y_pred_proba2_1)
# ypp_1=np.array([*y_pred_proba_1.reshape(-1),*y_pred_proba1_1.reshape(-1),*y_pred_proba2_1.reshape(-1)])

In [156]:
# Score the negative-folder segments with the second model and binarise the
# sigmoid output at 0.97.
y_pred_proba = model_1.predict(X_s_neg)
y_pred = np.where(y_pred_proba > 0.97, 1, 0)

# Positions of the segments whose score did NOT exceed the threshold.
indices_neg_1 = [idx for idx, label in enumerate(y_pred) if label[0] == 0]
len(indices_neg_1)



929

In [157]:
# Positive-folder files that at least one of the two models scored at or
# below the threshold.
pos_del = set(pos_files[indices_pos]) | set(pos_files[indices_pos_1])
len(pos_del)

732

In [158]:
# Negative-folder files that at least one of the two models scored at or
# below the threshold.
neg_del = set(neg_files[indices_neg_1]) | set(neg_files[indices_neg])
len(neg_del)

2457

In [159]:
# Sizes of the two candidate sets: (negatives, positives).
len(neg_del),len(pos_del)

(2457, 732)

In [162]:
'''
!!!!!!!!!!! USE CAREFULLY !!!!!!!!!!!
'''

# # negative and positive non cough samples deletion from target dirs -- 
# # p='/home/bigpenguin/projects/project_covid/gathrd_data/COHORT1/cohort_1_segmented_mp3/pos_seg/'
# # n='/home/bigpenguin/projects/project_covid/gathrd_data/COHORT1/cohort_1_segmented_mp3/neg_seg/'

# def delete_non_cough(target_dir,list_of_files_to_delete):
#     [os.remove(target_dir+i) for i in [x.split('/')[-1] for x in list_of_files_to_delete] if i in os.listdir(target_dir)]
# #         if i in os.listdir(target_dir):
# #             os.remove(target_dir+i)

# delete_non_cough(path_pos,pos_del)
# delete_non_cough(path_neg,neg_del)

In [161]:
# np.save("pos_to_be deleted.npy",np.array(list(pos_del)))
# np.save("neg_to_be deleted.npy",np.array(list(neg_del)))