In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.metrics import plot_confusion_matrix,matthews_corrcoef
from sklearn.model_selection import cross_val_score, train_test_split
import os
import librosa
from librosa.display import waveshow

# my modules
from preprocessing import cough_segmentation
from feature_extraction import librosa_feature_columns, librosa_normal_embedding

In [None]:
# Sampling rate used throughout the notebook; it is passed to create_data and to the embedding calls below.
sr=22050

cough preprocessing

In [None]:
# Root directory of the pipeline data. This is a machine-specific absolute path: adjust it before running elsewhere.
base_path='/home/bigpenguin/projects/project_covid/covid_models/silent_split_covid_dataset/covid_for_rauf_pipeline/'

In [None]:
'''
setting main cohorts paths
'''

c = 1  # number of the cohort whose raw coughs are segmented in this run

# Build every directory from base_path (defined in the cell above) instead of
# repeating the machine-specific root four times; the resulting strings are
# identical to the previous hard-coded values.
_raw_cough_dir = f'{base_path}data/raw/cohort{c}/cough/'
_transformed_dir = f'{base_path}data/transformed/cohort{c}/'

input_dir_neg = f'{_raw_cough_dir}cough_neg/'
input_dir_pos = f'{_raw_cough_dir}cough_pos/'
output_dir_neg = f'{_transformed_dir}seg_neg/'
output_dir_pos = f'{_transformed_dir}seg_pos/'


In [None]:
# Show the contents of the raw and segmented directories (negative/positive inputs, then outputs).
tuple(os.listdir(folder) for folder in (input_dir_neg, input_dir_pos, output_dir_neg, output_dir_pos))

In [None]:
# import soundfile as sf

# count=0
# for i in os.listdir(input_dir_neg):
# #     try:
#     aud, sr = librosa.load(input_dir_neg+i,sr=22050)
#     aud = librosa.util.normalize(aud)

#     if len(aud)/sr>1.0:
#         splits=librosa.effects.split(aud,top_db=33)
#         print(splits)
#         for sp in splits:
#             pads=sp[1]-sp[0]
#             if pads<44100:
#                 diff=44100-pads
#                 zeros=np.zeros(int(diff/2))
#                 tsec=np.pad(aud[sp[0]:sp[1]],len(zeros))
#                 tsec = librosa.util.normalize(tsec)
#                 n=i.split('.')[0]
#                 sf.write(f'{output_dir_neg}{n+"_"+str(count)}.wav',tsec ,sr,format='wav')
#                 count+=1
# #     except:
# #         print('file length zero')

In [None]:
'''
cough segmentation 
'''

# Create the project's segmenter object and run it once per class:
# first on the negative raw directory, then on the positive one.
# NOTE(review): presumably segmenter() writes the segmented clips into the
# directory given as second argument — confirm in preprocessing.py.
cso = cough_segmentation()
cso.segmenter(input_dir_neg,output_dir_neg)
cso.segmenter(input_dir_pos,output_dir_pos)

In [None]:
'''
cough model inference
'''

feature extraction

In [None]:
'''
create audio data method
'''

def create_data(data_dir,sr):
    """Load every file found directly inside ``data_dir`` with librosa.

    Parameters
    ----------
    data_dir : str
        Directory holding the audio files. A trailing path separator is
        optional.
    sr : int
        Sampling rate forwarded to ``librosa.load``.

    Returns
    -------
    list
        The first element returned by ``librosa.load`` for each file,
        ordered by file name.
    """
    data=[]
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and everything derived from it) reproducible between runs.
    for file_name in sorted(os.listdir(data_dir)):
        # os.path.join is correct with or without a trailing separator on
        # data_dir, unlike plain string concatenation.
        signal,_=librosa.load(os.path.join(data_dir,file_name),sr=sr)
        data.append(signal)
    return data

In [None]:
'''
feature extraction method
'''

def feature_extractor(sr,embedding,data,columns,y):
    """Build a labelled feature table from a list of audio samples.

    Parameters
    ----------
    sr : int
        Sampling rate, passed as second argument to ``embedding``.
    embedding : callable
        Called as ``embedding(sample, sr)`` for every element of ``data``;
        each result becomes one row of the table.
    data : iterable
        The samples to embed.
    columns : list
        Column names for the feature table.
    y : list or scalar
        Class label. Either a one-element list such as ``['covid-19']``
        (repeated once per row, as before) or a bare scalar label, which is
        assigned to every row.

    Returns
    -------
    pandas.DataFrame
        One row per sample, the feature columns plus a ``'status'`` column.
    """
    feats=[embedding(x,sr) for x in data]
    feats_df=pd.DataFrame(feats,columns=columns)
    if np.isscalar(y):
        # A bare label is broadcast by pandas. Multiplying it by the row count
        # would concatenate a string n times (or scale a number) instead.
        feats_df['status']=y
    else:
        # Sequence label: repeat it once per row (original behaviour).
        feats_df['status']=feats_df.shape[0]*y
    return feats_df

In [None]:
'''
generate column names for the embedding
'''

# Column names from the project's feature_extraction module; passed as the `columns` argument when the feature tables are built below.
columns = librosa_feature_columns()

In [None]:
'''
setting paths for the segmented cohorts
'''

# NOTE(review): these look like placeholder paths — presumably they should
# point at the directories the segmentation step writes to; confirm and
# replace before running.
_seg_pos_root = '/path/to/segmented/positive/'
_seg_neg_root = '/path/to/segmented/negative/'

cohort_1_seg_pos = f'{_seg_pos_root}cohort_1/'
cohort_1_seg_neg = f'{_seg_neg_root}cohort_1/'

cohort_2_seg_pos = f'{_seg_pos_root}cohort_2/'
cohort_2_seg_neg = f'{_seg_neg_root}cohort_2/'

cohort_3_seg_pos = f'{_seg_pos_root}cohort_3/'
cohort_3_seg_neg = f'{_seg_neg_root}cohort_3/'

In [None]:
'''
create data
'''

# Load the positive and then the negative recordings of each cohort, in cohort order.
cohort_1_data_pos, cohort_1_data_neg = [
    create_data(folder, sr) for folder in (cohort_1_seg_pos, cohort_1_seg_neg)
]

cohort_2_data_pos, cohort_2_data_neg = [
    create_data(folder, sr) for folder in (cohort_2_seg_pos, cohort_2_seg_neg)
]

cohort_3_data_pos, cohort_3_data_neg = [
    create_data(folder, sr) for folder in (cohort_3_seg_pos, cohort_3_seg_neg)
]

In [None]:
'''
extract audio features
'''

# One-element label lists; feature_extractor repeats them once per sample.
positive=['covid-19']
negative=['healthy']

# The helper defined above is named feature_extractor. The previous calls used
# feature_extraction, which is not a bound name in this notebook (only names
# *from* that module are imported), so every line raised NameError.
cohort_1_pos_features = feature_extractor(sr,librosa_normal_embedding,cohort_1_data_pos,columns,positive)
cohort_1_neg_features = feature_extractor(sr,librosa_normal_embedding,cohort_1_data_neg,columns,negative)

cohort_2_pos_features = feature_extractor(sr,librosa_normal_embedding,cohort_2_data_pos,columns,positive)
cohort_2_neg_features = feature_extractor(sr,librosa_normal_embedding,cohort_2_data_neg,columns,negative)

cohort_3_pos_features = feature_extractor(sr,librosa_normal_embedding,cohort_3_data_pos,columns,positive)
cohort_3_neg_features = feature_extractor(sr,librosa_normal_embedding,cohort_3_data_neg,columns,negative)

prepare train and test data

In [None]:
'''
prepare train and test data with cohort 1 and cohort 2
'''

cohort_1 = pd.concat([cohort_1_pos_features,cohort_1_neg_features],axis=0).reset_index(drop=True)
cohort_2 = pd.concat([cohort_2_pos_features,cohort_2_neg_features],axis=0).reset_index(drop=True)

cohort_1_2 = pd.concat([cohort_1,cohort_2],axis=0).reset_index(drop=True)

cohort_1_2_y = cohort_1_2.drop(['status'],axis=1)

In [None]:
'''
get target classes
'''

classes=cohort_1_2_y['status'].unique()

In [None]:
'''
splitting dataset to train & test
'''

X_train,X_test,y_train,y_test = train_test_split(
            cohort_1_2,
            cohort_1_2_y,
            test_size=0.2,
            random_state=42,
            shuffle=True,
            stratify=True
)

train & test models

In [None]:
# Fit XGBoost on the training split (fit returns the estimator itself), then predict the held-out split.
xgbc = XGBClassifier(random_state=42).fit(X_train,y_train)
xgb_pred = xgbc.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Both metrics here are symmetric,
# so the numbers are unchanged, but the order now matches the API and the
# confusion_matrix / classification_report cells.
print("Acc -- ",accuracy_score(y_test,xgb_pred)*100)
print("Mcc -- ",matthews_corrcoef(y_test,xgb_pred))
# 5-fold cross-validation on the whole cohort 1+2 data, not just the training split.
cv_scores = cross_val_score(xgbc,cohort_1_2,cohort_1_2_y,cv=5)
print("Cross Val Acc -- ", np.mean(cv_scores)*100)

In [None]:

# Annotated heatmap of the XGBoost confusion matrix on the held-out split.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test,xgb_pred),annot=True,fmt='g',ax=ax)

In [None]:
# Raw counts are plotted because normalize is left disabled below, so the
# title must not claim normalization. Re-enable normalize='true' together with
# a "Normalized" title if rates are wanted.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
disp = plot_confusion_matrix(xgbc, X_test,y_test, 
                             display_labels=classes,
                             cmap=plt.cm.Blues,
#                              normalize='true'
                            )
disp.ax_.set_title("XGBoost confusion matrix")
plt.show()

In [None]:
# Per-class report for the XGBoost predictions on the held-out split.
print(classification_report(y_test,xgb_pred))

In [None]:
# Fit LightGBM on the training split (fit returns the estimator itself), then predict the held-out split.
lgb = LGBMClassifier(random_state=42).fit(X_train,y_train)
lgb_pred = lgb.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Both metrics here are symmetric,
# so the numbers are unchanged, but the order now matches the API and the
# confusion_matrix / classification_report cells.
print("Acc -- ",accuracy_score(y_test,lgb_pred)*100)
print("Mcc -- ",matthews_corrcoef(y_test,lgb_pred))
# 5-fold cross-validation on the whole cohort 1+2 data, not just the training split.
cv_scores = cross_val_score(lgb,cohort_1_2,cohort_1_2_y,cv=5)
print("Cross Val Acc  -- ",np.mean(cv_scores)*100)

In [None]:
# Annotated heatmap of the LightGBM confusion matrix on the held-out split.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test,lgb_pred),annot=True,fmt='g',ax=ax)

In [None]:
# Raw counts are plotted because normalize is left disabled below, so the
# title must not claim normalization. Re-enable normalize='true' together with
# a "Normalized" title if rates are wanted.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
disp = plot_confusion_matrix(lgb, X_test,y_test, 
                             display_labels=classes,
                             cmap=plt.cm.Blues,
#                              normalize='true'
                            )
disp.ax_.set_title("LightGBM confusion matrix")

plt.show()

In [None]:
# Per-class report for the LightGBM predictions on the held-out split.
print(classification_report(y_test,lgb_pred))

In [None]:
# Fit ExtraTrees on the training split (fit returns the estimator itself), then predict the held-out split.
etc = ExtraTreesClassifier(random_state=42).fit(X_train,y_train)
etc_pred = etc.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Both metrics here are symmetric,
# so the numbers are unchanged, but the order now matches the API and the
# confusion_matrix / classification_report cells.
print("Acc -- ",accuracy_score(y_test,etc_pred)*100)
print("Mcc -- ",matthews_corrcoef(y_test,etc_pred))
# 5-fold cross-validation on the whole cohort 1+2 data, not just the training split.
cv_scores = cross_val_score(etc,cohort_1_2,cohort_1_2_y,cv=5)
print("Cross Val Acc-- ",np.mean(cv_scores)*100)

In [None]:
# Annotated heatmap of the ExtraTrees confusion matrix on the held-out split.
# This cell previously plotted lgb_pred (copy-paste from the LightGBM section),
# so the ExtraTrees section showed the LightGBM matrix a second time.
fig, ax = plt.subplots()
fig.set_size_inches(8 ,6)
sns.heatmap(confusion_matrix(y_test,etc_pred),annot=True,fmt='g')

In [None]:
# Raw counts are plotted because normalize is left disabled below, so the
# title must not claim normalization. Re-enable normalize='true' together with
# a "Normalized" title if rates are wanted.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
disp = plot_confusion_matrix(etc, X_test,y_test, 
                             display_labels=classes,
                             cmap=plt.cm.Blues,
#                              normalize='true'
                            )
disp.ax_.set_title("ExtraTrees confusion matrix")

plt.show()

In [None]:
# Per-class report for the ExtraTrees predictions on the held-out split.
print(classification_report(y_test,etc_pred))

test on unseen data

In [None]:
'''
prepare data
'''

cohort_3_labelled = pd.concat([cohort_3_pos_features,cohort_3_neg_features],axis=0).reset_index(drop=True)
# Target = the 'status' column, features = every other column.
# Previously the two were inverted: cohort_3_y held the features and cohort_3
# still contained 'status', so the models were asked to predict on a frame that
# included the label, and were scored against the feature matrix.
cohort_3_y = cohort_3_labelled['status']
cohort_3 = cohort_3_labelled.drop(['status'],axis=1)


In [None]:
'''
compute performance metrics
'''

def performance(actual_predictions):
    """Print accuracy, MCC, a crosstab and the classification report.

    ``actual_predictions`` is converted to a DataFrame and must provide the
    columns ``'actual'`` and ``'predictions'``. Nothing is returned.
    """
    results = pd.DataFrame(actual_predictions)
    truth, predicted = results['actual'], results['predictions']

    print("Acc -- ",accuracy_score(truth,predicted)*100)

    print("Mcc -- ",matthews_corrcoef(truth,predicted))

    print(pd.crosstab(truth,predicted))

    print(classification_report(truth,predicted))

In [None]:
'''
prepare actual and target values
'''

def model_rotation(model,data,ys):
    """Predict on ``data`` with ``model`` and print the metrics against ``ys``.

    ``model.predict`` receives ``data.values``; the predictions are paired
    with ``ys`` and handed to ``performance``. Nothing is returned.
    """
    performance({
        'actual' : ys,
        'predictions' : model.predict(data.values),
    })

In [None]:
# Score each fitted model on cohort 3, printing a banner before each result block.
print("Results on out of distribution dataset : Cohort 3")
for banner, fitted_model in (("XGB\n", xgbc), ("\nLGB\n", lgb), ("\nETC\n", etc)):
    print(banner)
    model_rotation(fitted_model,cohort_3,cohort_3_y)