# Load Libraries

In [1]:
# Turn off oneDNN custom ops. The variable has to be set in the kernel's own
# environment *before* TensorFlow is imported; a `!VAR=value` shell line only
# affects a throwaway subshell and leaves the kernel untouched.
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import tensorflow as tf

# Report how many GPUs TensorFlow can see (0 means it will run on CPU).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2025-09-21 22:07:58.778728: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-21 22:07:58.794047: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-21 22:07:58.810494: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-21 22:07:58.815200: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-21 22:07:58.827415: I tensorflow/core/platform/cpu_feature_guar

Num GPUs Available:  0


2025-09-21 22:08:01.826824: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
import warnings
warnings.filterwarnings('ignore')

import glob
import io
import datasets
import os
import time
import joblib
import json
import csv
import pathlib
import librosa
import librosa.display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import xgboost as xgb
import seaborn as sns

from tqdm.notebook import tqdm
from PIL import Image
from matplotlib import pyplot
from collections import Counter
from pprint import pprint
from pydub import AudioSegment # sudo apt install ffmpeg
%matplotlib inline

# Preprocessing
from sklearn.preprocessing import (
    LabelEncoder, 
    StandardScaler,
    MinMaxScaler,
    scale
    )
from sklearn.model_selection import (
    GridSearchCV, 
    train_test_split, 
    RepeatedStratifiedKFold, 
    cross_val_score, 
    KFold,
    StratifiedKFold
    ) 
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    roc_curve,
    roc_auc_score, 
    precision_recall_curve,
    auc,
    precision_score, 
    recall_score, 
    f1_score
    )


import torch.utils.data
from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset, DatasetDict, Audio

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score

from transformers import WhisperModel, WhisperFeatureExtractor, AdamW
# from transformers import WhisperEncoder
from transformers import WhisperProcessor

from functions_whisper_model import SpeechClassificationDataset, SpeechClassifier, train, evaluate

In [3]:
# Dataset groups to run. Every inner list is one experiment whose member
# datasets are pooled together before training.
#
# Class balance of each individual dataset (label 1 = cough):
#   fsdkaggle :   2% cough   Counter({0: 1570, 1: 30})
#   virufy    : 100% cough   Counter({1: 121})
#   esc50     :   2% cough   Counter({0: 1960, 1: 40})
#   coughvid  :  30% cough   Counter({1: 19777, 0: 10267})
#   coswara   :  25% cough   Counter({0: 18914, 1: 5408})
#
# Single-dataset experiments are currently switched off; to re-enable one,
# add e.g. ['coughvid'] as an extra entry below.
list_datasets = [
    ['coswara', 'coughvid', 'esc50', 'fsdkaggle', 'virufy'],
]

# Main

In [4]:
# Train and evaluate a Whisper-based cough classifier once per
# (window length, dataset group) pair. For every pair this cell writes, under
# Results_Onset/Model_Whisper_Onset/<dataset group>/:
#   - whisper_best_model_<w>s.pt    checkpoint path handed to train()
#   - results_test_data_<w>s.csv    test rows with the predicted label
#   - results_summary_<w>s.csv      one-row table of test metrics

AUDIO_ROOT = '/home/l083319/Cough_Related/'  # prefix for the relative paths stored in the slice CSVs
MIN_MEAN_AMPLITUDE = 0.005                   # slices at or below this mean_amplitude are discarded
MAX_SAMPLES_PER_CLASS = 1000                 # cap per class after shuffling
SEED = 42                                    # shared by the shuffle and both splits for repeatable runs

columns = ['dataset', 'dataset_counter', 'label_count', 'window_length',
           'model',
           'acc', 'sen', 'spe', 'pre', 'rec', 'f1', 'auc', 'auprc', 'cm']


def _to_audio_dataset(frame):
    """Wrap a split's file paths and labels in a `datasets.Dataset` whose
    "audio" column is cast to an Audio feature with sampling_rate=16_000."""
    return datasets.Dataset.from_dict({
        "audio": frame["full_path"].tolist(),
        "labels": frame["classID"].tolist(),
    }).cast_column("audio", Audio(sampling_rate=16_000))


for window_length in [
    # 0.1, 0.2, 0.3,
    # 0.5, 0.7,
    1
    ]:
    summary_rows = []  # one metrics row per dataset group for this window length
    for datasets_name in list_datasets:
        # sorted() builds a new list, so list_datasets itself is never mutated
        # and re-running the cell sees the same input.
        datasets_name = sorted(datasets_name)
        print('')
        print('#'*60)
        print(', '.join(datasets_name))
        print(f'Window Length: {window_length}')
        print('#'*60)

        dataset_str = '_'.join(datasets_name)

        out_dir = f'Results_Onset/Model_Whisper_Onset/{dataset_str}'
        os.makedirs(out_dir, exist_ok=True)

        path_model_save = f'{out_dir}/whisper_best_model_{window_length}s.pt'

        ################################################################
        # Load Data
        ################################################################
        # Read every member dataset first and concatenate once.
        df_all = pd.concat(
            [
                pd.read_csv(f'Results_Onset/Sliced_Wav_Onset/dataset_{dataset}_{window_length}s_onset.csv')
                for dataset in datasets_name
            ],
            axis=0,
            ignore_index=True,
        )

        ################################################################
        # Prepare Data
        ################################################################
        df_all['filepath'] = AUDIO_ROOT + df_all['filepath']
        df_all = df_all[df_all['mean_amplitude'] > MIN_MEAN_AMPLITUDE].reset_index(drop=True)

        # Drop metadata columns that are not needed downstream (absent ones are ignored).
        df_all = df_all.drop(
            columns=['prob', 'status', 'age', 'Unnamed: 0', 'gender', 'mean_amplitude'],
            errors='ignore',
        )

        audio_df = df_all.rename(columns={
            'label': 'classID',
            'filepath': 'full_path',
        })

        print(audio_df.shape)
        # Shuffle, then keep at most MAX_SAMPLES_PER_CLASS rows of each class.
        # The shuffle is seeded so the selected subset is the same on every run.
        audio_df = (
            audio_df
            .sample(frac=1, random_state=SEED)
            .groupby('classID')
            .head(MAX_SAMPLES_PER_CLASS)
            .reset_index(drop=True)
        )

        # print(audio_df)
        print(Counter(audio_df['dataset']))
        print(Counter(audio_df['classID']))

        # 70% train, then the remaining 30% is halved into validation and test.
        train_df, temp_df = train_test_split(audio_df, test_size=0.3, random_state=SEED)
        val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

        print('Train:', len(train_df))
        print('Val  :', len(val_df))
        print('Test :', len(test_df))

        train_audio_dataset = _to_audio_dataset(train_df)
        test_audio_dataset = _to_audio_dataset(test_df)
        val_audio_dataset = _to_audio_dataset(val_df)

        ################################################################
        # Load Whisper
        ################################################################
        # The pretrained model is loaded inside the loop, so every run starts
        # from the published checkpoint rather than from the previous run's
        # in-memory encoder.
        model_checkpoint = "openai/whisper-base"
        processor = WhisperProcessor.from_pretrained(model_checkpoint)
        whisper_model = WhisperModel.from_pretrained(model_checkpoint)
        encoder = whisper_model.encoder  # this is the encoder module
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        train_dataset = SpeechClassificationDataset(train_audio_dataset, processor, encoder)
        test_dataset = SpeechClassificationDataset(test_audio_dataset, processor, encoder)
        val_dataset = SpeechClassificationDataset(val_audio_dataset, processor, encoder)

        batch_size = 32

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        num_labels = 2

        model = SpeechClassifier(num_labels, encoder).to(device)
        optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)
        criterion = nn.CrossEntropyLoss()

        num_epochs = 1

        ################################################################
        # Train Whisper
        ################################################################
        start = time.time()
        train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs, path_model_save)
        end = time.time()

        print(f"Total runtime of the program is {round(end - start, 3)}s")
        print('Training Done!')

        ################################################################
        # Test Whisper
        ################################################################
        print('Load Whisper Model')

        # Rebuild the classifier and restore the weights stored at
        # path_model_save (presumably the best checkpoint written by train();
        # confirm in functions_whisper_model). map_location lets a checkpoint
        # saved on GPU load on a CPU-only machine and vice versa.
        state_dict = torch.load(path_model_save, map_location=device)
        model = SpeechClassifier(num_labels, encoder).to(device)
        model.load_state_dict(state_dict)

        print('Evaluate Data')
        _, _, _, all_labels, all_preds, all_probs = evaluate(model, test_loader, optimizer, criterion, device)

        y_test = all_labels
        y_predict = all_preds

        acc = accuracy_score(y_test, y_predict)
        print(classification_report(y_test, y_predict))
        print('ACC:', acc)
        print('Evaluation Done!')

        # labels=[0, 1] pins the matrix to 2x2 even if one class is missing
        # from the test predictions, so the tn/fp/fn/tp unpacking cannot fail.
        cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
        print(cm)

        # Threshold-free metrics use the probability assigned to class 1.
        lr_fpr, lr_tpr, _ = roc_curve(y_test, all_probs[:,1])
        roc_auc = auc(lr_fpr, lr_tpr)
        precision, recall, _ = precision_recall_curve(y_test, all_probs[:,1])
        pr_auc = auc(recall, precision)

        pre = precision_score(y_test, y_predict)
        rec = recall_score(y_test, y_predict)
        f1 = f1_score(y_test, y_predict)
        tn, fp, fn, tp = cm.ravel()
        spe = tn / (tn + fp)  # specificity
        sen = rec             # sensitivity is recall of the positive class

        result_row = [
            dataset_str,
            Counter(audio_df['dataset']),
            Counter(audio_df['classID']),
            window_length, 'Whisper',
            acc, sen, spe, pre, rec, f1,
            roc_auc, pr_auc, cm]
        summary_rows.append(result_row)

        # Write this group's summary into its own directory so that every
        # (group, window length) pair has a results_summary_<w>s.csv file.
        pd.DataFrame([result_row], columns=columns).to_csv(
            f'{out_dir}/results_summary_{window_length}s.csv', index=False)

        # assign() returns a new frame instead of writing into the slice
        # produced by train_test_split.
        test_df = test_df.assign(pred=all_preds)
        test_df.to_csv(f'{out_dir}/results_test_data_{window_length}s.csv', index=False)

        # Check which data is predicted wrongly
        test_df_wrong = test_df[test_df['classID'] != test_df['pred']]

    # All dataset groups for this window length, kept in memory for inspection.
    df_results = pd.DataFrame(summary_rows, columns=columns)

print('All Done!')


############################################################
coswara, coughvid, esc50, fsdkaggle, virufy
Window Length: 1
############################################################
(30151, 6)
Counter({'coughvid': 751, 'coswara': 733, 'fsdkaggle': 278, 'esc50': 226, 'virufy': 12})
Counter({1: 1000, 0: 1000})
Train: 1400
Val  : 300
Test : 300
Epoch 1/1, Batch 20/44, Train Loss: 0.6431, Run-time: 105.247s



KeyboardInterrupt



In [None]:
# Gather the per-run summary CSVs for every window length and dataset group
# into a single table, then save that table next to the run outputs.
summary_frames = []   # one DataFrame per summary file that was found
missing_paths = []    # expected summary files that do not exist

for window_length in [0.1, 0.2, 0.3, 0.5, 0.7, 1]:
    for datasets_name in list_datasets:
        # sorted() avoids mutating list_datasets while producing the same
        # directory name that the training cell uses.
        dataset_str = '_'.join(sorted(datasets_name))

        summary_path = f'Results_Onset/Model_Whisper_Onset/{dataset_str}/results_summary_{window_length}s.csv'
        # A window length that has not been trained has no summary file;
        # record it and move on instead of aborting the whole merge.
        if not os.path.exists(summary_path):
            missing_paths.append(summary_path)
            continue
        summary_frames.append(pd.read_csv(summary_path))

if missing_paths:
    print('Skipped missing summary files:')
    for missing_path in missing_paths:
        print('  ', missing_path)

# Concatenate once rather than growing a frame inside the loop.
combined_df = pd.concat(summary_frames, ignore_index=True) if summary_frames else pd.DataFrame()

# Display or save the result
print(combined_df)
if summary_frames:
    # Saved in the directory of the last dataset group iterated above.
    # index=False matches the other CSVs this notebook writes and avoids an
    # extra unnamed index column when the file is read back.
    combined_df.to_csv(f'Results_Onset/Model_Whisper_Onset/{dataset_str}/results_summary_All.csv', index=False)