해당 코드는 Kaggle Notebook 환경에서 진행하였습니다.

라이브러리 버전
- kaggle 1.5.12
- numpy 1.21.6
- pandas 1.3.5
- sklearn 0.0 (PyPI의 `sklearn` 래퍼 패키지 버전이며, 실제 scikit-learn 버전과는 다릅니다)

노이즈 필터, Audio Augmentation, 여러 가지 feature extraction 기법과
음성 처리에 많이 사용되는 CNN, LSTM, LogisticRegression, Soft Voting 등 다양한 모델을 시도해 보았습니다.

그중 MFCC Feature + MLP 모델의 성능이 좋았고,
Unlabeled 데이터를 활용하여 Semi-supervised Learning을 진행하였습니다.

In [1]:
%pip install audiomentations

Collecting audiomentations
  Downloading audiomentations-0.27.0-py3-none-any.whl (64 kB)
     ---------------------------------------- 64.8/64.8 kB 1.8 MB/s eta 0:00:00
Installing collected packages: audiomentations
Successfully installed audiomentations-0.27.0
Note: you may need to restart the kernel to use updated packages.


In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import os
import random
from scipy import signal
from collections import defaultdict
from tqdm import tqdm
import IPython
from collections import Counter

from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, scale, LabelEncoder, StandardScaler, MinMaxScaler, Normalizer, minmax_scale
from sklearn.metrics import log_loss, f1_score, roc_curve, accuracy_score, roc_auc_score, precision_recall_curve, plot_roc_curve, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn import linear_model
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn import svm

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, MaxPooling1D, BatchNormalization, Flatten, LSTM
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Trim, Gain, PolarityInversion, SpecCompose, SpecChannelShuffle, SpecFrequencyMask

from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading

import warnings
warnings.filterwarnings(action='ignore')

In [176]:
#pip list

In [3]:
# Ask TensorFlow to grow GPU memory on demand instead of failing silently:
# the original call went through a misspelled attribute and the bare `except`
# hid the resulting AttributeError, so memory growth was never enabled.
physical_devices = tf.config.list_physical_devices('GPU')
for device in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(device, True)
    except (RuntimeError, ValueError) as err:
        # Best effort: the setting cannot be changed once the device is initialized.
        print(f'Could not enable memory growth for {device}: {err}')

In [4]:
# Load the metadata tables for the labeled, test and unlabeled recordings.
# NOTE(review): the unlabeled CSV path has no 'open/' directory, unlike the
# other two paths — confirm this matches the dataset layout.
train_df = pd.read_csv('../input/covid19/open/train_data.csv')
test_df = pd.read_csv('../input/covid19/open/test_data.csv')
unlabeled_df = pd.read_csv('../input/covid19/unlabeled_data.csv')

In [5]:
# Preview the first rows of the training metadata.
train_df.head()

In [6]:
# (rows, columns) of the training metadata.
train_df.shape

In [7]:
# Preview the first rows of the test metadata.
test_df.head()

In [8]:
# (rows, columns) of the test metadata.
test_df.shape

In [9]:
# Preview the first rows of the unlabeled metadata.
unlabeled_df.head()

In [10]:
# (rows, columns) of the unlabeled metadata.
unlabeled_df.shape

In [11]:
# Shared configuration for audio loading, feature extraction and seeding.
CFG = {
    'SR' : 16000,  # sampling rate passed to librosa.load
    'N_MFCC' : 15,  # number of MFCC coefficients kept per recording
    'SEED' : 41,  # seed passed to seed_everything and reused as random_state
    'N_MELS' : 128  # number of mel bands used for the mel spectrogram
}

In [12]:
def seed_everything(seed):
    """Seed the Python and NumPy global random number generators.

    Also stores the seed, as a string, in the ``PYTHONHASHSEED`` environment
    variable.
    """
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG['SEED'])  # fix the seed

In [13]:
# Short alias for the configured seed; used as random_state for the classifiers.
seed = CFG['SEED']

In [14]:
# Load one training recording and plot its waveform.
wav_path = '../input/covid19/open/train/00003.wav'
file_dir, file_id = os.path.split(wav_path)
y, sr = librosa.load(wav_path, sr=CFG['SR'])

# Sample i is taken at i / sr seconds. The previous np.linspace call included
# the endpoint, which stretched the axis so the last sample sat at len(y) / sr.
time = np.arange(len(y)) / sr
fig, ax = plt.subplots()
ax.plot(time, y, color='g')
ax.set_xlabel('Time')
ax.set_ylabel('Voice')
plt.title(file_id)
plt.show()

In [15]:
# Embed an audio player for the loaded waveform.
IPython.display.Audio(data = y, rate = sr)

In [16]:
def get_pre_features(df, wav, save_path):
    """Extract time-averaged MFCC features per recording and cache them as CSV.

    For every value in ``df.id`` the file ``<data root>/<wav>/<id>.wav`` (id
    zero-padded to five digits) is loaded at ``CFG['SR']``, turned into a
    log-mel spectrogram with ``CFG['N_MELS']`` bands and reduced to
    ``CFG['N_MFCC']`` MFCCs, each averaged over all frames.

    Args:
        df: Metadata frame with an ``id`` column.
        wav: Name of the sub-directory of the data root holding the wav files.
        save_path: CSV file used as a cache for the resulting frame.

    Returns:
        ``df`` with the columns ``mfcc_mean1`` .. ``mfcc_mean<N_MFCC>``
        appended. If ``save_path`` already exists, the cached CSV is loaded
        and returned instead of recomputing (previously this path returned
        ``None``).
    """
    zip_path = '../input/covid19/open'

    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return pd.read_csv(save_path)

    wav_dir = os.path.join(zip_path, wav)
    mfcc_means = []

    for uid in tqdm(df.id):
        wav_path = os.path.join(wav_dir, str(uid).zfill(5) + '.wav')

        y, sr = librosa.load(wav_path, sr=CFG['SR'])

        # `y` must be passed by keyword: newer librosa releases reject it
        # as a positional argument.
        mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=CFG['N_MELS'])
        log_S = librosa.power_to_db(mels, ref=np.max)

        # NOTE(review): the original also passed y, n_fft, win_length and
        # hop_length (25 ms / 10 ms framing, as floats) here. librosa ignores
        # them when a precomputed S is supplied, so they are dropped and the
        # features are unchanged; the mel spectrogram above keeps librosa's
        # default framing. Confirm whether the 25 ms framing was intended.
        mfcc = librosa.feature.mfcc(S=log_S, sr=sr, n_mfcc=CFG['N_MFCC'])

        # One mean per MFCC coefficient (row of `mfcc`).
        mfcc_means.append([np.mean(coef) for coef in mfcc])

    mfcc_mean_df = pd.DataFrame(
        mfcc_means,
        columns=['mfcc_mean' + str(i) for i in range(1, CFG['N_MFCC'] + 1)],
        index=df.index,  # keep rows aligned with df for the column-wise concat
    )
    df = pd.concat([df, mfcc_mean_df], axis=1)

    df.to_csv(save_path, index=False)

    return df

In [17]:
# Extract MFCC features for the labeled, test and unlabeled recordings and
# write each table to a CSV in the working directory.
train_pre_df = get_pre_features(train_df, 'train', './train_mfcc_df.csv')
test_pre_df = get_pre_features(test_df, 'test', './test_mfcc_df.csv')
unlabeled_pre_df = get_pre_features(unlabeled_df, 'unlabeled', './unlabeled_mfcc_df.csv')

In [121]:
#train_pre_df = pd.read_csv('../input/preprocess-df/train_mfcc_df.csv')
#test_pre_df = pd.read_csv('../input/preprocess-df/test_mfcc_df.csv')
#unlabeled_pre_df = pd.read_csv('../input/preprocess-df/unlabeled_mfcc_df.csv')

In [122]:
# Preview the training table with the MFCC feature columns.
train_pre_df.head()

In [123]:
# Preview the test table with the MFCC feature columns.
test_pre_df.head()

In [135]:
# Work on copies so the extracted feature tables stay untouched by the
# encoding steps that follow.
train_pre_df_copy = train_pre_df.copy()
test_pre_df_copy = test_pre_df.copy()
unlabeled_pre_df_copy = unlabeled_pre_df.copy()

In [138]:
# Shapes of the three working copies.
train_pre_df_copy.shape, test_pre_df_copy.shape, unlabeled_pre_df_copy.shape

In [139]:
def onehot_encoder_(encoder, x, col_list):
    """Replace each column in ``col_list`` with its one-hot encoded columns.

    Args:
        encoder: Object with ``fit``/``transform`` methods and a
            ``categories_`` attribute; ``transform`` must return a dense 2-D
            array. It is refit on every column.
        x: Input DataFrame. It is not modified.
        col_list: Names of the columns to encode.

    Returns:
        A new DataFrame without the encoded columns and with one column per
        category appended, named after the category value.
    """
    for col in col_list:
        values = x[col].values.reshape(-1, 1)
        encoder.fit(values)
        encoded_df = pd.DataFrame(
            encoder.transform(values),
            columns=encoder.categories_[0],
            # Reuse x's index: with a fresh RangeIndex the concat below would
            # misalign rows and introduce NaNs whenever x is not 0..n-1 indexed.
            index=x.index,
        )
        x = pd.concat([x.drop(columns=[col]), encoded_df], axis=1)
    return x

def label_encoder_(encoder, x, col_list):
    """Replace each column in ``col_list`` with its encoded labels.

    Args:
        encoder: Object with ``fit``/``transform`` methods that take a 1-D
            array of labels. It is refit on every column.
        x: Input DataFrame. It is modified in place.
        col_list: Names of the columns to encode.

    Returns:
        The same DataFrame object ``x`` with the listed columns overwritten.
    """
    for col in col_list:
        # Pass the column as 1-D; the original reshaped it to (n, 1), which
        # is not the input shape a label encoder expects.
        values = x[col].values
        encoder.fit(values)
        x[col] = encoder.transform(values)
    return x

In [140]:
# One-hot encode 'gender' in all three tables.
col_list = ['gender']

# The helper refits the encoder on every table, so pin the categories to the
# ones seen in the training data. Every table then gets the same columns, and
# values unseen in training become all-zero rows instead of extra columns.
# The pinned list holds a single entry because only one column is encoded.
encoder_kwargs = {
    'categories': [sorted(train_pre_df_copy[col_list[0]].unique())],
    'handle_unknown': 'ignore',
}
try:
    encoder = OneHotEncoder(sparse_output=False, **encoder_kwargs)
except TypeError:
    # Older scikit-learn releases only know the `sparse` keyword.
    encoder = OneHotEncoder(sparse=False, **encoder_kwargs)

train_pre_df_copy = onehot_encoder_(encoder, train_pre_df_copy, col_list)
test_pre_df_copy = onehot_encoder_(encoder, test_pre_df_copy, col_list)
unlabeled_pre_df_copy = onehot_encoder_(encoder, unlabeled_pre_df_copy, col_list)

In [141]:
# Column names, dtypes and non-null counts after encoding.
train_pre_df_copy.info()

In [142]:
# Features are every column except the id and the target; 'covid19' is the target.
train_x = train_pre_df_copy.drop(columns=['id', 'covid19'])
train_y = train_pre_df_copy['covid19']

In [143]:
# Select the same feature columns, in the same order, as the training features.
test_x = test_pre_df_copy[train_x.columns]
# A 'covid19' column is assigned into unlabeled_x later on, so take an
# explicit copy rather than a selection derived from unlabeled_pre_df_copy.
unlabeled_x = unlabeled_pre_df_copy[train_x.columns].copy()

In [149]:
# Train on the labeled data and pseudo-label the unlabeled recordings.
model = MLPClassifier(activation='relu', solver='adam', random_state=seed)
model.fit(train_x, train_y)
prob = model.predict_proba(unlabeled_x)
# predict_proba returns one column per class, ordered like model.classes_.
# Threshold only the first column (assumed to be class 0 — confirm via
# model.classes_): a sample is labeled 1 when P(class 0) falls below 0.487.
# The original thresholded the whole (n, 2) matrix, which yields a 2-D array
# that cannot be reliably stored in a single DataFrame column.
semiprediction = np.where(prob[:, 0] < 0.487, 1, 0)

In [150]:
# Create the target column with a placeholder value, then overwrite it with
# the pseudo-labels.
unlabeled_x['covid19'] = -1
unlabeled_x['covid19'] = semiprediction

In [151]:
# Class balance of the pseudo-labels.
unlabeled_x.covid19.value_counts()

In [152]:
# Split the pseudo-labels back out so unlabeled_x holds only feature columns.
unlabeled_y = unlabeled_x['covid19']
unlabeled_x = unlabeled_x.drop(['covid19'], axis = 1)

In [153]:
# Stack the labeled rows and the pseudo-labeled rows into one training set.
all_x = pd.concat([train_x, unlabeled_x], axis=0)
all_y = pd.concat([train_y, unlabeled_y], axis=0)

In [171]:
# Retrain on labeled + pseudo-labeled data and predict the test set.
model = MLPClassifier(activation='relu', solver='adam', random_state=seed)
model.fit(all_x, all_y)
prob = model.predict_proba(test_x)
# predict_proba returns one column per class, ordered like model.classes_.
# Threshold only the first column (assumed to be class 0 — confirm via
# model.classes_): a sample is predicted 1 when P(class 0) falls below 0.663.
# The original thresholded the whole (n, 2) matrix, which yields a 2-D array
# that cannot be reliably stored in a single DataFrame column.
prediction = np.where(prob[:, 0] < 0.663, 1, 0)

In [172]:
# Fill the sample submission's 'covid19' column with the test predictions and
# write it to the working directory without the index.
submission = pd.read_csv('../input/covid19/open/sample_submission.csv')
submission['covid19'] = prediction
submission.to_csv('./submission.csv', index=False)

In [173]:
# Class balance of the submitted predictions.
submission.covid19.value_counts()

In [174]:
# Display the final submission table.
submission