# 코드

## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import librosa
import imblearn
import xgboost
import catboost

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from pycaret.classification import *

import warnings
warnings.filterwarnings(action='ignore') 

## Hyperparameter Setting
- SR값과 N_MFCC값을 Librosa 패키지에서 사용하는 기본값으로 설정했습니다.
- SEED 값은 123입니다.

In [2]:
# Global settings shared by the feature-extraction and training cells.
CFG = {
    'SR':22050,  # sampling rate handed to librosa.load
    'N_MFCC':20, # number of MFCC coefficients to extract
    'SEED':123  # seed used for seed_everything and as pycaret's session_id
}

## Fixed Random-Seed

In [3]:
def seed_everything(seed):
    """Seed Python's and NumPy's global random generators with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to ``random`` and ``numpy.random``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both global generators take the same seed value.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(CFG['SEED']) # Fix the random seed

## Data Pre-Processing 1

In [4]:
# Load the raw train/test tables; get_mfcc_feature reads their 'id' column
# to locate the matching wav files.
train_df = pd.read_csv('./train_data.csv')
test_df = pd.read_csv('./test_data.csv')

In [5]:
def get_mfcc_feature(df, data_type, save_path, root_folder='./'):
    """Extract mean-MFCC features for every recording in ``df`` and cache them as CSV.

    For each value in ``df['id']`` the file
    ``<root_folder>/<data_type>/<id zero-padded to 5 digits>.wav`` is loaded
    with librosa at ``CFG['SR']``, ``CFG['N_MFCC']`` MFCC coefficients are
    computed, and the mean of each coefficient row is kept as one feature.
    The resulting ``mfcc_1 .. mfcc_N`` columns are appended to the columns of
    ``df`` and the combined table is written to ``save_path``.

    Args:
        df: DataFrame with an ``id`` column identifying the wav files.
        data_type: Sub-folder of ``root_folder`` that holds the wav files
            (this notebook passes ``'train'`` and ``'test'``).
        save_path: Destination CSV. If it already exists nothing is
            recomputed and the function returns immediately.
        root_folder: Directory containing the ``data_type`` folders.

    Returns:
        None. The result is only written to ``save_path``.
    """
    # Cached result: skip the (slow) audio processing entirely.
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    # The wav folder is the same for every row, so resolve it once.
    wav_dir = os.path.join(root_folder, data_type)

    features = []
    for uid in tqdm(df['id']):
        wav_path = os.path.join(wav_dir, str(uid).zfill(5) + '.wav')

        # Load the wav file with librosa.
        y, sr = librosa.load(wav_path, sr=CFG['SR'])

        # Extract the MFCCs with librosa.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each MFCC row as the feature for that coefficient.
        features.append([np.mean(coeff) for coeff in mfcc])

    # Append the extracted audio features to the self-reported information.
    columns = ['mfcc_' + str(i) for i in range(1, CFG['N_MFCC'] + 1)]
    mfcc_df = pd.DataFrame(features, columns=columns)
    # concat(axis=1) aligns on index labels, while mfcc_df is numbered
    # 0..n-1 in iteration order. Resetting df's index keeps the rows paired
    # positionally even when the caller passes a filtered/re-indexed frame.
    merged = pd.concat([df.reset_index(drop=True), mfcc_df], axis=1)
    merged.to_csv(save_path, index=False)
    print('Done.')

In [6]:
# Build the MFCC feature CSVs for the train and test sets; a file that
# already exists is left untouched (see the early return in get_mfcc_feature).
get_mfcc_feature(train_df, 'train', './train_mfcc_data20_123.csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_data20_123.csv')

./train_mfcc_data20_123.csv is exist.
./test_mfcc_data20_123.csv is exist.


## Data Pre-Processing 2

In [7]:
# Load the training data that combines the MFCC features of the wav files
# with the status information.
train_df = pd.read_csv('./train_mfcc_data20_123.csv')

# Split the training data into x (model input) and y (label).
train_x = train_df.drop(columns=['id', 'covid19'])
train_y = train_df['covid19']

In [8]:
def onehot_encoding(ohe, x):
    """Replace the ``gender`` column of ``x`` with its one-hot encoded columns.

    Args:
        ohe: An already-fitted one-hot encoder exposing ``transform`` and
            ``categories_`` (in this notebook it is fitted on the training
            ``gender`` column only).
        x: DataFrame containing a ``gender`` column.

    Returns:
        A new DataFrame: ``x`` without ``gender``, followed by one column per
        entry of ``ohe.categories_[0]``, carrying the same row index as ``x``.
    """
    encoded = ohe.transform(x['gender'].values.reshape(-1, 1))
    # An encoder configured for sparse output returns a sparse matrix;
    # densify it so the DataFrame gets one numeric column per category.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    # Reuse x's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows (and add NaN rows) whenever x has
    # been filtered, shuffled or otherwise re-indexed.
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)

In [9]:
# The 'gender' column needs additional preprocessing -> apply OneHotEncoder.
# NOTE(review): the `sparse=` keyword was renamed `sparse_output` in newer
# scikit-learn releases - confirm against the installed version.
ohe = OneHotEncoder(sparse=False)
# Fit on the training column only, then encode the training frame with it.
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

## Pycaret
- Pycaret으로 학습하기 위해 target값인 train_y를 train_x에 다시 합쳤습니다.

In [10]:
# Re-attach the label so features and target sit in one frame, which is
# what the setup() call below receives via data=/target=.
train_x["covid19"] = train_y
train_x.shape

(3805, 27)

## Train
- Categorical 변수 'respiratory_condition', 'fever_or_muscle_pain', 'female', 'male', 'other'를 설정하였습니다.
- Normalize는 minmax로 진행하였습니다.
- random state와 같은 역할을 하는 session_id 값은 위에서 설정한 123으로 동일하게 설정했습니다.
- 불균형 데이터이기 때문에 stratifiedkfold로 진행하였습니다.

In [11]:
# Initialise the pycaret experiment: 80% stratified, shuffled train split,
# stratified k-fold CV, min-max normalisation, and the binary/one-hot
# columns declared as categorical. session_id reuses the notebook seed.
model = setup(data=train_x, target='covid19', session_id=CFG['SEED'], train_size=0.8,
              data_split_stratify=True, data_split_shuffle=True, fold_strategy='stratifiedkfold',
              normalize=True, normalize_method='minmax',
              categorical_features=['respiratory_condition', 'fever_or_muscle_pain', 'female', 'male', 'other'])

Unnamed: 0,Description,Value
0,session_id,123
1,Target,covid19
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(3805, 27)"
5,Missing Values,False
6,Numeric Features,21
7,Categorical Features,5
8,Ordinal Features,False
9,High Cardinality Features,False


## Model
- auc를 기준으로 상위 2개 모델을 뽑았습니다.

In [12]:
# Rank the candidate models by AUC with cross-validation and keep the top two.
best = compare_models(sort = 'auc', n_select=2, cross_validation=True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.865,0.7007,0.3342,0.2519,0.2857,0.2134,0.2166,0.004
lr,Logistic Regression,0.9175,0.6998,0.0122,0.1667,0.0222,0.0153,0.0314,0.473
lda,Linear Discriminant Analysis,0.9139,0.6983,0.179,0.4332,0.2507,0.213,0.238,0.006
et,Extra Trees Classifier,0.9192,0.6765,0.0285,0.4167,0.0527,0.0442,0.0931,0.101
catboost,CatBoost Classifier,0.9189,0.6683,0.0243,0.3667,0.0451,0.0371,0.0789,2.391
rf,Random Forest Classifier,0.9195,0.6677,0.0162,0.3,0.0305,0.0259,0.0613,0.182
gbc,Gradient Boosting Classifier,0.9166,0.6629,0.0653,0.4006,0.1101,0.0898,0.1326,0.409
qda,Quadratic Discriminant Analysis,0.7347,0.6449,0.3697,0.2169,0.2108,0.1284,0.1404,0.01
xgboost,Extreme Gradient Boosting,0.9182,0.6388,0.0612,0.3317,0.1029,0.0862,0.1201,0.488
ada,Ada Boost Classifier,0.9103,0.636,0.0447,0.2743,0.0749,0.0495,0.0759,0.114


## Soft voting
- Pycaret의 blend_models는 기본 설정(method='auto')에서 soft voting으로 앙상블하며, soft voting을 지원하지 않는 모델이 포함된 경우에는 hard voting으로 대체됩니다.

In [13]:
# Combine the two selected models into one voting ensemble; 'auc' is the
# metric passed as `optimize`.
blender = blend_models(estimator_list=best, optimize='auc', verbose=True)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8852,0.7129,0.24,0.2727,0.2553,0.1934,0.1939
1,0.8885,0.6796,0.28,0.3043,0.2917,0.2313,0.2315
2,0.8984,0.6539,0.24,0.3333,0.2791,0.226,0.2295
3,0.9016,0.7916,0.36,0.3913,0.375,0.3217,0.322
4,0.8882,0.6699,0.125,0.1875,0.15,0.0927,0.0949
5,0.8816,0.6612,0.1667,0.2,0.1818,0.1186,0.1191
6,0.9013,0.6115,0.2917,0.35,0.3182,0.2655,0.2667
7,0.9112,0.7464,0.2917,0.4118,0.3415,0.2953,0.3004
8,0.875,0.7281,0.25,0.2308,0.24,0.172,0.1722
9,0.8882,0.7216,0.36,0.3333,0.3462,0.2851,0.2854


In [14]:
# Display the ensemble object as the cell output.
blender

VotingClassifier(estimators=[('nb',
                              GaussianNB(priors=None, var_smoothing=1e-09)),
                             ('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=123,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0,
                                                 warm_start=False))],
                 flatten_transform=True, n_jobs=-1, verbose=False,
                 voting='soft', weights=None)

## Train
- 전체 학습 데이터에 대해서 학습해줍니다.

In [15]:
# Produce the final model from the blended ensemble (per the note above,
# this trains on the entire training data).
final_model = finalize_model(blender)

## Inference

In [16]:
# Apply to the test data the same preprocessing that was applied to the
# training data above.
test_x = pd.read_csv('./test_mfcc_data20_123.csv')
test_x = test_x.drop(columns=['id'])
# To avoid data leakage, use the ohe that was fitted on the training data only.
test_x = onehot_encoding(ohe, test_x)

# Model inference
preds = predict_model(final_model, data=test_x)

## Submission

In [17]:
# Fill the sample submission's 'covid19' column with the predictions and save it.
submission = pd.read_csv('./sample_submission.csv')
# NOTE(review): 'Label' is presumably the predicted-class column added by
# predict_model - confirm against the installed pycaret version.
submission['covid19'] = preds["Label"]
submission.to_csv('./submit_alchmach.csv', index=False)