### __Tabular Model__

- スペクトログラムを作成

    https://mne.tools/stable/auto_tutorials/clinical/60_sleep.html#feature-engineering

In [1]:
from utils import *

In [2]:
# データの読み込み
train_epochs = load_epoch("train")
test_epochs = load_epoch("test")

In [3]:
def epoch_to_sub_df(epoch_df, id, is_train):
    cols = ["id", "meas_time"]
    
    # 訓練データにはアノテーション追加
    if is_train:
        cols.append("condition")
    
    label_df = epoch_df.loc[epoch_df.groupby("epoch")["time"].idxmin()].reset_index(drop=True)
    label_df["id"] = id
    
    return label_df[cols]

In [4]:
def epoch_to_df(epoch):
    df = epoch.to_data_frame(verbose=False)
    new_meas_date = epoch.info["meas_date"].replace(tzinfo=None) + datetime.timedelta(seconds=epoch.info["temp"]["truncate_start_point"])
    df["meas_time"] = pd.date_range(start=new_meas_date, periods=len(df), freq=pd.Timedelta(1 / 100, unit="s"))
    return df

In [5]:
def power_band(epochs):
    channels = ["EEG Fpz-Cz", "EEG Pz-Oz", "EOG horizontal"]
    spectrum = epochs.compute_psd(picks=channels, verbose=False)
    psds, freqs = spectrum.get_data(return_freqs=True)
    psds /= np.sum(psds, axis=-1, keepdims=True) # 正規化

    X = []
    for i in range(len(channels)):
        for j in range(int(freqs[-1])):
            psds_band_mean = psds[:, i, (freqs >= j) & (freqs < j+1)].mean(axis=-1)
            X.append(psds_band_mean.reshape(len(psds), -1))

            psds_band_max = psds[:, i, (freqs >= j) & (freqs < j+1)].max(axis=-1)
            X.append(psds_band_max.reshape(len(psds), -1))

            psds_band_mean = psds[:, i, (freqs >= j) & (freqs < j+1)].max(axis=-1)
            X.append(psds_band_mean.reshape(len(psds), -1))

    return np.concatenate(X, axis=1)

In [6]:
train_df = []
for epoch in tqdm(train_epochs):
    # 波形をdataframe化
    epoch_df = epoch_to_df(epoch)
    # submit形式のデータフレーム生成
    sub_df = epoch_to_sub_df(epoch_df, epoch.info["temp"]["id"], is_train=True)
    
    # パワースペクトル密度計算
    feature_df = pd.DataFrame(power_band(epoch))
    
    _df = pd.concat([sub_df, feature_df], axis=1)
    # 必要ないラベルがある場合は除外する
    _df = _df[~_df["condition"].isin(["Sleep stage ?", "Movement time"])]
    
    train_df.append(_df)

train_df = pd.concat(train_df).reset_index(drop=True)

  0%|          | 0/108 [00:00<?, ?it/s]

In [7]:
# ラベルIDに変換
train_df["condition"] = train_df["condition"].map(LABEL2ID)
train_df

Unnamed: 0,id,meas_time,condition,0,1,2,3,4,5,6,...,440,441,442,443,444,445,446,447,448,449
0,3c1c5cf,1989-11-13 21:35:00,4,0.004761,0.010244,0.010244,0.000867,0.001664,0.001664,0.000506,...,0.000525,0.000427,0.000694,0.000694,0.000446,0.000703,0.000703,0.000371,0.000689,0.000689
1,3c1c5cf,1989-11-13 21:35:30,4,0.001422,0.002695,0.002695,0.001126,0.001987,0.001987,0.000654,...,0.000682,0.000506,0.000748,0.000748,0.000538,0.000782,0.000782,0.000477,0.000894,0.000894
2,3c1c5cf,1989-11-13 21:36:00,4,0.004680,0.011646,0.011646,0.000512,0.000742,0.000742,0.000452,...,0.000705,0.000511,0.000923,0.000923,0.000463,0.000752,0.000752,0.000440,0.000686,0.000686
3,3c1c5cf,1989-11-13 21:36:30,4,0.006864,0.019896,0.019896,0.001587,0.002832,0.002832,0.000730,...,0.000319,0.000136,0.000235,0.000235,0.000162,0.000201,0.000201,0.000139,0.000244,0.000244
4,3c1c5cf,1989-11-13 21:37:00,4,0.001551,0.003323,0.003323,0.001227,0.002038,0.002038,0.000711,...,0.000409,0.000343,0.000575,0.000575,0.000409,0.000580,0.000580,0.000476,0.000893,0.000893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161605,5edb9d9,1990-03-14 08:17:30,2,0.023262,0.078405,0.078405,0.003707,0.008421,0.008421,0.001718,...,0.000015,0.000009,0.000016,0.000016,0.000013,0.000018,0.000018,0.000013,0.000031,0.000031
161606,5edb9d9,1990-03-14 08:18:00,2,0.024296,0.080117,0.080117,0.002452,0.005165,0.005165,0.001169,...,0.000024,0.000010,0.000014,0.000014,0.000010,0.000016,0.000016,0.000012,0.000025,0.000025
161607,5edb9d9,1990-03-14 08:18:30,1,0.011938,0.029251,0.029251,0.004621,0.008187,0.008187,0.004205,...,0.000054,0.000030,0.000042,0.000042,0.000033,0.000048,0.000048,0.000017,0.000030,0.000030
161608,5edb9d9,1990-03-14 08:19:00,1,0.016051,0.041464,0.041464,0.003514,0.005143,0.005143,0.003764,...,0.000044,0.000029,0.000052,0.000052,0.000024,0.000038,0.000038,0.000028,0.000042,0.000042


In [8]:
test_df = []
for epoch in tqdm(test_epochs):
    # 波形をdataframe化
    epoch_df = epoch_to_df(epoch)
    # submit形式のデータフレーム生成
    sub_df = epoch_to_sub_df(epoch_df, epoch.info["temp"]["id"], is_train=False)

    # パワースペクトル密度計算
    feature_df = pd.DataFrame(power_band(epoch))
    
    _df = pd.concat([sub_df, feature_df], axis=1)
    
    test_df.append(pd.concat([sub_df, feature_df], axis=1))
    
test_df = pd.concat(test_df)

  0%|          | 0/45 [00:00<?, ?it/s]

In [9]:
epochs = train_epochs[0]
epochs = epochs.to_data_frame(verbose=False)
epochs

Unnamed: 0,time,condition,epoch,EEG Fpz-Cz,EEG Pz-Oz,EOG horizontal,Resp oro-nasal,EMG submental,Temp rectal,Event marker
0,0.00,Sleep stage W,0,-80.295971,0.368254,-42.711355,3.100000e+07,3.314000,1.427192e+07,9.530000e+08
1,0.01,Sleep stage W,0,-32.252991,7.074481,-30.367033,3.092531e+07,3.313483,1.427161e+07,9.526826e+08
2,0.02,Sleep stage W,0,-88.165079,6.782906,-54.561905,3.084926e+07,3.312963,1.427131e+07,9.523656e+08
3,0.03,Sleep stage W,0,15.893529,12.517216,-14.566300,3.077191e+07,3.312441,1.427100e+07,9.520494e+08
4,0.04,Sleep stage W,0,23.555556,15.724542,-29.873260,3.069332e+07,3.311918,1.427068e+07,9.517339e+08
...,...,...,...,...,...,...,...,...,...,...
4289995,29.95,Sleep stage 1,1429,7.092552,1.145788,3.209524,3.298500e+07,3.336563,1.424834e+07,9.379142e+08
4289996,29.96,Sleep stage 1,1429,0.155311,2.506471,7.653480,3.239297e+07,3.336464,1.424828e+07,9.377232e+08
4289997,29.97,Sleep stage 1,1429,6.574847,5.616606,2.715751,3.179811e+07,3.336358,1.424822e+07,9.375361e+08
4289998,29.98,Sleep stage 1,1429,-6.678388,9.990232,14.566300,3.120076e+07,3.336245,1.424818e+07,9.373531e+08


### __学習__

- idでGroupKFold

    - ただしscikit-learnの`GroupKFold`は`random_state`が設定できないので`KFold`で実装

    - train: 80％

    - valid: 20%

- モデル: ランダムフォレスト

In [10]:
def custom_accuracy(y_pred, data):
    y_true = data.get_label()
    y_pred = y_pred.reshape(5, -1).argmax(axis=0)
    accuracy = accuracy_score(y_true, y_pred)
    return 'accuracy', accuracy, True

In [11]:
X = train_df.drop(columns=["id", "meas_time", "condition"])
y = train_df["condition"]
X_test = test_df.drop(columns=["id", "meas_time"])

SEED = 42
N_SPLITS = 10

params = {
    "boosting": "gbdt", # "gbdt"
    "objective": "multiclass", # 2値分類
    "seed": SEED,
    "num_leaves": 63,
    "learning_rate": 0.01,  # 学習率
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    'num_class': 5, # クラス数
}

cv_scores, results, models = [], [], []
test_prediction =  pd.DataFrame()
kf = KFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)
unique_ids = train_df["id"].unique()
for fold, (train_id_index, valid_id_index) in tqdm(enumerate(kf.split(unique_ids)), total=N_SPLITS):
    train_index =  train_df["id"].isin(unique_ids[train_id_index])
    valid_index =  train_df["id"].isin(unique_ids[valid_id_index])

    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    result = {}
    model = lgb.train(
        params=params,
        train_set=lgb_train, # 学習データ
        valid_sets=[lgb_train, lgb_valid], # 検証データ
        valid_names=['train', 'valid'], # 学習経過で表示する名称
        num_boost_round=1000, # イテレーションの回数
        feval=custom_accuracy,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False), # アーリーストッピング
            lgb.callback.record_evaluation(result), # ログの記録
            lgb.log_evaluation(0), # ログの表示
        ],
    )
    results.append(result)
    models.append(model)        

    score = result["valid"]["accuracy"][-1]
    cv_scores.append(score)
    print(f'Fold: {fold+1}  Accuracy: {score}')

    # 推論
    test_prediction[str(fold)] = model.predict(X_test).argmax(axis=1)
print(f"CV Score {np.mean(cv_scores)}")

  0%|          | 0/10 [00:00<?, ?it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114750
[LightGBM] [Info] Number of data points in the train set: 144761, number of used features: 450
[LightGBM] [Info] Start training from score -2.958582
[LightGBM] [Info] Start training from score -1.238058
[LightGBM] [Info] Start training from score -2.451117
[LightGBM] [Info] Start training from score -2.191185
[LightGBM] [Info] Start training from score -0.776139
Fold: 1  Accuracy: 0.8589233782420321
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114750
[LightGBM] [Info] Number of data points in the train set: 144738, number of used features: 450
[LightGBM] [Info] Start training from score -2.883802
[LightGBM] [Info] Start training from score -1.249473
[LightGBM] [Info] Start training from score -2.460864
[LightGBM] [Info] Start training from score -2.191829
[LightGBM] [Info] Start training from score -0.775770
Fold: 2  Accuracy: 0.8600640113798008
You ca

### __提出ファイルの作成__

In [12]:
sample_submission_df = pd.read_csv(SUBMISSION_DIR / "sample_submission.csv", parse_dates=[1])
sample_submission_df

Unnamed: 0,id,meas_time,condition
0,53c1555,1989-11-20 23:19:30,Sleep stage W
1,53c1555,1989-11-20 23:20:00,Sleep stage W
2,53c1555,1989-11-20 23:20:30,Sleep stage W
3,53c1555,1989-11-20 23:21:00,Sleep stage W
4,53c1555,1989-11-20 23:21:30,Sleep stage W
...,...,...,...
52291,9b444bb,1989-04-12 07:32:30,Sleep stage W
52292,9b444bb,1989-04-12 07:33:00,Sleep stage W
52293,9b444bb,1989-04-12 07:33:30,Sleep stage W
52294,9b444bb,1989-04-12 07:34:00,Sleep stage W


In [13]:
sample_submission_df["condition"] = test_prediction.mode(axis=1).iloc[:, 0].astype(int) # アンサンブル
sample_submission_df["condition"] = sample_submission_df["condition"].map(ID2LABEL)
sample_submission_df.to_csv(SUBMISSION_DIR / "exp03.csv", index=False)
sample_submission_df

Unnamed: 0,id,meas_time,condition
0,53c1555,1989-11-20 23:19:30,Sleep stage W
1,53c1555,1989-11-20 23:20:00,Sleep stage W
2,53c1555,1989-11-20 23:20:30,Sleep stage W
3,53c1555,1989-11-20 23:21:00,Sleep stage W
4,53c1555,1989-11-20 23:21:30,Sleep stage W
...,...,...,...
52291,9b444bb,1989-04-12 07:32:30,Sleep stage W
52292,9b444bb,1989-04-12 07:33:00,Sleep stage W
52293,9b444bb,1989-04-12 07:33:30,Sleep stage W
52294,9b444bb,1989-04-12 07:34:00,Sleep stage W


In [14]:
sample_submission_df["condition"].value_counts()

Sleep stage 2      21662
Sleep stage W      16462
Sleep stage R       6291
Sleep stage 1       4303
Sleep stage 3/4     3578
Name: condition, dtype: int64