In [1]:
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras

In [2]:
# Read the CSV file into a pandas DataFrame
train = pd.read_csv(
    'train_SMOTE.csv',
    sep=',',
    low_memory=False,
)

# Preview the first rows to confirm the load
train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,...,0,0,0,0,0,0,2,0,0,5
1,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,5
2,2,0,0,0,0,0,1,0,3,0,...,0,0,0,0,1,0,0,0,0,1
3,3,0,0,7,0,1,5,2,2,0,...,0,4,0,2,2,0,4,3,0,7
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Cast the target column of train to the pandas categorical dtype
train['target'] = train['target'].astype('category')

# List the column dtypes to confirm the cast
train.dtypes

id               int64
feature_0        int64
feature_1        int64
feature_2        int64
feature_3        int64
                ...   
feature_71       int64
feature_72       int64
feature_73       int64
feature_74       int64
target        category
Length: 77, dtype: object

In [4]:
# Label-encode the target column (LabelEncoder)
le = LabelEncoder()
encoded = le.fit_transform(train['target'].values)
# Round-trip the codes back to the original labels
decoded = le.inverse_transform(encoded)
# Overwrite the target column with the integer codes
train['target'] = encoded

# Preview the first encoded values to confirm
train['target'].head()

0    5
1    5
2    1
3    7
4    1
Name: target, dtype: int64

In [5]:
# Separate the training frame into a feature matrix (every column except
# 'target' and 'id') and a label vector (the 'target' column)
X = train.drop(columns=['target', 'id']).values
y = train['target'].values

# Display both arrays to confirm
X, y

(array([[0, 0, 6, ..., 2, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 1],
        [1, 0, 0, ..., 1, 1, 0],
        [0, 1, 2, ..., 0, 0, 0]]),
 array([5, 5, 1, ..., 8, 8, 8]))

In [6]:
# Build and compile the classifier network
def set_model(input_num, num_classes=9, hidden_units=80):
    """Build and compile a one-hidden-layer softmax classifier.

    Parameters
    ----------
    input_num : int
        Number of input features per sample.
    num_classes : int, optional
        Width of the softmax output layer. Defaults to 9, the value that
        was previously hard-coded.
    hidden_units : int, optional
        Width of the hidden ReLU layer. Defaults to 80, the value that was
        previously hard-coded.

    Returns
    -------
    keras.Sequential
        Model compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and the accuracy metric. The sparse loss means
        labels are passed as integer class ids rather than one-hot rows.
    """
    # Build the model
    model = keras.Sequential([
        # Declare the input with an explicit Input object rather than an
        # `input_shape=` keyword on the first layer, which Keras 3 deprecates.
        keras.Input(shape=(input_num,)),
        # A no-op for (batch, input_num) input; kept so the layer stack
        # is the same as before.
        keras.layers.Flatten(),
        keras.layers.Dense(hidden_units, activation='relu'),
        keras.layers.Dense(num_classes, activation='softmax'),
    ])
    # Compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [7]:
valid_scores = []  # per-fold validation scores
cv_seed = 42       # shared by the fold splitter and the per-fold RNG seeding
kf = KFold(n_splits=5, shuffle=True, random_state=cv_seed)  # how the data is split

# k-fold cross-validation
for fold, (train_indices, valid_indices) in enumerate(kf.split(X)):
    X_train, X_valid = X[train_indices], X[valid_indices]
    y_train, y_valid = y[train_indices], y[valid_indices]

    # Drop the Keras global state left by the previous fold's model so the
    # models built in this loop do not pile up in memory.
    keras.backend.clear_session()

    # Seed the Python, NumPy and TensorFlow RNGs so that re-running the cell
    # gives repeatable weight initialisation and batch order (some GPU
    # kernels may still be non-deterministic). A distinct seed per fold
    # keeps the folds from sharing one initialisation.
    random.seed(cv_seed + fold)
    np.random.seed(cv_seed + fold)
    tf.random.set_seed(cv_seed + fold)

    # Build a fresh model for this fold
    model = set_model(X_train.shape[1])

    # Train; the validation fold is only monitored here, not trained on
    model.fit(X_train, y_train,
              validation_data=(X_valid, y_valid),
              epochs=10,
              batch_size=512,
              verbose=0)

    # Predict on the validation fold and take the most probable class per
    # row; verbose=0 keeps progress bars out of the cell output.
    y_valid_pred = np.argmax(model.predict(X_valid, verbose=0), axis=1)

    # Mean absolute error between true and predicted class ids.
    # NOTE(review): MAE treats the integer class ids as ordinal — confirm
    # this is the intended metric rather than accuracy or log loss.
    score = mean_absolute_error(y_valid, y_valid_pred)
    print(f'fold {fold} MAE: {score}')

    # Keep the fold score
    valid_scores.append(score)

cv_score = np.mean(valid_scores)
print(f'CV score: {cv_score}')

fold 0 MAE: 2.164668668239331
fold 1 MAE: 2.230516834655801
fold 2 MAE: 2.215226249195797
fold 3 MAE: 2.19839159339481
fold 4 MAE: 2.1802292540130175
CV score: 2.197806519899751
