In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [2]:
# csvファイルからPandas DataFrameへ読み込み
train = pd.read_csv('train_SMOTE.csv', delimiter=',', low_memory=False)

# 冒頭を表示して確認
train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,...,0,0,0,0,0,0,2,0,0,5
1,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,5
2,2,0,0,0,0,0,1,0,3,0,...,0,0,0,0,1,0,0,0,0,1
3,3,0,0,7,0,1,5,2,2,0,...,0,4,0,2,2,0,4,3,0,7
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# trainのtargetをカテゴリーに変換
train.target = train.target.astype('category')

train.dtypes

id               int64
feature_0        int64
feature_1        int64
feature_2        int64
feature_3        int64
                ...   
feature_71       int64
feature_72       int64
feature_73       int64
feature_74       int64
target        category
Length: 77, dtype: object

In [4]:
# ラベルエンコーディング（LabelEncoder）
le = LabelEncoder()
encoded = le.fit_transform(train.target.values)
decoded = le.inverse_transform(encoded)
train.target = encoded

# 冒頭を表示して確認
train.target.head()

0    5
1    5
2    1
3    7
4    1
Name: target, dtype: int64

In [5]:
# データとラベルを分割する
X, y = train.drop(['target'], axis=1).drop(['id'], axis=1).values, train.target.values

# 表示して確認
X,y

(array([[0, 0, 6, ..., 2, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 1],
        [1, 0, 0, ..., 1, 1, 0],
        [0, 1, 2, ..., 0, 0, 0]]),
 array([5, 5, 1, ..., 8, 8, 8]))

In [6]:
# lightgbmを実装して識別を行う--------------------------------------------------

valid_scores = []  # 評価を格納する配列
kf = KFold(n_splits=5, shuffle=True, random_state=42) #データの分割の仕方を決定

#k分割交差検証
for fold, (train_indices, valid_indices) in enumerate(kf.split(X)):
    X_train, X_test = X[train_indices], X[valid_indices]
    y_train, y_test = y[train_indices], y[valid_indices]

    # データをセット
    # 訓練データ
    lgb_train = lgb.Dataset(X_train, y_train)
    # 評価データ
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    # パラメータの設定
    parms = {
        'task': 'train', #トレーニング用
        'boosting': 'gbdt', #勾配ブースティング決定木
        'objective': 'multiclass', #目的：多値分類
        'num_class': 9, #分類するクラス数
        'metric': 'multi_error', #評価指標：正答率
        'num_iterations': 1000, #1000回学習
        'verbose': -1 #学習情報を非表示
    }

    # モデルの学習
    model = lgb.train(parms,
                     #訓練データ
                     train_set=lgb_train,
                     # 評価データ
                     valid_sets=lgb_eval,
                     early_stopping_rounds=100)
    
    # 結果の予測
    y_pred = model.predict(X_test)
    # 予測確率を整数へ
    y_pred = np.argmax(y_pred, axis=1)
    
    # 識別率を求める
    score = accuracy_score(y_test, y_pred)
    print(f'fold {fold} MAE: {score}')
    
    # 評価を格納する
    valid_scores.append(score)

# 評価の平均を求める
cv_score = np.mean(valid_scores)
print(f'CV score: {cv_score}')



[1]	valid_0's multi_error: 0.796794
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.791926
[3]	valid_0's multi_error: 0.7865
[4]	valid_0's multi_error: 0.783187
[5]	valid_0's multi_error: 0.78042
[6]	valid_0's multi_error: 0.778394
[7]	valid_0's multi_error: 0.776925
[8]	valid_0's multi_error: 0.774759
[9]	valid_0's multi_error: 0.772732
[10]	valid_0's multi_error: 0.771006
[11]	valid_0's multi_error: 0.768733
[12]	valid_0's multi_error: 0.767328
[13]	valid_0's multi_error: 0.765644
[14]	valid_0's multi_error: 0.764669
[15]	valid_0's multi_error: 0.763371
[16]	valid_0's multi_error: 0.761795
[17]	valid_0's multi_error: 0.759951
[18]	valid_0's multi_error: 0.758685
[19]	valid_0's multi_error: 0.757774
[20]	valid_0's multi_error: 0.756444
[21]	valid_0's multi_error: 0.755544
[22]	valid_0's multi_error: 0.754311
[23]	valid_0's multi_error: 0.753152
[24]	valid_0's multi_error: 0.751855
[25]	valid_0's multi_error: 0.75029
[26]	valid_0's multi_error

[219]	valid_0's multi_error: 0.654428
[220]	valid_0's multi_error: 0.654085
[221]	valid_0's multi_error: 0.65356
[222]	valid_0's multi_error: 0.653077
[223]	valid_0's multi_error: 0.652949
[224]	valid_0's multi_error: 0.652842
[225]	valid_0's multi_error: 0.652906
[226]	valid_0's multi_error: 0.652734
[227]	valid_0's multi_error: 0.652552
[228]	valid_0's multi_error: 0.652541
[229]	valid_0's multi_error: 0.652112
[230]	valid_0's multi_error: 0.651876
[231]	valid_0's multi_error: 0.651716
[232]	valid_0's multi_error: 0.65134
[233]	valid_0's multi_error: 0.651169
[234]	valid_0's multi_error: 0.650815
[235]	valid_0's multi_error: 0.65045
[236]	valid_0's multi_error: 0.650172
[237]	valid_0's multi_error: 0.649979
[238]	valid_0's multi_error: 0.649657
[239]	valid_0's multi_error: 0.649528
[240]	valid_0's multi_error: 0.6494
[241]	valid_0's multi_error: 0.649056
[242]	valid_0's multi_error: 0.648778
[243]	valid_0's multi_error: 0.648445
[244]	valid_0's multi_error: 0.648145
[245]	valid_0's m

[436]	valid_0's multi_error: 0.607334
[437]	valid_0's multi_error: 0.607291
[438]	valid_0's multi_error: 0.607034
[439]	valid_0's multi_error: 0.607023
[440]	valid_0's multi_error: 0.606723
[441]	valid_0's multi_error: 0.60668
[442]	valid_0's multi_error: 0.606498
[443]	valid_0's multi_error: 0.606166
[444]	valid_0's multi_error: 0.60579
[445]	valid_0's multi_error: 0.605544
[446]	valid_0's multi_error: 0.605479
[447]	valid_0's multi_error: 0.605458
[448]	valid_0's multi_error: 0.605061
[449]	valid_0's multi_error: 0.604804
[450]	valid_0's multi_error: 0.604461
[451]	valid_0's multi_error: 0.604268
[452]	valid_0's multi_error: 0.603957
[453]	valid_0's multi_error: 0.603935
[454]	valid_0's multi_error: 0.603817
[455]	valid_0's multi_error: 0.603828
[456]	valid_0's multi_error: 0.603603
[457]	valid_0's multi_error: 0.603485
[458]	valid_0's multi_error: 0.603002
[459]	valid_0's multi_error: 0.602992
[460]	valid_0's multi_error: 0.602777
[461]	valid_0's multi_error: 0.602563
[462]	valid_0'

[653]	valid_0's multi_error: 0.57626
[654]	valid_0's multi_error: 0.576163
[655]	valid_0's multi_error: 0.57596
[656]	valid_0's multi_error: 0.575649
[657]	valid_0's multi_error: 0.575702
[658]	valid_0's multi_error: 0.575456
[659]	valid_0's multi_error: 0.575327
[660]	valid_0's multi_error: 0.575134
[661]	valid_0's multi_error: 0.575027
[662]	valid_0's multi_error: 0.574909
[663]	valid_0's multi_error: 0.575005
[664]	valid_0's multi_error: 0.574877
[665]	valid_0's multi_error: 0.574544
[666]	valid_0's multi_error: 0.574416
[667]	valid_0's multi_error: 0.574115
[668]	valid_0's multi_error: 0.573965
[669]	valid_0's multi_error: 0.573858
[670]	valid_0's multi_error: 0.573772
[671]	valid_0's multi_error: 0.573697
[672]	valid_0's multi_error: 0.573418
[673]	valid_0's multi_error: 0.57329
[674]	valid_0's multi_error: 0.573333
[675]	valid_0's multi_error: 0.573022
[676]	valid_0's multi_error: 0.572818
[677]	valid_0's multi_error: 0.572636
[678]	valid_0's multi_error: 0.572679
[679]	valid_0's

[870]	valid_0's multi_error: 0.552552
[871]	valid_0's multi_error: 0.552488
[872]	valid_0's multi_error: 0.552402
[873]	valid_0's multi_error: 0.552348
[874]	valid_0's multi_error: 0.552413
[875]	valid_0's multi_error: 0.552391
[876]	valid_0's multi_error: 0.552187
[877]	valid_0's multi_error: 0.55208
[878]	valid_0's multi_error: 0.551962
[879]	valid_0's multi_error: 0.551619
[880]	valid_0's multi_error: 0.551587
[881]	valid_0's multi_error: 0.551555
[882]	valid_0's multi_error: 0.551383
[883]	valid_0's multi_error: 0.551458
[884]	valid_0's multi_error: 0.551394
[885]	valid_0's multi_error: 0.551051
[886]	valid_0's multi_error: 0.551072
[887]	valid_0's multi_error: 0.550965
[888]	valid_0's multi_error: 0.550965
[889]	valid_0's multi_error: 0.550911
[890]	valid_0's multi_error: 0.550611
[891]	valid_0's multi_error: 0.550547
[892]	valid_0's multi_error: 0.550461
[893]	valid_0's multi_error: 0.55029
[894]	valid_0's multi_error: 0.550107
[895]	valid_0's multi_error: 0.549989
[896]	valid_0'

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import confusion_matrix
# 混同行列を表示
cmx = confusion_matrix(y_test, y_pred)
cmx