In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [2]:
# Load the training and test CSV files into pandas DataFrames.
read_options = {'delimiter': ',', 'low_memory': False}
train = pd.read_csv('train.csv', **read_options)
submit_data = pd.read_csv('test.csv', **read_options)

# Show the first rows as a sanity check.
train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,...,0,0,0,0,0,0,2,0,0,Class_6
1,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,Class_6
2,2,0,0,0,0,0,1,0,3,0,...,0,0,0,0,1,0,0,0,0,Class_2
3,3,0,0,7,0,1,5,2,2,0,...,0,4,0,2,2,0,4,3,0,Class_8
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_2


In [3]:
# Convert the target column of train to the pandas 'category' dtype.
train['target'] = train['target'].astype('category')

# List the column dtypes to confirm the conversion.
train.dtypes

id               int64
feature_0        int64
feature_1        int64
feature_2        int64
feature_3        int64
                ...   
feature_71       int64
feature_72       int64
feature_73       int64
feature_74       int64
target        category
Length: 77, dtype: object

In [4]:
# Label encoding (LabelEncoder): replace each class name with an integer code.
le = LabelEncoder()
target_values = train['target'].values
encoded = le.fit_transform(target_values)
# Map the integer codes back to the original class names.
decoded = le.inverse_transform(encoded)
train['target'] = encoded

# Show the first rows as a sanity check.
train['target'].head()

0    5
1    5
2    1
3    7
4    1
Name: target, dtype: int64

In [5]:
# Inspect the class imbalance of the training labels:
# count the rows per encoded class and order the counts by class code 0..8.
class_codes = list(range(9))
weight = train['target'].value_counts().reindex(index=class_codes)
weight

0     9118
1    24431
2    14798
3     4704
4     3064
5    51811
6    14769
7    51763
8    25542
Name: target, dtype: int64

In [6]:
# Compute one weight per training row: 1 - (row count of the row's class / total rows),
# so rows of frequent classes get a smaller weight than rows of rare classes.
# Series.map looks every label up in `weight` (indexed by class code) in one
# vectorised pass instead of a Python-level loop over all rows.
class_share = train['target'].map(weight) / train.shape[0]
# to_numpy() drops train's index so the result carries a fresh 0..n-1 index.
weight_lst = pd.DataFrame({'weight': 1 - class_share.to_numpy()})

# Show the first rows as a sanity check.
weight_lst.head()

Unnamed: 0,weight
0,0.740945
1,0.740945
2,0.877845
3,0.741185
4,0.877845


In [7]:
# Check the dimensions of the weight table (rows, columns).
weight_lst.shape

(200000, 1)

In [8]:
# Split the data into a feature matrix and a label vector.
# Both 'target' and 'id' are removed from the features.
X = train.drop(columns=['target', 'id']).values
y = train['target'].values

# Display both arrays as a sanity check.
X, y

(array([[0, 0, 6, ..., 2, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 2, 0, ..., 1, 0, 0],
        [0, 0, 2, ..., 0, 1, 0],
        [5, 4, 0, ..., 2, 3, 1]]),
 array([5, 5, 1, ..., 7, 6, 7]))

In [9]:
# Train and evaluate a LightGBM multiclass classifier with k-fold cross-validation.

N_SPLITS = 5                  # number of cross-validation folds
EARLY_STOPPING_ROUNDS = 100   # patience (in boosting rounds) of the early-stopping callback

# LightGBM parameters. They are the same for every fold, so they are built once.
parms = {
    'task': 'train',            # training task
    'boosting': 'gbdt',         # gradient-boosted decision trees
    'objective': 'multiclass',  # multiclass classification
    'num_class': 9,             # number of target classes
    'metric': 'multi_error',    # validation metric: multiclass error rate (lower is better)
    'num_iterations': 100,      # number of boosting rounds
    'verbose': -1               # silence LightGBM's informational output
}

valid_scores = []  # accuracy of each fold
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)  # shuffled, seeded split

# Plain array of per-row weights so the fold indices select rows by position.
sample_weight = weight_lst["weight"].to_numpy()

# k-fold cross-validation ----------------------
for fold, (train_indices, valid_indices) in enumerate(kf.split(X)):
    X_train, X_test = X[train_indices], X[valid_indices]
    y_train, y_test = y[train_indices], y[valid_indices]

    # Training set for this fold.
    lgb_train = lgb.Dataset(X_train, y_train, weight=sample_weight[train_indices])
    # Validation set for this fold; `reference` ties it to the training Dataset.
    lgb_eval = lgb.Dataset(X_test, y_test, weight=sample_weight[valid_indices],
                           reference=lgb_train)

    # Fit the model. Early stopping is requested through the callback API rather
    # than the `early_stopping_rounds` keyword, which newer LightGBM releases no
    # longer accept. NOTE: the patience equals the number of boosting rounds, so
    # training cannot stop before the last round.
    model = lgb.train(
        dict(parms),  # pass a copy so every fold starts from the same parameters
        train_set=lgb_train,
        valid_sets=[lgb_eval],
        callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS)],
    )

    # Predict class probabilities, then take the most probable class per row.
    y_pred = np.argmax(model.predict(X_test), axis=1)

    # Accuracy on the held-out fold (this is a classification accuracy, not an MAE).
    score = accuracy_score(y_test, y_pred)
    print(f'fold {fold} accuracy: {score}')

    valid_scores.append(score)

# After the loop, `model`, `X_test`, `y_test` and `y_pred` refer to the last fold only.

# Mean accuracy across all folds.
cv_score = np.mean(valid_scores)
print(f'CV score: {cv_score}')



[1]	valid_0's multi_error: 0.717624
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.714081
[3]	valid_0's multi_error: 0.692977
[4]	valid_0's multi_error: 0.684496
[5]	valid_0's multi_error: 0.679778
[6]	valid_0's multi_error: 0.676286
[7]	valid_0's multi_error: 0.673455
[8]	valid_0's multi_error: 0.670989
[9]	valid_0's multi_error: 0.670175
[10]	valid_0's multi_error: 0.669067
[11]	valid_0's multi_error: 0.668303
[12]	valid_0's multi_error: 0.66806
[13]	valid_0's multi_error: 0.666832
[14]	valid_0's multi_error: 0.666211
[15]	valid_0's multi_error: 0.665956
[16]	valid_0's multi_error: 0.66524
[17]	valid_0's multi_error: 0.665247
[18]	valid_0's multi_error: 0.664647
[19]	valid_0's multi_error: 0.664287
[20]	valid_0's multi_error: 0.663365
[21]	valid_0's multi_error: 0.662978
[22]	valid_0's multi_error: 0.66307
[23]	valid_0's multi_error: 0.662597
[24]	valid_0's multi_error: 0.662726
[25]	valid_0's multi_error: 0.66256
[26]	valid_0's multi_error



[1]	valid_0's multi_error: 0.717977
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.715605
[3]	valid_0's multi_error: 0.691447
[4]	valid_0's multi_error: 0.687483
[5]	valid_0's multi_error: 0.68325
[6]	valid_0's multi_error: 0.678692
[7]	valid_0's multi_error: 0.674866
[8]	valid_0's multi_error: 0.671338
[9]	valid_0's multi_error: 0.669967
[10]	valid_0's multi_error: 0.666936
[11]	valid_0's multi_error: 0.666459
[12]	valid_0's multi_error: 0.666378
[13]	valid_0's multi_error: 0.665486
[14]	valid_0's multi_error: 0.664933
[15]	valid_0's multi_error: 0.664384
[16]	valid_0's multi_error: 0.664189
[17]	valid_0's multi_error: 0.664286
[18]	valid_0's multi_error: 0.663759
[19]	valid_0's multi_error: 0.664007
[20]	valid_0's multi_error: 0.664012
[21]	valid_0's multi_error: 0.663419
[22]	valid_0's multi_error: 0.663639
[23]	valid_0's multi_error: 0.663213
[24]	valid_0's multi_error: 0.66315
[25]	valid_0's multi_error: 0.662811
[26]	valid_0's multi_err



[1]	valid_0's multi_error: 0.723489
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.719407
[3]	valid_0's multi_error: 0.69789
[4]	valid_0's multi_error: 0.692053
[5]	valid_0's multi_error: 0.687792
[6]	valid_0's multi_error: 0.684127
[7]	valid_0's multi_error: 0.680662
[8]	valid_0's multi_error: 0.678343
[9]	valid_0's multi_error: 0.676298
[10]	valid_0's multi_error: 0.674566
[11]	valid_0's multi_error: 0.672916
[12]	valid_0's multi_error: 0.672295
[13]	valid_0's multi_error: 0.671875
[14]	valid_0's multi_error: 0.670189
[15]	valid_0's multi_error: 0.669571
[16]	valid_0's multi_error: 0.669262
[17]	valid_0's multi_error: 0.66825
[18]	valid_0's multi_error: 0.668756
[19]	valid_0's multi_error: 0.668272
[20]	valid_0's multi_error: 0.667688
[21]	valid_0's multi_error: 0.66734
[22]	valid_0's multi_error: 0.666942
[23]	valid_0's multi_error: 0.667249
[24]	valid_0's multi_error: 0.667138
[25]	valid_0's multi_error: 0.667463
[26]	valid_0's multi_erro



[1]	valid_0's multi_error: 0.715559
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.713211
[3]	valid_0's multi_error: 0.689019
[4]	valid_0's multi_error: 0.683537
[5]	valid_0's multi_error: 0.678957
[6]	valid_0's multi_error: 0.675648
[7]	valid_0's multi_error: 0.67254
[8]	valid_0's multi_error: 0.671281
[9]	valid_0's multi_error: 0.670233
[10]	valid_0's multi_error: 0.668057
[11]	valid_0's multi_error: 0.666714
[12]	valid_0's multi_error: 0.66577
[13]	valid_0's multi_error: 0.665608
[14]	valid_0's multi_error: 0.665009
[15]	valid_0's multi_error: 0.664702
[16]	valid_0's multi_error: 0.664839
[17]	valid_0's multi_error: 0.665064
[18]	valid_0's multi_error: 0.664264
[19]	valid_0's multi_error: 0.6637
[20]	valid_0's multi_error: 0.664135
[21]	valid_0's multi_error: 0.663721
[22]	valid_0's multi_error: 0.663671
[23]	valid_0's multi_error: 0.663214
[24]	valid_0's multi_error: 0.663246
[25]	valid_0's multi_error: 0.663654
[26]	valid_0's multi_error



[1]	valid_0's multi_error: 0.720139
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.714994
[3]	valid_0's multi_error: 0.69512
[4]	valid_0's multi_error: 0.689415
[5]	valid_0's multi_error: 0.684413
[6]	valid_0's multi_error: 0.679369
[7]	valid_0's multi_error: 0.676686
[8]	valid_0's multi_error: 0.674028
[9]	valid_0's multi_error: 0.672987
[10]	valid_0's multi_error: 0.671468
[11]	valid_0's multi_error: 0.670695
[12]	valid_0's multi_error: 0.670684
[13]	valid_0's multi_error: 0.669736
[14]	valid_0's multi_error: 0.66986
[15]	valid_0's multi_error: 0.669364
[16]	valid_0's multi_error: 0.669372
[17]	valid_0's multi_error: 0.668533
[18]	valid_0's multi_error: 0.668397
[19]	valid_0's multi_error: 0.668343
[20]	valid_0's multi_error: 0.668581
[21]	valid_0's multi_error: 0.668207
[22]	valid_0's multi_error: 0.668539
[23]	valid_0's multi_error: 0.667706
[24]	valid_0's multi_error: 0.668022
[25]	valid_0's multi_error: 0.668096
[26]	valid_0's multi_err

In [10]:
from sklearn.metrics import confusion_matrix
# Show the confusion matrix for the current y_test / y_pred,
# i.e. the values left behind by the last cross-validation fold that ran.
cmx = confusion_matrix(y_true=y_test, y_pred=y_pred)
cmx

array([[   1,  472,    7,    0,    0,  455,    2,  877,   23],
       [   2, 2464,    8,    1,    0,  868,    1, 1387,   68],
       [   1, 1158,    7,    0,    0,  636,    3, 1047,   58],
       [   0,  253,    3,    1,    1,  228,    0,  430,   15],
       [   0,  139,    1,    0,    2,  163,    0,  315,    6],
       [   1, 1060,    6,    1,    2, 5182,    4, 4059,   77],
       [   1,  399,    0,    1,    1,  709,    5, 1841,   37],
       [   1,  944,   10,    1,    1, 2649,    6, 6609,   84],
       [   3, 1310,   10,    1,    1, 1284,    1, 2507,   89]])

In [11]:
# Put the submission data into the form the model accepts: features only, no id column.
x_submit = submit_data.drop(columns=['id'])

# Show the first rows as a sanity check.
x_submit.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,0,0,0,0,0,0,0,0,...,3,1,3,0,0,0,0,3,0,0
2,0,1,7,1,0,0,0,0,6,0,...,3,0,0,0,0,3,0,2,0,0
3,0,0,0,4,3,1,0,0,0,0,...,0,0,0,1,0,0,0,4,0,0
4,0,0,5,0,0,0,0,0,0,8,...,0,0,0,0,0,0,0,0,1,0


In [12]:
# Predict on the submission data.
# `model` is whatever the most recent lgb.train call assigned, i.e. the model
# of the last cross-validation fold that ran.
predictions = model.predict(x_submit)

# Check the shape of the result.
predictions.shape

(100000, 9)

In [13]:
# Reshape the predictions into the submission layout:
# one probability column per class (Class_1 .. Class_9), indexed by id.
class_columns = [f'Class_{k}' for k in range(1, 10)]
df_predictions = pd.DataFrame(predictions, columns=class_columns)
# NOTE: this rebinds submit_data; afterwards 'id' is the index, not a column.
submit_data = pd.concat([submit_data['id'], df_predictions], axis=1).set_index('id')
submit_data.head()

Unnamed: 0_level_0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
200000,0.052149,0.379732,0.184217,0.028398,0.014746,0.15025,0.027973,0.047024,0.115511
200001,0.053757,0.114451,0.079226,0.027861,0.017077,0.201237,0.083197,0.275363,0.147831
200002,0.037921,0.047898,0.028747,0.017958,0.011537,0.602473,0.038402,0.127225,0.08784
200003,0.062151,0.108438,0.086933,0.046038,0.017862,0.215524,0.06532,0.199668,0.198066
200004,0.04844,0.107244,0.084984,0.034658,0.014964,0.300287,0.0552,0.200778,0.153445


In [15]:
# Write the submission table to a CSV file.
submit_data.to_csv("submission_LGBM_weight_2.csv")