In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

In [9]:
# Read the training and test CSV files into pandas DataFrames.
train, submit_data = (
    pd.read_csv(csv_name, delimiter=',', low_memory=False)
    for csv_name in ('train.csv', 'test.csv')
)

# Show the first rows of the training data as a sanity check.
train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,...,0,0,0,0,0,0,2,0,0,Class_6
1,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,Class_6
2,2,0,0,0,0,0,1,0,3,0,...,0,0,0,0,1,0,0,0,0,Class_2
3,3,0,0,7,0,1,5,2,2,0,...,0,4,0,2,2,0,4,3,0,Class_8
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_2


In [8]:
# Summary statistics of train, transposed so that each column of train becomes one row.
train.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,200000.0,99999.500000,57735.171256,0.0,49999.75,99999.5,149999.25,199999.0
feature_0,200000.0,0.972710,3.941836,0.0,0.00,0.0,1.00,61.0
feature_1,200000.0,1.168365,3.993407,0.0,0.00,0.0,1.00,51.0
feature_2,200000.0,2.219325,6.476570,0.0,0.00,0.0,1.00,64.0
feature_3,200000.0,2.296735,7.551858,0.0,0.00,0.0,1.00,70.0
...,...,...,...,...,...,...,...,...
feature_71,200000.0,0.806895,2.458741,0.0,0.00,0.0,1.00,30.0
feature_72,200000.0,1.282925,4.261420,0.0,0.00,0.0,1.00,61.0
feature_73,200000.0,2.940210,10.784650,0.0,0.00,0.0,1.00,130.0
feature_74,200000.0,0.632005,3.925310,0.0,0.00,0.0,0.00,52.0


In [3]:
# Convert the target column of train to the pandas categorical dtype.
train['target'] = train['target'].astype('category')

# List the dtype of every column to confirm the conversion.
train.dtypes

id               int64
feature_0        int64
feature_1        int64
feature_2        int64
feature_3        int64
                ...   
feature_71       int64
feature_72       int64
feature_73       int64
feature_74       int64
target        category
Length: 77, dtype: object

In [4]:
# Label encoding (LabelEncoder): replace the target labels with integer codes.
le = LabelEncoder()
le.fit(train['target'].values)
encoded = le.transform(train['target'].values)
decoded = le.inverse_transform(encoded)  # round trip from codes back to the original labels
train['target'] = encoded

# Show the first encoded values as a sanity check.
train['target'].head()

0    5
1    5
2    1
3    7
4    1
Name: target, dtype: int64

In [5]:
# Separate the features from the labels:
# X holds every column except 'target' and 'id'; y holds the encoded target.
X = train.drop(columns=['target', 'id']).values
y = train['target'].values

# Display both arrays as a sanity check.
(X, y)

(array([[0, 0, 6, ..., 2, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 2, 0, ..., 1, 0, 0],
        [0, 0, 2, ..., 0, 1, 0],
        [5, 4, 0, ..., 2, 3, 1]]),
 array([5, 5, 1, ..., 7, 6, 7]))

In [6]:
# Train and evaluate a LightGBM multi-class classifier with k-fold cross-validation ----

valid_scores = []  # validation accuracy of each fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # shuffled, seeded 5-fold split

# Training parameters. They are identical for every fold, so the dict is built
# once here instead of being rebuilt on every loop iteration.
parms = {
    'task': 'train',            # training task
    'boosting': 'gbdt',         # gradient-boosted decision trees
    'objective': 'multiclass',  # multi-class classification
    'num_class': 9,             # number of target classes
    'metric': 'multi_error',    # validation metric: classification error rate (1 - accuracy)
    'num_iterations': 100,      # upper bound on the number of boosting rounds
    'verbose': -1               # silence LightGBM's own training log
}

# k-fold cross-validation
for fold, (train_indices, valid_indices) in enumerate(kf.split(X)):
    # NOTE: X_test / y_test are the *validation* part of this fold, not the
    # competition test set; the names are kept because later cells read them.
    X_train, X_test = X[train_indices], X[valid_indices]
    y_train, y_test = y[train_indices], y[valid_indices]

    # Wrap the fold's arrays as LightGBM datasets (training and validation).
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # Fit the model. Early stopping is requested through the callback API because
    # the `early_stopping_rounds=` keyword of lgb.train() was removed in LightGBM 4.0.
    # NOTE(review): with at most 100 rounds, a patience of 100 rounds can never
    # stop training early; lower the patience or raise 'num_iterations' if early
    # stopping is actually wanted.
    model = lgb.train(parms,
                      train_set=lgb_train,
                      valid_sets=[lgb_eval],
                      callbacks=[lgb.early_stopping(stopping_rounds=100)])

    # Predict class probabilities for the validation rows and take the most
    # probable class of each row as the predicted label.
    y_pred = np.argmax(model.predict(X_test), axis=1)

    # Accuracy on the validation part of this fold.
    score = accuracy_score(y_test, y_pred)
    print(f'fold {fold} accuracy: {score}')

    # Keep the score for the cross-validation average.
    valid_scores.append(score)

# Mean validation accuracy over all folds.
# After the loop, `model`, `y_test` and `y_pred` still refer to the LAST fold only.
cv_score = np.mean(valid_scores)
print(f'CV score: {cv_score}')



[1]	valid_0's multi_error: 0.6856
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.684425
[3]	valid_0's multi_error: 0.675175
[4]	valid_0's multi_error: 0.6599
[5]	valid_0's multi_error: 0.655625
[6]	valid_0's multi_error: 0.65145
[7]	valid_0's multi_error: 0.649275
[8]	valid_0's multi_error: 0.647175
[9]	valid_0's multi_error: 0.644575
[10]	valid_0's multi_error: 0.6436
[11]	valid_0's multi_error: 0.64365
[12]	valid_0's multi_error: 0.6429
[13]	valid_0's multi_error: 0.641875
[14]	valid_0's multi_error: 0.641575
[15]	valid_0's multi_error: 0.640425
[16]	valid_0's multi_error: 0.6402
[17]	valid_0's multi_error: 0.639575
[18]	valid_0's multi_error: 0.638525
[19]	valid_0's multi_error: 0.638725
[20]	valid_0's multi_error: 0.638225
[21]	valid_0's multi_error: 0.6393
[22]	valid_0's multi_error: 0.63805
[23]	valid_0's multi_error: 0.638175
[24]	valid_0's multi_error: 0.638125
[25]	valid_0's multi_error: 0.637975
[26]	valid_0's multi_error: 0.638125




[1]	valid_0's multi_error: 0.68775
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.684025
[3]	valid_0's multi_error: 0.6715
[4]	valid_0's multi_error: 0.6623
[5]	valid_0's multi_error: 0.65815
[6]	valid_0's multi_error: 0.655425
[7]	valid_0's multi_error: 0.651325
[8]	valid_0's multi_error: 0.64895
[9]	valid_0's multi_error: 0.6472
[10]	valid_0's multi_error: 0.6451
[11]	valid_0's multi_error: 0.643675
[12]	valid_0's multi_error: 0.64245
[13]	valid_0's multi_error: 0.64145
[14]	valid_0's multi_error: 0.64035
[15]	valid_0's multi_error: 0.64
[16]	valid_0's multi_error: 0.64005
[17]	valid_0's multi_error: 0.6391
[18]	valid_0's multi_error: 0.638675
[19]	valid_0's multi_error: 0.639
[20]	valid_0's multi_error: 0.638675
[21]	valid_0's multi_error: 0.6383
[22]	valid_0's multi_error: 0.638675
[23]	valid_0's multi_error: 0.638525
[24]	valid_0's multi_error: 0.638675
[25]	valid_0's multi_error: 0.638275
[26]	valid_0's multi_error: 0.6386
[27]	valid_0'



[1]	valid_0's multi_error: 0.6902
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.688925
[3]	valid_0's multi_error: 0.6753
[4]	valid_0's multi_error: 0.6663
[5]	valid_0's multi_error: 0.662375
[6]	valid_0's multi_error: 0.660475
[7]	valid_0's multi_error: 0.656825
[8]	valid_0's multi_error: 0.653375
[9]	valid_0's multi_error: 0.650925
[10]	valid_0's multi_error: 0.64885
[11]	valid_0's multi_error: 0.64785
[12]	valid_0's multi_error: 0.647625
[13]	valid_0's multi_error: 0.6471
[14]	valid_0's multi_error: 0.646275
[15]	valid_0's multi_error: 0.645125
[16]	valid_0's multi_error: 0.644725
[17]	valid_0's multi_error: 0.645175
[18]	valid_0's multi_error: 0.64395
[19]	valid_0's multi_error: 0.6436
[20]	valid_0's multi_error: 0.64335
[21]	valid_0's multi_error: 0.6423
[22]	valid_0's multi_error: 0.642525
[23]	valid_0's multi_error: 0.641725
[24]	valid_0's multi_error: 0.642025
[25]	valid_0's multi_error: 0.642075
[26]	valid_0's multi_error: 0.6418
[27



[1]	valid_0's multi_error: 0.684575
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.68115
[3]	valid_0's multi_error: 0.67095
[4]	valid_0's multi_error: 0.659775
[5]	valid_0's multi_error: 0.65625
[6]	valid_0's multi_error: 0.653375
[7]	valid_0's multi_error: 0.64985
[8]	valid_0's multi_error: 0.646075
[9]	valid_0's multi_error: 0.645225
[10]	valid_0's multi_error: 0.644325
[11]	valid_0's multi_error: 0.6437
[12]	valid_0's multi_error: 0.642875
[13]	valid_0's multi_error: 0.642
[14]	valid_0's multi_error: 0.6421
[15]	valid_0's multi_error: 0.64075
[16]	valid_0's multi_error: 0.639725
[17]	valid_0's multi_error: 0.6396
[18]	valid_0's multi_error: 0.63895
[19]	valid_0's multi_error: 0.638175
[20]	valid_0's multi_error: 0.6383
[21]	valid_0's multi_error: 0.639
[22]	valid_0's multi_error: 0.63845
[23]	valid_0's multi_error: 0.637575
[24]	valid_0's multi_error: 0.6382
[25]	valid_0's multi_error: 0.637875
[26]	valid_0's multi_error: 0.638225
[27]	val



[1]	valid_0's multi_error: 0.687975
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.685775
[3]	valid_0's multi_error: 0.6747
[4]	valid_0's multi_error: 0.66635
[5]	valid_0's multi_error: 0.662675
[6]	valid_0's multi_error: 0.6581
[7]	valid_0's multi_error: 0.653825
[8]	valid_0's multi_error: 0.6506
[9]	valid_0's multi_error: 0.650325
[10]	valid_0's multi_error: 0.648925
[11]	valid_0's multi_error: 0.64755
[12]	valid_0's multi_error: 0.6461
[13]	valid_0's multi_error: 0.64545
[14]	valid_0's multi_error: 0.64465
[15]	valid_0's multi_error: 0.644125
[16]	valid_0's multi_error: 0.64375
[17]	valid_0's multi_error: 0.643475
[18]	valid_0's multi_error: 0.643225
[19]	valid_0's multi_error: 0.642175
[20]	valid_0's multi_error: 0.642525
[21]	valid_0's multi_error: 0.64245
[22]	valid_0's multi_error: 0.6423
[23]	valid_0's multi_error: 0.641725
[24]	valid_0's multi_error: 0.641575
[25]	valid_0's multi_error: 0.64195
[26]	valid_0's multi_error: 0.641825
[2

In [7]:
# Confusion matrix (rows: true class, columns: predicted class).
# y_test / y_pred are whatever the final cross-validation iteration left behind,
# so this matrix describes the validation part of the last fold only.
# Passing every encoded label explicitly keeps the matrix square even if some
# class happens to be absent from both y_test and y_pred.
cmx = confusion_matrix(y_test, y_pred, labels=np.arange(len(le.classes_)))
cmx

array([[   0,  425,    2,    0,    0,  481,    0,  926,    3],
       [   0, 2253,    5,    1,    0,  987,    0, 1534,   19],
       [   0, 1026,    1,    0,    0,  743,    2, 1132,    6],
       [   0,  225,    1,    0,    0,  243,    1,  456,    5],
       [   0,  126,    0,    0,    0,  167,    0,  332,    1],
       [   1,  860,    1,    0,    0, 5295,    2, 4218,   15],
       [   0,  314,    1,    0,    2,  770,    1, 1902,    4],
       [   0,  750,    4,    1,    1, 2717,    1, 6820,   11],
       [   0, 1123,    4,    1,    2, 1432,    1, 2625,   18]])

In [10]:
# Build the model input for the submission rows: the test data without its 'id' column.
x_submit = submit_data.drop(columns=['id'])

# Show the first rows as a sanity check.
x_submit.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,0,0,0,0,0,0,0,0,...,3,1,3,0,0,0,0,3,0,0
2,0,1,7,1,0,0,0,0,6,0,...,3,0,0,0,0,3,0,2,0,0
3,0,0,0,4,3,1,0,0,0,0,...,0,0,0,1,0,0,0,4,0,0
4,0,0,5,0,0,0,0,0,0,8,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Predict the submission rows with the current `model`.
predictions = model.predict(data=x_submit)

# Check the shape of the prediction array.
np.shape(predictions)

(100000, 9)

In [12]:
# Reshape the predictions into the submission layout: one row per test id and
# one probability column per class.
# Column names are taken from the fitted encoder instead of a hard-coded list,
# so column k is always named after the class that was encoded as k.
df_predictions = pd.DataFrame(predictions, columns=le.classes_)

# Put the id column next to the probabilities and make it the index.
# NOTE: this overwrites `submit_data` (the raw test frame) with the submission
# frame, which no longer has an 'id' column; reload test.csv before re-running
# the cells that read `submit_data.id` or drop 'id'.
submit_data = pd.concat([submit_data.id, df_predictions], axis=1)
submit_data = submit_data.set_index('id')
submit_data.head()

Unnamed: 0_level_0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
200000,0.04862,0.367986,0.169901,0.024764,0.013797,0.17651,0.026546,0.058197,0.113678
200001,0.040406,0.114304,0.078888,0.021976,0.013751,0.223605,0.07635,0.295922,0.134797
200002,0.029793,0.035827,0.023231,0.013902,0.008297,0.646299,0.030061,0.133704,0.078886
200003,0.051153,0.105118,0.081511,0.035132,0.014811,0.247756,0.065955,0.227724,0.170839
200004,0.035837,0.102913,0.062952,0.163281,0.011439,0.282182,0.045951,0.18267,0.112775


In [13]:
# Write the submission frame to a CSV file.
submit_data.to_csv(path_or_buf="submission_LGBM_1.csv")