ref
1) https://mathmatical22.xyz/2020/04/11/【初学者向け】lightgbm-基本的な使い方-多クラス分類編/
2) https://toukei-lab.com/light-gbm

# Import

In [1]:
import sys
%load_ext autoreload
%autoreload 2
sys.path.append('../')

In [2]:
import numpy as np
import pandas as pd
import warnings

import lightgbm as lgb
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from src import common

In [3]:
def lgbm_preprocessing(datas, mode='training', features_list = None):
    if mode == 'training':
        values = datas[0]
        labels = datas[1]
    elif mode == 'test':
        values = datas
        labels = None
    else:
        raise ValueError(f'{mode} is not defined.')
        
    # Use only some columns
    if features_list is None:
        features_list = ['geo_level_1_id', 
                        'geo_level_2_id', 
                        'geo_level_3_id', 
                        'height_percentage', 
                        'has_superstructure_adobe_mud', 
                        'has_superstructure_mud_mortar_stone', 
                        'has_superstructure_rc_non_engineered', 
                        'has_superstructure_timber', 
                        'foundation_type', 
                        'roof_type', 
                        'ground_floor_type']
    trian_values = pd.get_dummies(values[features_list])
    
    # convert obkect to category
    for _col in values.select_dtypes(include='object'):
        values[_col] = values[_col].astype("category")
    
    # convert labels range [1, 4) -> [0, 3)
    if not labels is None:
        labels = labels-1
    
    return values, labels

# Intro

In [65]:
DATA_DIR = Path('..', '..', '..', 'data', 'final', 'public')
train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')
train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')

## preprocessing

In [67]:
train_values, train_labels = lgbm_preprocessing((train_values, train_labels), mode='training')

## split to train and test

In [68]:
X_train, X_test, y_train, y_test = train_test_split(train_values, train_labels, 
                                                    test_size=0.1, random_state=19, stratify=train_labels)

In [69]:
trains = lgb.Dataset(X_train, y_train)
valids = lgb.Dataset(X_test, y_test)

In [70]:
params = {
    "objective": "multiclass",
    "num_class": 3, 
    "metrics": "multi_logloss",
    'force_row_wise': True, 
    "learning_rate": 0.2
    
}

model = lgb.train(params, trains, valid_sets=valids, num_boost_round=1000, early_stopping_rounds=100)

[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 234540, number of used features: 37
[LightGBM] [Info] Start training from score -2.339190
[LightGBM] [Info] Start training from score -0.564027
[LightGBM] [Info] Start training from score -1.094578
[1]	valid_0's multi_logloss: 0.834899
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.79255
[3]	valid_0's multi_logloss: 0.764601
[4]	valid_0's multi_logloss: 0.745064
[5]	valid_0's multi_logloss: 0.730304
[6]	valid_0's multi_logloss: 0.71916
[7]	valid_0's multi_logloss: 0.710218




[8]	valid_0's multi_logloss: 0.703205
[9]	valid_0's multi_logloss: 0.69805
[10]	valid_0's multi_logloss: 0.693236
[11]	valid_0's multi_logloss: 0.689566
[12]	valid_0's multi_logloss: 0.686027
[13]	valid_0's multi_logloss: 0.682844
[14]	valid_0's multi_logloss: 0.680392
[15]	valid_0's multi_logloss: 0.677686
[16]	valid_0's multi_logloss: 0.675078
[17]	valid_0's multi_logloss: 0.672411
[18]	valid_0's multi_logloss: 0.670147
[19]	valid_0's multi_logloss: 0.668575
[20]	valid_0's multi_logloss: 0.666035
[21]	valid_0's multi_logloss: 0.664231
[22]	valid_0's multi_logloss: 0.662829
[23]	valid_0's multi_logloss: 0.661265
[24]	valid_0's multi_logloss: 0.659862
[25]	valid_0's multi_logloss: 0.658394
[26]	valid_0's multi_logloss: 0.657483
[27]	valid_0's multi_logloss: 0.656495
[28]	valid_0's multi_logloss: 0.655576
[29]	valid_0's multi_logloss: 0.654715
[30]	valid_0's multi_logloss: 0.653716
[31]	valid_0's multi_logloss: 0.652462
[32]	valid_0's multi_logloss: 0.650886
[33]	valid_0's multi_logloss

## predict

In [71]:
# テストデータの予測 ((各クラスの予測確率 [クラス0の予測確率,クラス1の予測確率,クラス2の予測確率] を返す))
y_pred_prob = model.predict(X_test)
# テストデータの予測 (予測クラス(0 or 1 or 2)を返す)
y_pred = np.argmax(y_pred_prob, axis=1) # 一番大きい予測確率のクラスを予測クラスに
# 真値と予測値の表示
df_pred = pd.DataFrame({'target':y_test['damage_grade'].values,'target_pred':y_pred})
display(df_pred)

# 真値と予測確率の表示
df_pred_prob = pd.DataFrame({'y':y_test['damage_grade'].values, 'target0_prob':y_pred_prob[:,0], 'target1_prob':y_pred_prob[:,1], 'target2_prob':y_pred_prob[:,2]})
display(df_pred_prob)

acc = accuracy_score(y_test,y_pred)
print('Acc :', acc)

Unnamed: 0,target,target_pred
0,1,1
1,2,1
2,1,2
3,1,1
4,1,1
...,...,...
26056,0,0
26057,1,1
26058,1,1
26059,0,1


Unnamed: 0,y,target0_prob,target1_prob,target2_prob
0,1,0.018569,0.821047,0.160384
1,2,0.002044,0.523914,0.474042
2,1,0.000164,0.373575,0.626261
3,1,0.005715,0.896519,0.097767
4,1,0.253461,0.686330,0.060210
...,...,...,...,...
26056,0,0.501258,0.456957,0.041785
26057,1,0.148519,0.678045,0.173436
26058,1,0.341064,0.602124,0.056812
26059,0,0.255593,0.712903,0.031504


Acc : 0.7444457234948774


## Submit

In [72]:
test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')
test_values, _ = lgbm_preprocessing(test_values, mode='test')

In [74]:
y_test_prob = model.predict(test_values)

In [76]:
y_test = np.argmax(y_test_prob, axis=1)

In [80]:
submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=y_test+1,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [84]:
my_submission.to_csv('../../../data/final/submit/submission_lgbm_first.csv')

# Use only high importance features

High importance columns in CatBoost

ref: https://github.com/pat42w/Richters_predictor/blob/master/Richter_entry_v1.ipynb

In [4]:
DATA_DIR = Path('..', '..', '..', 'data', 'final', 'public')
train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')
train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')

In [5]:
features_list = ['geo_level_1_id', 
                 'geo_level_2_id', 
                 'geo_level_3_id',
                 'age', 
                 'area_percentage', 
                 'height_percentage', 
                 'foundation_type', 
                 'roof_type', 
                 'ground_floor_type', 
                 'other_floor_type', 
                 'position', 
                 'has_superstructure_mud_mortar_stone', 
                 'has_superstructure_cement_mortar_brick', 
                 'has_superstructure_timber', 
                 'count_families', 
                 'has_secondary_use'
                ]

## preprocessing

In [6]:
train_values, train_labels = lgbm_preprocessing((train_values, train_labels), mode='training', features_list=features_list)

## split to train and test

In [7]:
X_train, X_test, y_train, y_test = train_test_split(train_values, train_labels, 
                                                    test_size=0.1, random_state=19, stratify=train_labels)

In [8]:
trains = lgb.Dataset(X_train, y_train)
valids = lgb.Dataset(X_test, y_test)

In [9]:
params = {
    "objective": "multiclass",
    "num_class": 3, 
    "metrics": "multi_logloss",
    'force_row_wise': True, 
    "learning_rate": 0.15, 
    'reg_alpha': 0.3,
    'reg_lambda': 0.3,
    
}

model = lgb.train(params, trains, valid_sets=valids, num_boost_round=2000, early_stopping_rounds=100)

[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 234540, number of used features: 37
[LightGBM] [Info] Start training from score -2.339190
[LightGBM] [Info] Start training from score -0.564027
[LightGBM] [Info] Start training from score -1.094578
[1]	valid_0's multi_logloss: 0.852582
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.814486
[3]	valid_0's multi_logloss: 0.787538
[4]	valid_0's multi_logloss: 0.767029
[5]	valid_0's multi_logloss: 0.751161
[6]	valid_0's multi_logloss: 0.739066




[7]	valid_0's multi_logloss: 0.729024
[8]	valid_0's multi_logloss: 0.720855
[9]	valid_0's multi_logloss: 0.713498
[10]	valid_0's multi_logloss: 0.708343
[11]	valid_0's multi_logloss: 0.702942
[12]	valid_0's multi_logloss: 0.699085
[13]	valid_0's multi_logloss: 0.695438
[14]	valid_0's multi_logloss: 0.692397
[15]	valid_0's multi_logloss: 0.689568
[16]	valid_0's multi_logloss: 0.686693
[17]	valid_0's multi_logloss: 0.684426
[18]	valid_0's multi_logloss: 0.682505
[19]	valid_0's multi_logloss: 0.680491
[20]	valid_0's multi_logloss: 0.678756
[21]	valid_0's multi_logloss: 0.676522
[22]	valid_0's multi_logloss: 0.674871
[23]	valid_0's multi_logloss: 0.67356
[24]	valid_0's multi_logloss: 0.672147
[25]	valid_0's multi_logloss: 0.67095
[26]	valid_0's multi_logloss: 0.669106
[27]	valid_0's multi_logloss: 0.667871
[28]	valid_0's multi_logloss: 0.666626
[29]	valid_0's multi_logloss: 0.665366
[30]	valid_0's multi_logloss: 0.663725
[31]	valid_0's multi_logloss: 0.66251
[32]	valid_0's multi_logloss: 0

## predict

In [10]:
# テストデータの予測 ((各クラスの予測確率 [クラス0の予測確率,クラス1の予測確率,クラス2の予測確率] を返す))
y_pred_prob = model.predict(X_test)
# テストデータの予測 (予測クラス(0 or 1 or 2)を返す)
y_pred = np.argmax(y_pred_prob, axis=1) # 一番大きい予測確率のクラスを予測クラスに
# 真値と予測値の表示
df_pred = pd.DataFrame({'target':y_test['damage_grade'].values,'target_pred':y_pred})
display(df_pred)

# 真値と予測確率の表示
df_pred_prob = pd.DataFrame({'y':y_test['damage_grade'].values, 'target0_prob':y_pred_prob[:,0], 'target1_prob':y_pred_prob[:,1], 'target2_prob':y_pred_prob[:,2]})
display(df_pred_prob)

acc = accuracy_score(y_test,y_pred)
print('Acc :', acc)

Unnamed: 0,target,target_pred
0,1,1
1,2,2
2,1,2
3,1,1
4,1,1
...,...,...
26056,0,0
26057,1,1
26058,1,1
26059,0,1


Unnamed: 0,y,target0_prob,target1_prob,target2_prob
0,1,0.030709,0.638411,0.330880
1,2,0.001730,0.472421,0.525849
2,1,0.000252,0.431001,0.568747
3,1,0.000938,0.903759,0.095302
4,1,0.254144,0.711050,0.034806
...,...,...,...,...
26056,0,0.481308,0.461495,0.057197
26057,1,0.295955,0.558076,0.145969
26058,1,0.328103,0.625668,0.046229
26059,0,0.176912,0.795127,0.027962


Acc : 0.7487049614366295


## Submit

In [11]:
test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')
test_values, _ = lgbm_preprocessing(test_values, mode='test')

In [12]:
y_test_prob = model.predict(test_values)

In [13]:
y_test = np.argmax(y_test_prob, axis=1)

In [14]:
submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=y_test+1,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [15]:
my_submission.to_csv('../../../data/final/submit/lgbm_mod_param.csv')

# grid search
ref: https://qiita.com/KROYO/items/6607bc77bb465f5e9a3a

In [4]:
train_values, train_labels = common.file.read_data('train')

In [5]:
features_list = ['geo_level_1_id', 
                 'geo_level_2_id', 
                 'geo_level_3_id',
                 'age', 
                 'area_percentage', 
                 'height_percentage', 
                 'foundation_type', 
                 'roof_type', 
                 'ground_floor_type', 
                 'other_floor_type', 
                 'position', 
                 'has_superstructure_mud_mortar_stone', 
                 'has_superstructure_cement_mortar_brick', 
                 'has_superstructure_timber', 
                 'count_families', 
                 'has_secondary_use'
                ]

## preprocessing

In [6]:
train_values, train_labels = lgbm_preprocessing((train_values, train_labels), mode='training', features_list=features_list)

In [7]:
skf = StratifiedKFold(n_splits=5,
                      shuffle=True,
                      random_state=0)

In [9]:
# initialize model and parameter grid
model = lgb.LGBMClassifier(num_boost_round=10)

param_grid = {"max_depth": [ 3, 6, 10,25], #10, 25,
             }

# grid search
grid_result = GridSearchCV(estimator = model,
                           param_grid = param_grid,
                           scoring = 'f1_micro', 
                           cv = skf,
                           return_train_score = True,
                           n_jobs = -1)

grid_result.fit(train_values, train_labels)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.clas



