# Colab setting

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd ./drive/Othercomputers/MacBook/Earth/module/dd_earthquake/book

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Othercomputers/MacBook/Earth/module/dd_earthquake/book


In [2]:
%ls

bench_mark.ipynb  colab_lgbt_multigrid.ipynb  learn_categorical_plots.ipynb
[0m[01;34mcatboost_info[0m/    edm_data.ipynb              Light_GBM.ipynb


Install lgbm with gpu option

ref: https://an-engineer-note.com/?p=624

In [3]:
!pip uninstall --yes lightgbm && pip install --install-option=--gpu lightgbm

Found existing installation: lightgbm 3.3.5
Uninstalling lightgbm-3.3.5:
  Successfully uninstalled lightgbm-3.3.5
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightgbm
  Using cached lightgbm-3.3.5.tar.gz (1.5 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Skipping wheel build for lightgbm, due to binaries being disabled for it.
Installing collected packages: lightgbm
  Running setup.py install for lightgbm ... [?25l[?25hdone
Successfully installed lightgbm-3.3.5


# lgbm

In [4]:
import sys
%load_ext autoreload
%autoreload 2
sys.path.append('../')

In [33]:
import numpy as np
import pandas as pd
import warnings
import pprint

import lightgbm as lgb
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from src import common

## grid search

In [6]:
train_values, train_labels = common.file.read_data('train')

In [7]:
train_values

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688636,25,1335,1621,1,55,6,3,n,r,n,...,0,0,0,0,0,0,0,0,0,0
669485,17,715,2060,2,0,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
602512,17,51,8163,3,55,6,7,t,r,q,...,0,0,0,0,0,0,0,0,0,0
151409,26,39,1851,2,10,14,6,t,r,x,...,0,0,0,0,0,0,0,0,0,0


In [8]:
features_list = ['geo_level_1_id', 
                 'geo_level_2_id', 
                 'geo_level_3_id',
                 'age', 
                 'area_percentage', 
                 'height_percentage', 
                 'foundation_type', 
                 'roof_type', 
                 'ground_floor_type', 
                 'other_floor_type', 
                 'position', 
                 'has_superstructure_mud_mortar_stone', 
                 'has_superstructure_cement_mortar_brick', 
                 'has_superstructure_timber', 
                 'count_families', 
                 'has_secondary_use'
                ]

In [9]:
train_values, train_labels = common.lgbm_preprocessing((train_values, train_labels), mode='training', features_list=features_list)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(train_values, train_labels,
                                                        test_size=0.1, random_state=19, stratify=train_labels)

In [11]:
skf = StratifiedKFold(n_splits=5,
                      shuffle=True,
                      random_state=0)

In [24]:
# initialize model and parameter grid
model = lgb.LGBMClassifier(objective='multi_class', num_class=3, 
                           learning_rate=0.15, 
                           force_row_wise=True, 
                           num_boost_round=200, 
                           device='gpu', 
                           # valid_sets=valids
                           )

param_grid = {
    "max_depth": [10, 25, 50, 75],
    # "learning_rate" : [0.001,0.01,0.05,0.1],
    # "num_leaves": [100,300,900,1200],
    "n_estimators": [100,200,500], 
    # 'reg_alpha': list(map(lambda x: x*0.1, range(1, 5))), 
    # 'reg_lambda': [0.1, 0.2],
             }

# grid search
grid_result = GridSearchCV(estimator = model,
                           param_grid = param_grid,
                           scoring = 'f1_micro', 
                           cv = skf,
                           return_train_score = True,
                           n_jobs = -1)

grid_result.fit(x_train, np.reshape(y_train, -1))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=LGBMClassifier(device='gpu', force_row_wise=True,
                                      learning_rate=0.15, num_boost_round=200,
                                      num_class=3, objective='multi_class'),
             n_jobs=-1,
             param_grid={'max_depth': [10, 25, 50, 75],
                         'n_estimators': [100, 200, 500]},
             return_train_score=True, scoring='f1_micro')

In [25]:
pprint.pprint(grid_result.best_estimator_)

LGBMClassifier(device='gpu', force_row_wise=True, learning_rate=0.15,
               max_depth=75, n_estimators=500, num_boost_round=200, num_class=3,
               objective='multi_class')


In [26]:
pprint.pprint(grid_result.best_score_)

0.7287754753986526


# Use best estimator 

In [28]:
grid_result.best_estimator_

LGBMClassifier(device='gpu', force_row_wise=True, learning_rate=0.15,
               max_depth=75, n_estimators=500, num_boost_round=200, num_class=3,
               objective='multi_class')

In [29]:
trains = lgb.Dataset(x_train, y_train)
valids = lgb.Dataset(x_test, y_test)

In [38]:
params = {
    "objective": "multiclass",
    "num_class": 3, 
    "metrics": "multi_logloss",
    'device': 'gpu', 
    'max_depth': 75, 
    'force_row_wise': True, 
    "learning_rate": 0.15, 
    # 'reg_alpha': 0.3,
    # 'reg_lambda': 0.3,
}

In [39]:
model = lgb.train(params, trains, valid_sets=valids, num_boost_round=2000, early_stopping_rounds=100)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 234540, number of used features: 37
[LightGBM] [Info] Using GPU Device: NVIDIA A100-SXM4-40GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 8 dense feature groups (1.79 MB) transferred to GPU in 0.009215 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -2.339190
[LightGBM] [Info] Start training from score -0.564027
[LightGBM] [Info] Start training from score -1.094578
[1]	valid_0's multi_logloss: 0.852538
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.814482
[3]	valid_0's multi_logloss: 0.787538
[4]	valid_0's multi_logloss: 0.767254
[5]	valid_0's multi_logloss: 0.751502
[6]	valid_0's multi_logloss: 0.739141
[7]	valid_0's multi_logl

In [40]:
# predict

# テストデータの予測 ((各クラスの予測確率 [クラス0の予測確率,クラス1の予測確率,クラス2の予測確率] を返す))
y_pred_prob = model.predict(x_test)
# テストデータの予測 (予測クラス(0 or 1 or 2)を返す)
y_pred = np.argmax(y_pred_prob, axis=1) # 一番大きい予測確率のクラスを予測クラスに
# 真値と予測値の表示
df_pred = pd.DataFrame({'target':y_test['damage_grade'].values,'target_pred':y_pred})
display(df_pred)

# 真値と予測確率の表示
df_pred_prob = pd.DataFrame({'y':y_test['damage_grade'].values, 'target0_prob':y_pred_prob[:,0], 'target1_prob':y_pred_prob[:,1], 'target2_prob':y_pred_prob[:,2]})
display(df_pred_prob)

acc = f1_score(y_test,y_pred, average='micro')
print('Acc :', acc)

Unnamed: 0,target,target_pred
0,1,1
1,2,2
2,1,2
3,1,1
4,1,1
...,...,...
26056,0,0
26057,1,1
26058,1,1
26059,0,1


Unnamed: 0,y,target0_prob,target1_prob,target2_prob
0,1,0.052355,0.663366,0.284280
1,2,0.001301,0.477457,0.521242
2,1,0.000210,0.384235,0.615555
3,1,0.003291,0.896287,0.100422
4,1,0.251143,0.665758,0.083099
...,...,...,...,...
26056,0,0.476838,0.475961,0.047201
26057,1,0.223785,0.618440,0.157775
26058,1,0.330121,0.613730,0.056149
26059,0,0.292405,0.678399,0.029197


Acc : 0.7460189555274165


# submit

In [45]:
test_values = common.file.read_data('test')

In [46]:
test_values, _ = common.lgbm_preprocessing(test_values, mode='test', features_list=features_list)