### Import libraries & settings

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, f1_score, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

from collections import Counter

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Load the training table first, then the test table, from the working directory.
train, test_real = (
    pd.read_csv(csv_name) for csv_name in ("train_dat.csv", "test_dat.csv")
)

# Sanity check: (rows, columns) of each table.
(train.shape, test_real.shape)

((699978, 48), (7078, 47))

##### cv data 나누기

In [None]:
# Columns excluded from the model inputs. One shared list is applied to both
# the training and the test table so the two feature sets cannot drift apart.
unused_cols = ['mea_ddhr', 'tem_in_loc', 'hum_out_loc', 'wind_dir129',
               'spot_press129', 'sea_press129',
               'rain616', 'dp_out', 'dp616', 'hour', 'min']

# Drop incomplete rows once, then derive features and target from that same
# frame: they stay row-aligned and the full table is not scanned twice.
train_complete = train.dropna(axis=0)
train_x = train_complete.drop(['cond_loc'] + unused_cols, axis=1)
train_y = train_complete['cond_loc']

# Test features, selected in the training column order.
# (Test rows are not filtered for missing values, as in the original cell.)
x_test_real = test_real.drop(unused_cols, axis=1)[train_x.columns]

# Only the last expression of a cell is displayed, so all shapes are reported here.
train_x.shape, train_y.shape, x_test_real.shape

((657740, 46), (657740,))

#### CSI 구하는 함수

In [None]:
def CSIF(y_true, y_pred):
    """Return the CSI (critical success index) of binary 0/1 predictions.

    CSI = hits / (hits + misses + false alarms), where label 1 is the event.
    Correct rejections (observed 0, predicted 0) do not enter the score.

    The counts are taken directly from the label arrays instead of indexing
    into a confusion matrix whose shape is inferred from the data: with only
    one class present that matrix is 1x1 and indexing [1][1] raises
    IndexError, even for a perfect all-positive prediction.

    Args:
        y_true: Array-like of observed labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length as y_true.

    Returns:
        numpy.float64 in [0, 1]; NaN when label 1 occurs in neither input,
        because the score is undefined (0 / 0) in that case.

    Raises:
        ValueError: If the inputs do not contain the same number of labels.
    """
    observed = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if observed.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{observed.shape[0]} and {predicted.shape[0]}"
        )

    observed_event = observed == 1
    predicted_event = predicted == 1

    hits = np.count_nonzero(observed_event & predicted_event)
    misses = np.count_nonzero(observed_event & ~predicted_event)
    false_alarms = np.count_nonzero(~observed_event & predicted_event)

    denominator = hits + misses + false_alarms
    if denominator == 0:
        # No event observed or predicted: CSI is undefined.
        return np.float64(np.nan)
    return np.float64(hits / denominator)

##### modelling

In [None]:
# Split off a validation set
SEED = 1886  # one seed shared by the train/validation split and the booster

# Hold out 10% of the complete training rows for validation / early stopping.
x_train, x_val, y_train, y_val = train_test_split(
    train_x, train_y, test_size=0.1, random_state=SEED)

# Booster parameters.
# NOTE(review): the LightGBM parameter docs state that bagging_fraction only
# takes effect when bagging_freq is set to a non-zero value. No bagging_freq
# is given here, so row bagging is presumably inactive. It is left unchanged
# because the decision thresholds used later were tuned against this model.
params = {'learning_rate': 0.01,
          'num_iterations': 2000,
          'max_depth': 5,
          'bagging_fraction': 0.8,
          'feature_fraction': 0.7,
          'scale_pos_weight': 1,
          'metric': 'auc',
          'objective': 'binary',
          'random_state': SEED
          }

dtrain = lgb.Dataset(x_train, y_train)
dvalid = lgb.Dataset(x_val, y_val)

# Fit the model, monitoring AUC on the held-out set.
# NOTE(review): the early_stopping_rounds and verbose_eval keyword arguments
# were removed from lgb.train() in LightGBM 4.0. On 4.x pass
# callbacks=[lgb.early_stopping(500), lgb.log_evaluation(...)] instead.
lgbm = lgb.train(params,
                 dtrain,
                 valid_sets=dvalid,
                 early_stopping_rounds=500,
                 verbose_eval=-1)

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.997838
[200]	valid_0's auc: 0.998505
[300]	valid_0's auc: 0.998775
[400]	valid_0's auc: 0.998959
[500]	valid_0's auc: 0.999118
[600]	valid_0's auc: 0.999225
[700]	valid_0's auc: 0.999312
[800]	valid_0's auc: 0.999366
[900]	valid_0's auc: 0.99943
[1000]	valid_0's auc: 0.999471
[1100]	valid_0's auc: 0.999513
[1200]	valid_0's auc: 0.999551
[1300]	valid_0's auc: 0.999589
[1400]	valid_0's auc: 0.999613
[1500]	valid_0's auc: 0.999633
[1600]	valid_0's auc: 0.999651
[1700]	valid_0's auc: 0.999664
[1800]	valid_0's auc: 0.999676
[1900]	valid_0's auc: 0.999683
[2000]	valid_0's auc: 0.999699
Did not meet early stopping. Best iteration is:
[2000]	valid_0's auc: 0.999699


In [None]:
# Decision thresholds on the predicted probability, one per `time` group.
THRESHOLD_24 = 0.0075
THRESHOLD_48 = 0.008

# Score every test row with the trained booster.
test_real['prob_cond_loc'] = lgbm.predict(x_test_real)

# Split by the `time` column. .copy() makes each subset an independent frame,
# so adding the label column below is not a chained assignment on a slice of
# test_real.
test_24 = test_real[test_real['time'] == 24].copy()
test_48 = test_real[test_real['time'] == 48].copy()

# Mean predicted probability per group.
prob_24 = np.mean(test_24.prob_cond_loc)
prob_48 = np.mean(test_48.prob_cond_loc)

# Label a row 1 when its probability exceeds the group's threshold, else 0.
test_24['cond_loc'] = np.where(test_24['prob_cond_loc'] > THRESHOLD_24, 1, 0)
test_48['cond_loc'] = np.where(test_48['prob_cond_loc'] > THRESHOLD_48, 1, 0)

# Number of rows labelled 1 in each group.
cond_loc_24 = np.sum(test_24.cond_loc)
cond_loc_48 = np.sum(test_48.cond_loc)

print("결로 개수 : ", cond_loc_24, cond_loc_48)
print("결로 확률 평균 : ", prob_24*100, prob_48 *100)

결로 개수 :  336 323
결로 확률 평균 :  1.6885024025816657 2.515084358039384


In [None]:
# Save: stack the two labelled subsets and write them to CSV.
test_final = pd.concat([test_24, test_48], axis=0)
test_final.to_csv("test_final_tuning.csv", index=False)

# Only a cell's last expression is displayed, so the shape comparison goes last.
# NOTE(review): test_24 / test_48 are filtered on time == 24 / 48, so rows of
# test_real with any other `time` value are absent from test_final.
test_final.shape, test_real.shape