In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import lightgbm as lgb

In [4]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV

In [5]:
from sklearn.metrics import accuracy_score, classification_report

In [6]:
train_bert = pd.read_csv('KNOW_2018_train_bert.csv')
test_bert = pd.read_csv('KNOW_2018_test_bert.csv')
train_tf_idf = pd.read_csv('KNOW_2018_train_tfidf.csv')
test_tf_idf = pd.read_csv('KNOW_2018_test_tfidf.csv')
train_simcse = pd.read_csv('KNOW_2018_train_simcse.csv')
test_simcse = pd.read_csv('KNOW_2018_test_simcse.csv')
train_no_text = pd.read_csv('KNOW_2018_train_no_text.csv')
test_no_text = pd.read_csv('KNOW_2018_test_no_text.csv')

# bert

In [11]:
train_bert_X, train_bert_y = train_bert.drop('knowcode', axis=1), train_bert.knowcode
test_bert_X = test_bert

In [12]:
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, max_depth=8, learning_rate=0.02)

In [13]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_bert_X, 
    train_bert_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_bert_y # Classification 중요 option
)

In [14]:
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_bert_X.iloc[train_index], train_bert_X.iloc[test_index]
    y_train, y_test = train_bert_y[train_index], train_bert_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 3.87834
[200]	valid_0's multi_logloss: 3.49812
[300]	valid_0's multi_logloss: 3.23763
[400]	valid_0's multi_logloss: 3.20699
[500]	valid_0's multi_logloss: 3.20486
[600]	valid_0's multi_logloss: 3.20375
[700]	valid_0's multi_logloss: 3.20308
[800]	valid_0's multi_logloss: 3.20284

#1 교차 검증 정확도 : 0.4428,  학습 데이터 크기 : 5804,  검증 데이터 크기 : 1452
#1 검증 세트 인덱스 : [   0    1    2 ... 3763 4044 4092]




[100]	valid_0's multi_logloss: 3.66613
[200]	valid_0's multi_logloss: 3.28878
[300]	valid_0's multi_logloss: 3.04782
[400]	valid_0's multi_logloss: 3.0235
[500]	valid_0's multi_logloss: 3.0232

#2 교차 검증 정확도 : 0.4473,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#2 검증 세트 인덱스 : [ 149  164  233 ... 5306 5501 5502]




[100]	valid_0's multi_logloss: 3.5427
[200]	valid_0's multi_logloss: 3.14194
[300]	valid_0's multi_logloss: 2.88274
[400]	valid_0's multi_logloss: 2.85027
[500]	valid_0's multi_logloss: 2.8489

#3 교차 검증 정확도 : 0.4762,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#3 검증 세트 인덱스 : [ 329  625  708 ... 6267 6365 6416]




[100]	valid_0's multi_logloss: 3.60295
[200]	valid_0's multi_logloss: 3.2265
[300]	valid_0's multi_logloss: 2.9778
[400]	valid_0's multi_logloss: 2.94775
[500]	valid_0's multi_logloss: 2.9458
[600]	valid_0's multi_logloss: 2.94544

#4 교차 검증 정확도 : 0.4618,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#4 검증 세트 인덱스 : [1176 1507 1571 ... 6971 6980 7034]




[100]	valid_0's multi_logloss: 3.62948
[200]	valid_0's multi_logloss: 3.23236
[300]	valid_0's multi_logloss: 2.98174
[400]	valid_0's multi_logloss: 2.95101
[500]	valid_0's multi_logloss: 2.94793
[600]	valid_0's multi_logloss: 2.94716
[700]	valid_0's multi_logloss: 2.94713

#5 교차 검증 정확도 : 0.4748,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#5 검증 세트 인덱스 : [3137 3456 3457 ... 7253 7254 7255]

## 평균 검증 정확도: 0.46058000000000004


In [15]:
test_bert['knowcode'] = model_lgbm.predict(test_bert_X)
submission_bert = test_bert[['index', 'knowcode']]
submission_bert.to_csv('KNOW_2018_bert_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_bert_X, train_bert_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)

# simcse

In [36]:
train_simcse_X, train_simcse_y = train_simcse.drop(['knowcode', 'index'], axis=1), train_simcse.knowcode
test_simcse_X = test_simcse.drop('index', axis=1)

In [37]:
train_simcse

Unnamed: 0,index,cq1,cq2,cq3,cq4,cq5,cq6,cq7,cq8,cq9,...,bq37_1_bert_22,bq37_1_bert_23,bq37_1_bert_24,bq37_1_bert_25,bq37_1_bert_26,bq37_1_bert_27,bq37_1_bert_28,bq37_1_bert_29,bq37_1_bert_30,bq37_1_bert_31
0,0,5,3,4,2,2,5,4,5,4,...,-0.136475,0.217625,-0.335952,0.309630,0.538342,-0.558905,-0.625207,0.212848,-1.013789,0.571870
1,1,4,3,4,4,4,4,4,4,4,...,-0.484079,-0.004640,-0.746827,-0.698481,-0.647769,-0.253628,-0.073871,-0.044415,-0.702257,0.471131
2,2,4,1,5,3,3,4,2,4,1,...,-0.650855,-0.615933,-0.356884,-0.218918,0.076575,-0.117625,-0.385737,0.133675,-0.293114,0.303875
3,3,4,3,4,4,4,3,3,2,3,...,-0.333475,0.003152,0.277275,-0.196111,0.138847,0.027677,0.530003,-0.341574,-0.238093,0.156080
4,4,3,2,3,1,1,4,3,4,3,...,-0.508624,0.186097,0.349935,0.383173,0.289008,-0.471890,-0.280663,0.345207,-0.709913,0.467666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9066,9067,5,5,5,2,2,4,2,5,3,...,0.288827,-0.084053,-0.036524,-1.060192,1.272000,-0.156451,-0.672342,-0.024451,-0.056371,-0.281917
9067,9068,5,5,5,3,3,5,4,4,4,...,0.069330,0.687799,-0.170616,-0.991487,-0.199754,0.106210,-0.017960,-0.104614,-0.624934,0.257686
9068,9069,4,4,4,3,3,5,4,4,4,...,0.362966,-0.259746,0.253413,-0.748183,0.090738,-0.308453,0.342466,0.202401,-0.651561,0.383281
9069,9070,5,2,3,2,3,5,4,4,3,...,0.362064,0.583812,-0.422108,-0.195119,0.116324,-0.137547,-0.175078,-0.077400,0.720627,-0.297357


In [38]:
train_simcse.isnull().sum()

index             0
cq1               0
cq2               0
cq3               0
cq4               0
                 ..
bq37_1_bert_27    0
bq37_1_bert_28    0
bq37_1_bert_29    0
bq37_1_bert_30    0
bq37_1_bert_31    0
Length: 418, dtype: int64

In [39]:
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, max_depth=12, learning_rate=0.02, num_leaves=40)

In [40]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_simcse_X, 
    train_simcse_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_simcse_y # Classification 중요 option
)

In [41]:
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_simcse_X.iloc[train_index], train_simcse_X.iloc[test_index]
    y_train, y_test = train_simcse_y[train_index], train_simcse_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 3.52962
[200]	valid_0's multi_logloss: 3.15848
[300]	valid_0's multi_logloss: 2.90326
[400]	valid_0's multi_logloss: 2.87842
[500]	valid_0's multi_logloss: 2.87678
[600]	valid_0's multi_logloss: 2.87641
[700]	valid_0's multi_logloss: 2.87652

#1 교차 검증 정확도 : 0.4745,  학습 데이터 크기 : 5804,  검증 데이터 크기 : 1452
#1 검증 세트 인덱스 : [   0    1    2 ... 3763 4044 4092]




[100]	valid_0's multi_logloss: 3.42162
[200]	valid_0's multi_logloss: 3.04372
[300]	valid_0's multi_logloss: 2.80873
[400]	valid_0's multi_logloss: 2.78892
[500]	valid_0's multi_logloss: 2.78921

#2 교차 검증 정확도 : 0.479,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#2 검증 세트 인덱스 : [ 149  164  233 ... 5306 5501 5502]




[100]	valid_0's multi_logloss: 3.3349
[200]	valid_0's multi_logloss: 2.93264
[300]	valid_0's multi_logloss: 2.6605
[400]	valid_0's multi_logloss: 2.63265
[500]	valid_0's multi_logloss: 2.63162

#3 교차 검증 정확도 : 0.501,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#3 검증 세트 인덱스 : [ 329  625  708 ... 6267 6365 6416]




[100]	valid_0's multi_logloss: 3.36084
[200]	valid_0's multi_logloss: 2.96353
[300]	valid_0's multi_logloss: 2.70997
[400]	valid_0's multi_logloss: 2.68091
[500]	valid_0's multi_logloss: 2.67852
[600]	valid_0's multi_logloss: 2.67821
[700]	valid_0's multi_logloss: 2.6785

#4 교차 검증 정확도 : 0.5072,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#4 검증 세트 인덱스 : [1176 1507 1571 ... 6971 6980 7034]




[100]	valid_0's multi_logloss: 3.43404
[200]	valid_0's multi_logloss: 3.04488
[300]	valid_0's multi_logloss: 2.78524
[400]	valid_0's multi_logloss: 2.75844
[500]	valid_0's multi_logloss: 2.75491
[600]	valid_0's multi_logloss: 2.75361
[700]	valid_0's multi_logloss: 2.75275
[800]	valid_0's multi_logloss: 2.75234
[900]	valid_0's multi_logloss: 2.75231
[1000]	valid_0's multi_logloss: 2.75208

#5 교차 검증 정확도 : 0.4948,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#5 검증 세트 인덱스 : [3137 3456 3457 ... 7253 7254 7255]

## 평균 검증 정확도: 0.4913


In [42]:
test_simcse['knowcode'] = model_lgbm.predict(test_simcse_X)
submission_simcse = test_simcse[['index', 'knowcode']]
submission_simcse.to_csv('KNOW_2018_simcse_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_simcse_X, train_simcse_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)

# tf_idf

In [7]:
tf_idf_without_knowcode = train_tf_idf.drop(columns=['knowcode'], inplace=False)

In [8]:
test_tf_idf = test_tf_idf[tf_idf_without_knowcode.columns]

In [9]:
train_tf_idf_X, train_tf_idf_y = train_tf_idf.drop(['knowcode', 'index'], axis=1), train_tf_idf.knowcode
test_tf_idf_X = test_tf_idf.drop('index', axis=1)

In [10]:
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.02, max_depth=8)

In [11]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_tf_idf_X, 
    train_tf_idf_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_tf_idf_y # Classification 중요 option
)

In [12]:
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_tf_idf_X.iloc[train_index], train_tf_idf_X.iloc[test_index]
    y_train, y_test = train_tf_idf_y[train_index], train_tf_idf_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 3.5067
[200]	valid_0's multi_logloss: 3.1836
[300]	valid_0's multi_logloss: 2.95668
[400]	valid_0's multi_logloss: 2.9154
[500]	valid_0's multi_logloss: 2.90712
[600]	valid_0's multi_logloss: 2.90661

#1 교차 검증 정확도 : 0.4525,  학습 데이터 크기 : 5804,  검증 데이터 크기 : 1452
#1 검증 세트 인덱스 : [   0    1    2 ... 3763 4044 4092]




[100]	valid_0's multi_logloss: 3.28108
[200]	valid_0's multi_logloss: 2.98692
[300]	valid_0's multi_logloss: 2.79149
[400]	valid_0's multi_logloss: 2.75867
[500]	valid_0's multi_logloss: 2.75318
[600]	valid_0's multi_logloss: 2.74991
[700]	valid_0's multi_logloss: 2.74972

#2 교차 검증 정확도 : 0.4893,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#2 검증 세트 인덱스 : [ 149  164  233 ... 5306 5501 5502]




[100]	valid_0's multi_logloss: 3.14888
[200]	valid_0's multi_logloss: 2.78956
[300]	valid_0's multi_logloss: 2.57575
[400]	valid_0's multi_logloss: 2.53641
[500]	valid_0's multi_logloss: 2.53342
[600]	valid_0's multi_logloss: 2.5333

#3 교차 검증 정확도 : 0.5162,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#3 검증 세트 인덱스 : [ 329  625  708 ... 6267 6365 6416]




[100]	valid_0's multi_logloss: 3.19146
[200]	valid_0's multi_logloss: 2.87635
[300]	valid_0's multi_logloss: 2.64888
[400]	valid_0's multi_logloss: 2.60614
[500]	valid_0's multi_logloss: 2.5976
[600]	valid_0's multi_logloss: 2.59414
[700]	valid_0's multi_logloss: 2.59269
[800]	valid_0's multi_logloss: 2.59175
[900]	valid_0's multi_logloss: 2.59124
[1000]	valid_0's multi_logloss: 2.59102

#4 교차 검증 정확도 : 0.5017,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#4 검증 세트 인덱스 : [1176 1507 1571 ... 6971 6980 7034]




[100]	valid_0's multi_logloss: 3.23576
[200]	valid_0's multi_logloss: 2.91918
[300]	valid_0's multi_logloss: 2.70166
[400]	valid_0's multi_logloss: 2.65928
[500]	valid_0's multi_logloss: 2.64897
[600]	valid_0's multi_logloss: 2.64636

#5 교차 검증 정확도 : 0.5003,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#5 검증 세트 인덱스 : [3137 3456 3457 ... 7253 7254 7255]

## 평균 검증 정확도: 0.492


In [16]:
test_tf_idf['knowcode'] = model_lgbm.predict(test_tf_idf_X)
submission_tf_idf = test_tf_idf[['index', 'knowcode']]
submission_tf_idf.to_csv('KNOW_2018_tfidf_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_tf_idf_X, train_tf_idf_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)

# no_text

In [17]:
train_no_text_X, train_no_text_y = train_no_text.drop(['knowcode', 'index'], axis=1), train_no_text.knowcode
test_no_text_X = test_no_text.drop('index', axis=1)

In [18]:
#하이퍼파라미터 없이, 지금은 임의로 넣어둠
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, max_depth=8, learning_rate=0.01)

In [19]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_no_text_X, 
    train_no_text_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_no_text_y # Classification 중요 option
)

In [20]:
#without hyperparameter
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_no_text_X.iloc[train_index], train_no_text_X.iloc[test_index]
    y_train, y_test = train_no_text_y[train_index], train_no_text_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 4.16443
[200]	valid_0's multi_logloss: 3.83177
[300]	valid_0's multi_logloss: 3.68178
[400]	valid_0's multi_logloss: 3.62291
[500]	valid_0's multi_logloss: 3.55721
[600]	valid_0's multi_logloss: 3.52186
[700]	valid_0's multi_logloss: 3.50978
[800]	valid_0's multi_logloss: 3.49868
[900]	valid_0's multi_logloss: 3.49204
[1000]	valid_0's multi_logloss: 3.48927

#1 교차 검증 정확도 : 0.3974,  학습 데이터 크기 : 5804,  검증 데이터 크기 : 1452
#1 검증 세트 인덱스 : [   0    1    2 ... 3763 4044 4092]




[100]	valid_0's multi_logloss: 3.88318
[200]	valid_0's multi_logloss: 3.49175
[300]	valid_0's multi_logloss: 3.30345
[400]	valid_0's multi_logloss: 3.22133
[500]	valid_0's multi_logloss: 3.15667
[600]	valid_0's multi_logloss: 3.12809
[700]	valid_0's multi_logloss: 3.11589
[800]	valid_0's multi_logloss: 3.10765
[900]	valid_0's multi_logloss: 3.10239
[1000]	valid_0's multi_logloss: 3.10014

#2 교차 검증 정확도 : 0.4459,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#2 검증 세트 인덱스 : [ 149  164  233 ... 5306 5501 5502]




[100]	valid_0's multi_logloss: 3.86155
[200]	valid_0's multi_logloss: 3.45846
[300]	valid_0's multi_logloss: 3.26325
[400]	valid_0's multi_logloss: 3.17813
[500]	valid_0's multi_logloss: 3.0998
[600]	valid_0's multi_logloss: 3.0531
[700]	valid_0's multi_logloss: 3.03309
[800]	valid_0's multi_logloss: 3.02183
[900]	valid_0's multi_logloss: 3.01784
[1000]	valid_0's multi_logloss: 3.01858

#3 교차 검증 정확도 : 0.4549,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#3 검증 세트 인덱스 : [ 329  625  708 ... 6267 6365 6416]




[100]	valid_0's multi_logloss: 3.8862
[200]	valid_0's multi_logloss: 3.4965
[300]	valid_0's multi_logloss: 3.3015
[400]	valid_0's multi_logloss: 3.21785
[500]	valid_0's multi_logloss: 3.13756
[600]	valid_0's multi_logloss: 3.09741
[700]	valid_0's multi_logloss: 3.08144
[800]	valid_0's multi_logloss: 3.07221
[900]	valid_0's multi_logloss: 3.06921
[1000]	valid_0's multi_logloss: 3.0697

#4 교차 검증 정확도 : 0.4452,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#4 검증 세트 인덱스 : [1176 1507 1571 ... 6971 6980 7034]




[100]	valid_0's multi_logloss: 3.89944
[200]	valid_0's multi_logloss: 3.51402
[300]	valid_0's multi_logloss: 3.33333
[400]	valid_0's multi_logloss: 3.26231
[500]	valid_0's multi_logloss: 3.1851
[600]	valid_0's multi_logloss: 3.14706
[700]	valid_0's multi_logloss: 3.13086
[800]	valid_0's multi_logloss: 3.12231
[900]	valid_0's multi_logloss: 3.1177
[1000]	valid_0's multi_logloss: 3.11502

#5 교차 검증 정확도 : 0.4356,  학습 데이터 크기 : 5805,  검증 데이터 크기 : 1451
#5 검증 세트 인덱스 : [3137 3456 3457 ... 7253 7254 7255]

## 평균 검증 정확도: 0.4358000000000001


In [21]:
test_no_text['knowcode'] = model_lgbm.predict(test_no_text_X)
submission_no_text = test_no_text[['index', 'knowcode']]
submission_no_text.to_csv('KNOW_2018_no_text_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_no_text_X, train_no_text_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)