In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import lightgbm as lgb

In [3]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV

In [4]:
from sklearn.metrics import accuracy_score, classification_report

In [5]:
train_bert = pd.read_csv('KNOW_2020_train_bert.csv')
test_bert = pd.read_csv('KNOW_2020_test_bert.csv')
train_tf_idf = pd.read_csv('KNOW_2020_train_tfidf.csv')
test_tf_idf = pd.read_csv('KNOW_2020_test_tfidf.csv')
train_simcse = pd.read_csv('KNOW_2020_train_simcse.csv')
test_simcse = pd.read_csv('KNOW_2020_test_simcse.csv')
train_no_text = pd.read_csv('KNOW_2020_train_no_text.csv')
test_no_text = pd.read_csv('KNOW_2020_test_no_text.csv')

# bert

In [7]:
train_bert_X, train_bert_y = train_bert.drop('knowcode', axis=1), train_bert.knowcode
test_bert_X = test_bert

In [8]:
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, max_depth=6, learning_rate=0.02)

In [9]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_bert_X, 
    train_bert_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_bert_y # Classification 중요 option
)

In [10]:
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_bert_X.iloc[train_index], train_bert_X.iloc[test_index]
    y_train, y_test = train_bert_y[train_index], train_bert_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 3.52208
[200]	valid_0's multi_logloss: 3.22324
[300]	valid_0's multi_logloss: 3.0095
[400]	valid_0's multi_logloss: 2.9736
[500]	valid_0's multi_logloss: 2.96777
[600]	valid_0's multi_logloss: 2.96616
[700]	valid_0's multi_logloss: 2.96519
[800]	valid_0's multi_logloss: 2.96443
[900]	valid_0's multi_logloss: 2.96432

#1 교차 검증 정확도 : 0.4562,  학습 데이터 크기 : 5197,  검증 데이터 크기 : 1300
#1 검증 세트 인덱스 : [   0    1    2 ... 3223 3344 3484]




[100]	valid_0's multi_logloss: 3.5132
[200]	valid_0's multi_logloss: 3.21114
[300]	valid_0's multi_logloss: 3.00735
[400]	valid_0's multi_logloss: 2.97906
[500]	valid_0's multi_logloss: 2.974
[600]	valid_0's multi_logloss: 2.97252
[700]	valid_0's multi_logloss: 2.97223
[800]	valid_0's multi_logloss: 2.97221

#2 교차 검증 정확도 : 0.4623,  학습 데이터 크기 : 5197,  검증 데이터 크기 : 1300
#2 검증 세트 인덱스 : [ 252  285  316 ... 4688 4832 4867]




[100]	valid_0's multi_logloss: 3.60778
[200]	valid_0's multi_logloss: 3.33605
[300]	valid_0's multi_logloss: 3.12537
[400]	valid_0's multi_logloss: 3.09271
[500]	valid_0's multi_logloss: 3.08916
[600]	valid_0's multi_logloss: 3.08801
[700]	valid_0's multi_logloss: 3.08794
[800]	valid_0's multi_logloss: 3.08757
[900]	valid_0's multi_logloss: 3.08716
[1000]	valid_0's multi_logloss: 3.08697

#3 교차 검증 정확도 : 0.4426,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#3 검증 세트 인덱스 : [ 707  794  816 ... 5618 5739 5776]




[100]	valid_0's multi_logloss: 3.67575
[200]	valid_0's multi_logloss: 3.4139
[300]	valid_0's multi_logloss: 3.21515
[400]	valid_0's multi_logloss: 3.17796
[500]	valid_0's multi_logloss: 3.17086
[600]	valid_0's multi_logloss: 3.17019

#4 교차 검증 정확도 : 0.4296,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#4 검증 세트 인덱스 : [1652 1712 1736 ... 6297 6324 6410]




[100]	valid_0's multi_logloss: 3.5246
[200]	valid_0's multi_logloss: 3.23982
[300]	valid_0's multi_logloss: 3.04741
[400]	valid_0's multi_logloss: 3.01815
[500]	valid_0's multi_logloss: 3.01427
[600]	valid_0's multi_logloss: 3.01254
[700]	valid_0's multi_logloss: 3.01214
[800]	valid_0's multi_logloss: 3.01195

#5 교차 검증 정확도 : 0.445,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#5 검증 세트 인덱스 : [2562 2782 3029 ... 6494 6495 6496]

## 평균 검증 정확도: 0.44714


In [11]:
test_bert['knowcode'] = model_lgbm.predict(test_bert_X)
submission_bert = test_bert[['index', 'knowcode']]
submission_bert.to_csv('KNOW_2020_bert_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_bert_X, train_bert_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)

# simcse

In [12]:
train_simcse_X, train_simcse_y = train_simcse.drop(['knowcode', 'index'], axis=1), train_simcse.knowcode
test_simcse_X = test_simcse.drop('index', axis=1)

In [13]:
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, max_depth=10, learning_rate=0.02, num_leaves=40)

In [14]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_simcse_X, 
    train_simcse_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_simcse_y # Classification 중요 option
)

In [15]:
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_simcse_X.iloc[train_index], train_simcse_X.iloc[test_index]
    y_train, y_test = train_simcse_y[train_index], train_simcse_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 3.37571
[200]	valid_0's multi_logloss: 3.08595
[300]	valid_0's multi_logloss: 2.86892
[400]	valid_0's multi_logloss: 2.84222
[500]	valid_0's multi_logloss: 2.83727
[600]	valid_0's multi_logloss: 2.83649
[700]	valid_0's multi_logloss: 2.83644
[800]	valid_0's multi_logloss: 2.83642

#1 교차 검증 정확도 : 0.4869,  학습 데이터 크기 : 5197,  검증 데이터 크기 : 1300
#1 검증 세트 인덱스 : [   0    1    2 ... 3223 3344 3484]




[100]	valid_0's multi_logloss: 3.3138
[200]	valid_0's multi_logloss: 2.98644
[300]	valid_0's multi_logloss: 2.76293
[400]	valid_0's multi_logloss: 2.74127
[500]	valid_0's multi_logloss: 2.74076

#2 교차 검증 정확도 : 0.4877,  학습 데이터 크기 : 5197,  검증 데이터 크기 : 1300
#2 검증 세트 인덱스 : [ 252  285  316 ... 4688 4832 4867]




[100]	valid_0's multi_logloss: 3.46387
[200]	valid_0's multi_logloss: 3.17142
[300]	valid_0's multi_logloss: 2.95773
[400]	valid_0's multi_logloss: 2.93566
[500]	valid_0's multi_logloss: 2.93381
[600]	valid_0's multi_logloss: 2.93278
[700]	valid_0's multi_logloss: 2.93226
[800]	valid_0's multi_logloss: 2.93199

#3 교차 검증 정확도 : 0.4604,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#3 검증 세트 인덱스 : [ 707  794  816 ... 5618 5739 5776]




[100]	valid_0's multi_logloss: 3.46018
[200]	valid_0's multi_logloss: 3.1766
[300]	valid_0's multi_logloss: 2.96085
[400]	valid_0's multi_logloss: 2.93143
[500]	valid_0's multi_logloss: 2.92878
[600]	valid_0's multi_logloss: 2.9276
[700]	valid_0's multi_logloss: 2.92734
[800]	valid_0's multi_logloss: 2.92732

#4 교차 검증 정확도 : 0.465,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#4 검증 세트 인덱스 : [1652 1712 1736 ... 6297 6324 6410]




[100]	valid_0's multi_logloss: 3.37347
[200]	valid_0's multi_logloss: 3.06999
[300]	valid_0's multi_logloss: 2.86462
[400]	valid_0's multi_logloss: 2.84537
[500]	valid_0's multi_logloss: 2.84126
[600]	valid_0's multi_logloss: 2.84129

#5 교차 검증 정확도 : 0.4942,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#5 검증 세트 인덱스 : [2562 2782 3029 ... 6494 6495 6496]

## 평균 검증 정확도: 0.47884000000000004


In [16]:
test_simcse['knowcode'] = model_lgbm.predict(test_simcse_X)
submission_simcse = test_simcse[['index', 'knowcode']]
submission_simcse.to_csv('KNOW_2020_simcse_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_simcse_X, train_simcse_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)

# tf_idf

In [17]:
tf_idf_without_knowcode = train_tf_idf.drop(columns=['knowcode'], inplace=False)

In [18]:
test_tf_idf = test_tf_idf[tf_idf_without_knowcode.columns]

In [19]:
train_tf_idf_X, train_tf_idf_y = train_tf_idf.drop(['knowcode', 'index'], axis=1), train_tf_idf.knowcode
test_tf_idf_X = test_tf_idf.drop('index', axis=1)

In [20]:
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.02, max_depth=8)

In [21]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_tf_idf_X, 
    train_tf_idf_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_tf_idf_y # Classification 중요 option
)

In [22]:
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_tf_idf_X.iloc[train_index], train_tf_idf_X.iloc[test_index]
    y_train, y_test = train_tf_idf_y[train_index], train_tf_idf_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 3.56019
[200]	valid_0's multi_logloss: 3.28167
[300]	valid_0's multi_logloss: 3.05809
[400]	valid_0's multi_logloss: 2.99258
[500]	valid_0's multi_logloss: 2.97917
[600]	valid_0's multi_logloss: 2.97342
[700]	valid_0's multi_logloss: 2.97006
[800]	valid_0's multi_logloss: 2.96777
[900]	valid_0's multi_logloss: 2.96678
[1000]	valid_0's multi_logloss: 2.96628

#1 교차 검증 정확도 : 0.4415,  학습 데이터 크기 : 5197,  검증 데이터 크기 : 1300
#1 검증 세트 인덱스 : [   0    1    2 ... 3223 3344 3484]




[100]	valid_0's multi_logloss: 3.50568
[200]	valid_0's multi_logloss: 3.23215
[300]	valid_0's multi_logloss: 3.04795
[400]	valid_0's multi_logloss: 3.00965
[500]	valid_0's multi_logloss: 3.00711
[600]	valid_0's multi_logloss: 3.00397
[700]	valid_0's multi_logloss: 3.00335
[800]	valid_0's multi_logloss: 3.00265
[900]	valid_0's multi_logloss: 3.00229
[1000]	valid_0's multi_logloss: 3.00225

#2 교차 검증 정확도 : 0.4492,  학습 데이터 크기 : 5197,  검증 데이터 크기 : 1300
#2 검증 세트 인덱스 : [ 252  285  316 ... 4688 4832 4867]




[100]	valid_0's multi_logloss: 3.61992
[200]	valid_0's multi_logloss: 3.35689
[300]	valid_0's multi_logloss: 3.15373
[400]	valid_0's multi_logloss: 3.10645
[500]	valid_0's multi_logloss: 3.09871
[600]	valid_0's multi_logloss: 3.09365
[700]	valid_0's multi_logloss: 3.09213
[800]	valid_0's multi_logloss: 3.09112
[900]	valid_0's multi_logloss: 3.0909

#3 교차 검증 정확도 : 0.4296,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#3 검증 세트 인덱스 : [ 707  794  816 ... 5618 5739 5776]




[100]	valid_0's multi_logloss: 3.5801
[200]	valid_0's multi_logloss: 3.34158
[300]	valid_0's multi_logloss: 3.16024
[400]	valid_0's multi_logloss: 3.11224
[500]	valid_0's multi_logloss: 3.09966
[600]	valid_0's multi_logloss: 3.09412
[700]	valid_0's multi_logloss: 3.0917
[800]	valid_0's multi_logloss: 3.08995
[900]	valid_0's multi_logloss: 3.0891
[1000]	valid_0's multi_logloss: 3.08852

#4 교차 검증 정확도 : 0.4326,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#4 검증 세트 인덱스 : [1652 1712 1736 ... 6297 6324 6410]




[100]	valid_0's multi_logloss: 3.47618
[200]	valid_0's multi_logloss: 3.20046
[300]	valid_0's multi_logloss: 3.01368
[400]	valid_0's multi_logloss: 2.97131
[500]	valid_0's multi_logloss: 2.96145
[600]	valid_0's multi_logloss: 2.95855
[700]	valid_0's multi_logloss: 2.95606
[800]	valid_0's multi_logloss: 2.95441
[900]	valid_0's multi_logloss: 2.95411
[1000]	valid_0's multi_logloss: 2.95372

#5 교차 검증 정확도 : 0.4488,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#5 검증 세트 인덱스 : [2562 2782 3029 ... 6494 6495 6496]

## 평균 검증 정확도: 0.44033999999999995


In [26]:
test_tf_idf['knowcode'] = model_lgbm.predict(test_tf_idf_X)
submission_tf_idf = test_tf_idf[['index', 'knowcode']]
submission_tf_idf.to_csv('KNOW_2020_tfidf_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_tf_idf_X, train_tf_idf_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)

# no_text

In [27]:
train_no_text_X, train_no_text_y = train_no_text.drop(['knowcode', 'index'], axis=1), train_no_text.knowcode
test_no_text_X = test_no_text.drop('index', axis=1)

In [28]:
#하이퍼파라미터 없이, 지금은 임의로 넣어둠
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, max_depth=6, learning_rate=0.01)

In [29]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_no_text_X, 
    train_no_text_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_no_text_y # Classification 중요 option
)

In [30]:
#without hyperparameter
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_no_text_X.iloc[train_index], train_no_text_X.iloc[test_index]
    y_train, y_test = train_no_text_y[train_index], train_no_text_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 4.37409
[200]	valid_0's multi_logloss: 4.02285
[300]	valid_0's multi_logloss: 3.85718
[400]	valid_0's multi_logloss: 3.77714
[500]	valid_0's multi_logloss: 3.6879
[600]	valid_0's multi_logloss: 3.60098
[700]	valid_0's multi_logloss: 3.55831
[800]	valid_0's multi_logloss: 3.53329
[900]	valid_0's multi_logloss: 3.51858
[1000]	valid_0's multi_logloss: 3.51199

#1 교차 검증 정확도 : 0.3546,  학습 데이터 크기 : 5197,  검증 데이터 크기 : 1300
#1 검증 세트 인덱스 : [   0    1    2 ... 3223 3344 3484]




[100]	valid_0's multi_logloss: 4.2172
[200]	valid_0's multi_logloss: 3.85313
[300]	valid_0's multi_logloss: 3.67781
[400]	valid_0's multi_logloss: 3.601
[500]	valid_0's multi_logloss: 3.53148
[600]	valid_0's multi_logloss: 3.47404
[700]	valid_0's multi_logloss: 3.44667
[800]	valid_0's multi_logloss: 3.43466
[900]	valid_0's multi_logloss: 3.43059
[1000]	valid_0's multi_logloss: 3.42833

#2 교차 검증 정확도 : 0.3846,  학습 데이터 크기 : 5197,  검증 데이터 크기 : 1300
#2 검증 세트 인덱스 : [ 252  285  316 ... 4688 4832 4867]




[100]	valid_0's multi_logloss: 4.30519
[200]	valid_0's multi_logloss: 3.93699
[300]	valid_0's multi_logloss: 3.74195
[400]	valid_0's multi_logloss: 3.64855
[500]	valid_0's multi_logloss: 3.55839
[600]	valid_0's multi_logloss: 3.47808
[700]	valid_0's multi_logloss: 3.44217
[800]	valid_0's multi_logloss: 3.4293
[900]	valid_0's multi_logloss: 3.42406
[1000]	valid_0's multi_logloss: 3.41854

#3 교차 검증 정확도 : 0.3811,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#3 검증 세트 인덱스 : [ 707  794  816 ... 5618 5739 5776]




[100]	valid_0's multi_logloss: 4.34438
[200]	valid_0's multi_logloss: 3.98138
[300]	valid_0's multi_logloss: 3.80397
[400]	valid_0's multi_logloss: 3.72689
[500]	valid_0's multi_logloss: 3.64249
[600]	valid_0's multi_logloss: 3.57384
[700]	valid_0's multi_logloss: 3.54308
[800]	valid_0's multi_logloss: 3.52596
[900]	valid_0's multi_logloss: 3.51396
[1000]	valid_0's multi_logloss: 3.50682

#4 교차 검증 정확도 : 0.3557,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#4 검증 세트 인덱스 : [1652 1712 1736 ... 6297 6324 6410]




[100]	valid_0's multi_logloss: 4.33504
[200]	valid_0's multi_logloss: 3.97198
[300]	valid_0's multi_logloss: 3.77654
[400]	valid_0's multi_logloss: 3.69381
[500]	valid_0's multi_logloss: 3.60389
[600]	valid_0's multi_logloss: 3.53805
[700]	valid_0's multi_logloss: 3.5057
[800]	valid_0's multi_logloss: 3.49213
[900]	valid_0's multi_logloss: 3.48625
[1000]	valid_0's multi_logloss: 3.48206

#5 교차 검증 정확도 : 0.3657,  학습 데이터 크기 : 5198,  검증 데이터 크기 : 1299
#5 검증 세트 인덱스 : [2562 2782 3029 ... 6494 6495 6496]

## 평균 검증 정확도: 0.36834


In [31]:
test_no_text['knowcode'] = model_lgbm.predict(test_no_text_X)
submission_no_text = test_no_text[['index', 'knowcode']]
submission_no_text.to_csv('KNOW_2020_no_text_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_no_text_X, train_no_text_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)