In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import lightgbm as lgb

In [3]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV

In [4]:
from sklearn.metrics import accuracy_score, classification_report

In [5]:
train_bert = pd.read_csv('KNOW_2019_train_bert.csv')
test_bert = pd.read_csv('KNOW_2019_test_bert.csv')
train_tf_idf = pd.read_csv('KNOW_2019_train_tfidf.csv')
test_tf_idf = pd.read_csv('KNOW_2019_test_tfidf.csv')
train_simcse = pd.read_csv('KNOW_2019_train_simcse.csv')
test_simcse = pd.read_csv('KNOW_2019_test_simcse.csv')
train_no_text = pd.read_csv('KNOW_2019_train_no_text.csv')
test_no_text = pd.read_csv('KNOW_2019_test_no_text.csv')

# bert

In [7]:
train_bert_X, train_bert_y = train_bert.drop('knowcode', axis=1), train_bert.knowcode
test_bert_X = test_bert

In [8]:
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, max_depth=6, learning_rate=0.02)

In [9]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_bert_X, 
    train_bert_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_bert_y # Classification 중요 option
)

In [10]:
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_bert_X.iloc[train_index], train_bert_X.iloc[test_index]
    y_train, y_test = train_bert_y[train_index], train_bert_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 3.76208
[200]	valid_0's multi_logloss: 3.38448
[300]	valid_0's multi_logloss: 3.13114
[400]	valid_0's multi_logloss: 3.10134
[500]	valid_0's multi_logloss: 3.09645
[600]	valid_0's multi_logloss: 3.09454
[700]	valid_0's multi_logloss: 3.09347
[800]	valid_0's multi_logloss: 3.09317
[900]	valid_0's multi_logloss: 3.09293
[1000]	valid_0's multi_logloss: 3.09287

#1 교차 검증 정확도 : 0.4335,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#1 검증 세트 인덱스 : [   0    1    2 ... 3208 3351 3574]




[100]	valid_0's multi_logloss: 3.66027
[200]	valid_0's multi_logloss: 3.26509
[300]	valid_0's multi_logloss: 3.01518
[400]	valid_0's multi_logloss: 2.98821
[500]	valid_0's multi_logloss: 2.9876

#2 교차 검증 정확도 : 0.4444,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#2 검증 세트 인덱스 : [ 105  185  239 ... 4970 5093 5198]




[100]	valid_0's multi_logloss: 3.64051
[200]	valid_0's multi_logloss: 3.26928
[300]	valid_0's multi_logloss: 3.04434
[400]	valid_0's multi_logloss: 3.01679
[500]	valid_0's multi_logloss: 3.01688

#3 교차 검증 정확도 : 0.4518,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#3 검증 세트 인덱스 : [ 443  447  619 ... 6010 6070 6231]




[100]	valid_0's multi_logloss: 3.5859
[200]	valid_0's multi_logloss: 3.23231
[300]	valid_0's multi_logloss: 3.00273
[400]	valid_0's multi_logloss: 2.97664
[500]	valid_0's multi_logloss: 2.97644

#4 교차 검증 정확도 : 0.4481,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#4 검증 세트 인덱스 : [1315 1667 1807 ... 6508 6629 6690]




[100]	valid_0's multi_logloss: 3.69509
[200]	valid_0's multi_logloss: 3.32198
[300]	valid_0's multi_logloss: 3.07975
[400]	valid_0's multi_logloss: 3.04493
[500]	valid_0's multi_logloss: 3.04006
[600]	valid_0's multi_logloss: 3.03879
[700]	valid_0's multi_logloss: 3.03831

#5 교차 검증 정확도 : 0.4408,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#5 검증 세트 인덱스 : [2451 2777 2931 ... 6837 6838 6839]

## 평균 검증 정확도: 0.44372


In [11]:
test_bert['knowcode'] = model_lgbm.predict(test_bert_X)
submission_bert = test_bert[['index', 'knowcode']]
submission_bert.to_csv('KNOW_2019_bert_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_bert_X, train_bert_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)

# simcse

In [18]:
train_simcse_X, train_simcse_y = train_simcse.drop(['knowcode', 'index'], axis=1), train_simcse.knowcode
test_simcse_X = test_simcse.drop('index', axis=1)

In [19]:
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, max_depth=12, learning_rate=0.02, num_leaves=40)

In [20]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_simcse_X, 
    train_simcse_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_simcse_y # Classification 중요 option
)

In [21]:
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_simcse_X.iloc[train_index], train_simcse_X.iloc[test_index]
    y_train, y_test = train_simcse_y[train_index], train_simcse_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 3.51064
[200]	valid_0's multi_logloss: 3.17603
[300]	valid_0's multi_logloss: 2.97566
[400]	valid_0's multi_logloss: 2.95545
[500]	valid_0's multi_logloss: 2.95102
[600]	valid_0's multi_logloss: 2.9493
[700]	valid_0's multi_logloss: 2.94825
[800]	valid_0's multi_logloss: 2.94757
[900]	valid_0's multi_logloss: 2.94729
[1000]	valid_0's multi_logloss: 2.94714

#1 교차 검증 정확도 : 0.4642,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#1 검증 세트 인덱스 : [   0    1    2 ... 3208 3351 3574]




[100]	valid_0's multi_logloss: 3.475
[200]	valid_0's multi_logloss: 3.09279
[300]	valid_0's multi_logloss: 2.85675
[400]	valid_0's multi_logloss: 2.84041
[500]	valid_0's multi_logloss: 2.83821
[600]	valid_0's multi_logloss: 2.83811

#2 교차 검증 정확도 : 0.47,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#2 검증 세트 인덱스 : [ 105  185  239 ... 4970 5093 5198]




[100]	valid_0's multi_logloss: 3.45398
[200]	valid_0's multi_logloss: 3.06177
[300]	valid_0's multi_logloss: 2.82626
[400]	valid_0's multi_logloss: 2.80467
[500]	valid_0's multi_logloss: 2.80381
[600]	valid_0's multi_logloss: 2.80377

#3 교차 검증 정확도 : 0.4868,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#3 검증 세트 인덱스 : [ 443  447  619 ... 6010 6070 6231]




[100]	valid_0's multi_logloss: 3.36641
[200]	valid_0's multi_logloss: 3.02514
[300]	valid_0's multi_logloss: 2.81736
[400]	valid_0's multi_logloss: 2.80111
[500]	valid_0's multi_logloss: 2.80096
[600]	valid_0's multi_logloss: 2.80131

#4 교차 검증 정확도 : 0.4963,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#4 검증 세트 인덱스 : [1315 1667 1807 ... 6508 6629 6690]




[100]	valid_0's multi_logloss: 3.49966
[200]	valid_0's multi_logloss: 3.14386
[300]	valid_0's multi_logloss: 2.92729
[400]	valid_0's multi_logloss: 2.90413
[500]	valid_0's multi_logloss: 2.90009

#5 교차 검증 정확도 : 0.4722,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#5 검증 세트 인덱스 : [2451 2777 2931 ... 6837 6838 6839]

## 평균 검증 정확도: 0.4779


In [22]:
test_simcse['knowcode'] = model_lgbm.predict(test_simcse_X)
submission_simcse = test_simcse[['index', 'knowcode']]
submission_simcse.to_csv('KNOW_2019_simcse_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_simcse_X, train_simcse_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)

# tf_idf

In [33]:
tf_idf_without_knowcode = train_tf_idf.drop(columns=['knowcode'], inplace=False)

In [34]:
test_tf_idf = test_tf_idf[tf_idf_without_knowcode.columns]

In [35]:
train_tf_idf_X, train_tf_idf_y = train_tf_idf.drop(['knowcode', 'index'], axis=1), train_tf_idf.knowcode
test_tf_idf_X = test_tf_idf.drop('index', axis=1)

In [36]:
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.02, max_depth=8)

In [37]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_tf_idf_X, 
    train_tf_idf_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_tf_idf_y # Classification 중요 option
)

In [38]:
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_tf_idf_X.iloc[train_index], train_tf_idf_X.iloc[test_index]
    y_train, y_test = train_tf_idf_y[train_index], train_tf_idf_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 3.50234
[200]	valid_0's multi_logloss: 3.16795
[300]	valid_0's multi_logloss: 2.94907
[400]	valid_0's multi_logloss: 2.90648
[500]	valid_0's multi_logloss: 2.90103
[600]	valid_0's multi_logloss: 2.89825
[700]	valid_0's multi_logloss: 2.89673
[800]	valid_0's multi_logloss: 2.89551
[900]	valid_0's multi_logloss: 2.89484
[1000]	valid_0's multi_logloss: 2.89431

#1 교차 검증 정확도 : 0.443,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#1 검증 세트 인덱스 : [   0    1    2 ... 3208 3351 3574]




[100]	valid_0's multi_logloss: 3.40588
[200]	valid_0's multi_logloss: 3.07792
[300]	valid_0's multi_logloss: 2.86142
[400]	valid_0's multi_logloss: 2.81761
[500]	valid_0's multi_logloss: 2.8102
[600]	valid_0's multi_logloss: 2.80641
[700]	valid_0's multi_logloss: 2.80433
[800]	valid_0's multi_logloss: 2.80376
[900]	valid_0's multi_logloss: 2.80308
[1000]	valid_0's multi_logloss: 2.80303

#2 교차 검증 정확도 : 0.462,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#2 검증 세트 인덱스 : [ 105  185  239 ... 4970 5093 5198]




[100]	valid_0's multi_logloss: 3.38672
[200]	valid_0's multi_logloss: 3.103
[300]	valid_0's multi_logloss: 2.90897
[400]	valid_0's multi_logloss: 2.86758
[500]	valid_0's multi_logloss: 2.86465
[600]	valid_0's multi_logloss: 2.86454
[700]	valid_0's multi_logloss: 2.86389
[800]	valid_0's multi_logloss: 2.86401

#3 교차 검증 정확도 : 0.4576,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#3 검증 세트 인덱스 : [ 443  447  619 ... 6010 6070 6231]




[100]	valid_0's multi_logloss: 3.33627
[200]	valid_0's multi_logloss: 3.03659
[300]	valid_0's multi_logloss: 2.83678
[400]	valid_0's multi_logloss: 2.80658
[500]	valid_0's multi_logloss: 2.80248

#4 교차 검증 정확도 : 0.4649,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#4 검증 세트 인덱스 : [1315 1667 1807 ... 6508 6629 6690]




[100]	valid_0's multi_logloss: 3.40398
[200]	valid_0's multi_logloss: 3.07102
[300]	valid_0's multi_logloss: 2.84887
[400]	valid_0's multi_logloss: 2.80283
[500]	valid_0's multi_logloss: 2.79502
[600]	valid_0's multi_logloss: 2.7946

#5 교차 검증 정확도 : 0.4803,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#5 검증 세트 인덱스 : [2451 2777 2931 ... 6837 6838 6839]

## 평균 검증 정확도: 0.4615600000000001


In [42]:
test_tf_idf['knowcode'] = model_lgbm.predict(test_tf_idf_X)
submission_tf_idf = test_tf_idf[['index', 'knowcode']]
submission_tf_idf.to_csv('KNOW_2019_tfidf_submission.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_tf_idf['knowcode'] = model_lgbm.predict(test_tf_idf_X)


In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_tf_idf_X, train_tf_idf_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)

# no_text

In [45]:
train_no_text_X, train_no_text_y = train_no_text.drop(['knowcode', 'index'], axis=1), train_no_text.knowcode
test_no_text_X = test_no_text.drop('index', axis=1)

In [46]:
#하이퍼파라미터 없이, 지금은 임의로 넣어둠
model_lgbm = lgb.LGBMClassifier(n_estimators=1000, max_depth=6, learning_rate=0.01)

In [47]:
# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    train_no_text_X, 
    train_no_text_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True,
    stratify = train_no_text_y # Classification 중요 option
)

In [48]:
#without hyperparameter
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []
n_iter = 0

for train_index, test_index in kfold.split(tr_val_X, tr_val_y):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = train_no_text_X.iloc[train_index], train_no_text_X.iloc[test_index]
    y_train, y_test = train_no_text_y[train_index], train_no_text_y[test_index]
    
    model_lgbm.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=100, verbose=100)
    pred = model_lgbm.predict(x_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4) # 소수점 4자리 반올림
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('\n#{0} 교차 검증 정확도 : {1},  학습 데이터 크기 : {2},  검증 데이터 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스 : {1}'.format(n_iter,test_index))
    cv_accuracy.append(accuracy)
    
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))



[100]	valid_0's multi_logloss: 4.10121
[200]	valid_0's multi_logloss: 3.73409
[300]	valid_0's multi_logloss: 3.55801
[400]	valid_0's multi_logloss: 3.47535
[500]	valid_0's multi_logloss: 3.38161
[600]	valid_0's multi_logloss: 3.32154
[700]	valid_0's multi_logloss: 3.30294
[800]	valid_0's multi_logloss: 3.29478
[900]	valid_0's multi_logloss: 3.29236

#1 교차 검증 정확도 : 0.3947,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#1 검증 세트 인덱스 : [   0    1    2 ... 3208 3351 3574]




[100]	valid_0's multi_logloss: 3.96158
[200]	valid_0's multi_logloss: 3.57862
[300]	valid_0's multi_logloss: 3.37149
[400]	valid_0's multi_logloss: 3.26071
[500]	valid_0's multi_logloss: 3.16268
[600]	valid_0's multi_logloss: 3.11521
[700]	valid_0's multi_logloss: 3.08655
[800]	valid_0's multi_logloss: 3.07197
[900]	valid_0's multi_logloss: 3.06616
[1000]	valid_0's multi_logloss: 3.06269

#2 교차 검증 정확도 : 0.4232,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#2 검증 세트 인덱스 : [ 105  185  239 ... 4970 5093 5198]




[100]	valid_0's multi_logloss: 4.04657
[200]	valid_0's multi_logloss: 3.65539
[300]	valid_0's multi_logloss: 3.46913
[400]	valid_0's multi_logloss: 3.39734
[500]	valid_0's multi_logloss: 3.31695
[600]	valid_0's multi_logloss: 3.26114
[700]	valid_0's multi_logloss: 3.23342
[800]	valid_0's multi_logloss: 3.21914
[900]	valid_0's multi_logloss: 3.21625
[1000]	valid_0's multi_logloss: 3.21456

#3 교차 검증 정확도 : 0.413,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#3 검증 세트 인덱스 : [ 443  447  619 ... 6010 6070 6231]




[100]	valid_0's multi_logloss: 3.92117
[200]	valid_0's multi_logloss: 3.53956
[300]	valid_0's multi_logloss: 3.35062
[400]	valid_0's multi_logloss: 3.25987
[500]	valid_0's multi_logloss: 3.16706
[600]	valid_0's multi_logloss: 3.11239
[700]	valid_0's multi_logloss: 3.0885
[800]	valid_0's multi_logloss: 3.08028
[900]	valid_0's multi_logloss: 3.07827
[1000]	valid_0's multi_logloss: 3.07681

#4 교차 검증 정확도 : 0.4327,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#4 검증 세트 인덱스 : [1315 1667 1807 ... 6508 6629 6690]




[100]	valid_0's multi_logloss: 4.04466
[200]	valid_0's multi_logloss: 3.65248
[300]	valid_0's multi_logloss: 3.44625
[400]	valid_0's multi_logloss: 3.35375
[500]	valid_0's multi_logloss: 3.25989
[600]	valid_0's multi_logloss: 3.19887
[700]	valid_0's multi_logloss: 3.17315
[800]	valid_0's multi_logloss: 3.15975
[900]	valid_0's multi_logloss: 3.15412
[1000]	valid_0's multi_logloss: 3.15096

#5 교차 검증 정확도 : 0.4232,  학습 데이터 크기 : 5472,  검증 데이터 크기 : 1368
#5 검증 세트 인덱스 : [2451 2777 2931 ... 6837 6838 6839]

## 평균 검증 정확도: 0.41736000000000006


In [49]:
test_no_text['knowcode'] = model_lgbm.predict(test_no_text_X)
submission_no_text = test_no_text[['index', 'knowcode']]
submission_no_text.to_csv('KNOW_2019_no_text_submission.csv')

In [None]:
params = {
    'learning_rate': [0.02, 0.1, 0.05],
    'max_depth': [6,8]
    
}

gs = GridSearchCV(estimator=model,
                  param_grid = params,
                  scoring = 'neg_mean_squared_error', 
                  cv=kfold)

In [None]:
gs.fit(train_no_text_X, train_no_text_y)

In [None]:
print('Hyper Parameter: ', gs.best_params_)

In [None]:
print('cv score with Hyper Parameter', gs.best_score_)