## 데이터 불러오기 (Read Data)

In [2]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')


from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold,GroupKFold
from sklearn.metrics import f1_score
import lightgbm as lgb
# from lightgbm import LGBMClassifier as lgb

# import optuna.integration.lightgbm as lgb

# from imblearn.over_sampling import SMOTE
SEED = 1996

In [6]:
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test_x.csv')
submission = pd.read_csv('sample_submission.csv')

## 데이터 구조 확인 (Checking the shapes of data)

In [7]:
print(train.shape)
print(test.shape)
print(submission.shape)

(45532, 78)
(11383, 77)
(11383, 2)


In [10]:
all_df = pd.concat([train, test])
all_df.reset_index(inplace = True, drop = True)
del(all_df['index'])

In [12]:
all_df.head()

Unnamed: 0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,...,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13
0,3.0,363,4.0,1370,5.0,997,1.0,1024,2.0,1577,...,0,1,0,1,1,0,1,0,1,1
1,5.0,647,5.0,1313,3.0,3387,5.0,2969,1.0,4320,...,1,1,0,1,1,0,1,0,1,1
2,4.0,1623,1.0,1480,1.0,1021,4.0,3374,5.0,1333,...,1,1,0,1,1,1,1,0,1,1
3,3.0,504,3.0,2311,4.0,992,3.0,3245,1.0,357,...,0,0,0,0,1,0,1,0,1,1
4,1.0,927,1.0,707,5.0,556,2.0,1062,1.0,1014,...,1,1,1,1,1,0,1,1,1,1


In [11]:
all_df['gain-loss'] = all_df['capital_gain'] - all_df['capital_loss']
all_df['work'] = all_df['workclass'].apply(str) + '_' + all_df['occupation'].apply(str)
all_df['human'] = all_df['race'].apply(str) + '_' + all_df['sex'].apply(str)

all_df.loc[all_df['native_country'] == '?', 'native_country'] = np.nan
all_df.loc[all_df['workclass'] == '?', 'native_country'] = np.nan
all_df.loc[all_df['occupation'] == '?', 'native_country'] = np.nan

KeyError: 'capital_gain'

In [None]:
log_var = ['capital_gain','capital_loss','gain-loss','fnlwgt']
for col in log_var:
    all_df[col] = np.log1p(all_df[col])

In [None]:
train = all_df.iloc[:26049,:]
test = all_df.iloc[26049:,:]

## 분류 작업에 필용한 함수 불러오기 (Import methods for classification task)

In [None]:
from pycaret.classification import *

In [5]:
# 'voted' 컬럼이 예측 대상이므로 target 인자에 명시
# 'voted' column is the target variable
clf = setup(data = train, target = 'income', session_id = 1996,silent = True)

NameError: name 'setup' is not defined

In [24]:
best_3 = compare_models(sort = 'F1', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Light Gradient Boosting Machine,0.8713,0.927,0.6581,0.7765,0.7121,0.63,0.6338,0.4691
1,Extreme Gradient Boosting,0.8703,0.927,0.661,0.7709,0.7114,0.6284,0.6317,12.0064
2,CatBoost Classifier,0.8717,0.9276,0.6481,0.7845,0.7096,0.6282,0.6331,7.7538
3,Gradient Boosting Classifier,0.8649,0.9218,0.6055,0.7867,0.684,0.6,0.6085,4.5527
4,Ada Boost Classifier,0.8586,0.9154,0.6089,0.7589,0.6755,0.5865,0.5924,1.4354
5,Linear Discriminant Analysis,0.8429,0.8961,0.5951,0.7088,0.6468,0.5468,0.5503,0.3378
6,Random Forest Classifier,0.8444,0.8791,0.5806,0.7217,0.6431,0.5452,0.5507,0.127
7,Logistic Regression,0.8381,0.8948,0.581,0.6991,0.6342,0.5315,0.5355,0.8151
8,Extra Trees Classifier,0.8296,0.8769,0.6028,0.6623,0.6308,0.5204,0.5216,1.1669
9,Ridge Classifier,0.8428,0.0,0.5448,0.7372,0.6263,0.5296,0.5395,0.0634


## 모델 앙상블 (Model Ensemble)

In [25]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8687,0.9232,0.6429,0.7756,0.703,0.6197,0.6242
1,0.8692,0.9237,0.6546,0.7707,0.7079,0.6244,0.6278
2,0.8794,0.9337,0.658,0.8081,0.7253,0.6491,0.6547
3,0.8709,0.9282,0.6467,0.7822,0.708,0.626,0.6307
4,0.8738,0.9291,0.6712,0.7769,0.7202,0.6393,0.6422
Mean,0.8724,0.9276,0.6547,0.7827,0.7129,0.6317,0.6359
SD,0.0039,0.0038,0.0099,0.0132,0.0084,0.0109,0.0112


## 모델 예측 (Prediction)
- 구축된 앙상블 모델을 통해 예측을 해보겠습니다. 
- setup 환경에 이미 hold-out set이 존재하므로 해당 데이터에 대해 예측을 하여 모델 성능을 확인하겠습니다. 

----
- We will use the ensembled model on predicting unseen data.
- There is already a hold-out set constucted on our environment so we will test on it to evaluate the performance.

In [26]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8737,0.9277,0.6591,0.7848,0.7165,0.636,0.64


## 전체 데이터에 대한 재학습 (Re-training the model on whole data)

- 현재까지 실험은 주어진 train 데이터를 다시 한 번 train / validation으로 나눠서 실험을 한 것이므로, 전체 train 데이터에 학습되어 있지 않습니다. 
- 최적의 성능을 위해 전체 데이터에 학습을 시켜주도록 하겠습니다. 

------
- Until now we have splitted the given train data into another train / validation sets to experiment. So the models are not trained on the full training data set.
- We will train the model on the whole dataset for the most optimal performance. 

In [27]:
final_model = finalize_model(blended)

## 대회용 test set에 대한 예측 (Predicting on test set for the competition)

- predict_model 함수를 통해 재학습된 모델을 대회용 test set에 대해 예측해보겠습니다. 
- We will now use the re-trained model on the test set for the competition

In [28]:
predictions = predict_model(final_model, data = test)

In [29]:
test.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,gain-loss,work,human
32556,35,Private,11.024253,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0.0,0.0,40,United-States,,0.0,Private_Sales,White_Male
32557,41,Self-emp-inc,10.379287,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0.0,0.0,40,United-States,,0.0,Self-emp-inc_Tech-support,White_Male
32558,39,Private,12.921935,5th-6th,3,Married-civ-spouse,Other-service,Husband,White,Male,0.0,0.0,40,Mexico,,0.0,Private_Other-service,White_Male
32559,35,Private,12.102616,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40,United-States,,0.0,Private_Craft-repair,White_Male
32560,28,Private,11.962854,HS-grad,9,Divorced,Handlers-cleaners,Unmarried,White,Female,0.0,0.0,36,United-States,,0.0,Private_Handlers-cleaners,White_Female


In [30]:
predictions

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,gain-loss,work,human,Label,Score
0,,,,,,,,,,,,,,,,,,,,0.0029
1,,,,,,,,,,,,,,,,,,,,0.4822
2,,,,,,,,,,,,,,,,,,,,0.0003
3,,,,,,,,,,,,,,,,,,,,0.7117
4,,,,,,,,,,,,,,,,,,,,0.5182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,35.0,Private,11.024253,Bachelors,13.0,Married-civ-spouse,Sales,Husband,White,Male,0.0,0.0,40.0,United-States,,0.0,Private_Sales,White_Male,>50K,
32557,41.0,Self-emp-inc,10.379287,Bachelors,13.0,Married-civ-spouse,Tech-support,Husband,White,Male,0.0,0.0,40.0,United-States,,0.0,Self-emp-inc_Tech-support,White_Male,>50K,
32558,39.0,Private,12.921935,5th-6th,3.0,Married-civ-spouse,Other-service,Husband,White,Male,0.0,0.0,40.0,Mexico,,0.0,Private_Other-service,White_Male,<=50K,
32559,35.0,Private,12.102616,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,,0.0,Private_Craft-repair,White_Male,<=50K,


- 확률 값이 'Score' 컬럼에 저장되어 있으므로 해당 값을 submission 파일에 옮겨 데이콘에 제출하겠습니다. 
- The probability values are stored on 'Score' column. So we will write them on our submission format and submit on DACON.

In [31]:
submission['voted'] = predictions['Score']

In [32]:
submission.loc[submission['voted'] >= 0.5, 'prediction'] = 1

In [33]:
del(submission['voted'])

In [34]:
submission

Unnamed: 0,id,prediction
0,0,0
1,1,0
2,2,0
3,3,1
4,4,1
...,...,...
6507,6507,1
6508,6508,1
6509,6509,0
6510,6510,0


In [35]:
submission.to_csv('base_87714_seed_0_addnan.csv', index = False)