### 사용한 방법:
#### 결측치 처리:
1. 다른 값으로부터 예측 - lightGBM
2. iterative imputer
3. median, mean, constant, mode

#### 인코딩
1. labelencoding
2. targetencoding

#### 칼럼선택
1. 전진선택법
2. 후진선택법

### 조합한 방법 - 성능이 좋은 순으로 나열함:
1. iterative imputer + labelencoding + 전진선택법 + lightGBM
2. 결측치 안채움 + labelencoding + lightGBM
3. iterative imputer + targetencoding + 전진선택법 + MLP

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys, os

In [None]:
# Directory holding the competition CSV files.
# NOTE(review): this is an absolute path on the author's machine; adjust it
# before running the notebook elsewhere.
file_path = '/Users/yongchanchun/Desktop/MacBook_Pro_Desktop/TAVE/머신러닝/DACON/여행_상품_신청_여부/data'
# File names, joined onto file_path when the files are read or written.
submission_path = 'sample_submission.csv'
train_path = 'train.csv'
test_path = 'test.csv'

In [None]:
# Load the train table, the test table and the submission template (in that
# order), each indexed by its `id` column.
train_df, test_df, sub_file = (
    pd.read_csv(os.path.join(file_path, csv_name), index_col='id')
    for csv_name in (train_path, test_path, submission_path)
)

In [None]:
train_df.info()

In [None]:
train_df

In [None]:
train_df.columns

In [None]:
# Number of missing values per column, listed in train_df's column order.
num_na = [int(train_df[col].isna().sum()) for col in train_df.columns]
num_na

In [None]:
# Treat the misspelled 'Fe Male' label as 'Female' in the training data.
misspelled_rows = train_df['Gender'] == 'Fe Male'
train_df.loc[misspelled_rows, 'Gender'] = 'Female'
# Sanity check: rows still carrying the misspelled label.
int((train_df['Gender'] == 'Fe Male').sum())

In [None]:
# Treat the misspelled 'Fe Male' label as 'Female' in the test data.
misspelled_rows = test_df['Gender'] == 'Fe Male'
test_df.loc[misspelled_rows, 'Gender'] = 'Female'
# Sanity check: rows still carrying the misspelled label.
int((test_df['Gender'] == 'Fe Male').sum())

In [None]:
print(train_df['Gender'].unique())
print(test_df['Gender'].unique())

## train_df와 test_df의 범주 값 확인하기

In [None]:
for i, col in enumerate(train_df.columns):
    print(f'{i}) colname: {col}, \n unique value of col: {train_df[col].unique()}')

In [None]:
for i, col in enumerate(test_df.columns):
    print(f'{i}) colname: {col}, \n unique value of col: {test_df[col].unique()}')

## 결측치를 제거하지 않은 상태로 GBDT를 적용해보자
1. lightgbm 사용
2. xgboost 사용

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every object (string) column of the training data; numeric
# columns are carried over unchanged.
#
# The result starts as a copy of train_df so it always keeps train_df's index
# and column order. Filling an empty DataFrame column by column takes the
# index from whichever column is assigned first, which silently misaligns the
# later Series assignments if that first column is an encoded array.
train_df_all_label_encoded = train_df.copy()
for col in train_df.columns:
    if train_df[col].dtype == 'object':
        le = LabelEncoder()
        train_df_all_label_encoded[col] = le.fit_transform(train_df[col])

In [None]:
# Label-encode every object (string) column of the test data; numeric columns
# are carried over unchanged. Starting from a copy keeps test_df's index and
# column order.
test_df_all_label_encoded = test_df.copy()
for col in test_df.columns:
    if test_df[col].dtype == 'object':
        le = LabelEncoder()
        # Fit on the TRAINING column so a category receives the same integer
        # code in both frames. A test value never seen in training raises a
        # ValueError here instead of silently shifting the codes.
        le.fit(train_df[col])
        test_df_all_label_encoded[col] = le.transform(test_df[col])

In [None]:
test_df_all_label_encoded

In [None]:
sub_file

### 1. lightgbm 사용

In [None]:
import lightgbm as lgb

# Split the label-encoded training table into target and features, then fit a
# LightGBM classifier on it (missing values are left as they are).
train_y = train_df_all_label_encoded['ProdTaken']
train_x = train_df_all_label_encoded.drop(columns=['ProdTaken'])
lightgbm_model = lgb.LGBMClassifier(n_estimators=1000, random_state=777)
lightgbm_model.fit(train_x, train_y)


In [None]:
# Predict on the label-encoded test set and write the submission CSV.
file_name = 'lightgbm_with_null_result.csv'
output_csv = os.path.join(file_path, file_name)
sub_file['ProdTaken'] = lightgbm_model.predict(test_df_all_label_encoded)
sub_file.to_csv(output_csv)

### 2. xgboost 사용

In [None]:
from sklearn.model_selection import KFold

# Use the first fold of an 18-way shuffled split as the validation set and the
# remaining rows as the training set.
kf = KFold(n_splits=18, shuffle=True, random_state=71)
tr_idx, va_idx = next(iter(kf.split(train_x)))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

In [None]:
print(tr_x.shape[0])
print(va_x.shape[0])

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# 특징과 목적변수를 xgboost의 데이터 구조로 변환
# Convert the features and the target into xgboost's DMatrix structure.
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
dtest = xgb.DMatrix(test_df_all_label_encoded)


# Hyperparameter search space: plain values are fixed, hp.* entries are tuned.
param_space = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.1,
    'eval_metric': 'error',
    'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1),
    'max_depth': hp.quniform('max_depth', 3, 9, 1),
    'subsample': 0.8,
    # The hyperopt label now matches the parameter it samples. It used to be
    # 'subsample', which mislabelled this dimension in the trials record.
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),
    'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
    # Regularisation terms, tuned as well.
    'alpha': hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
    'lambda': hp.loguniform('lambda', np.log(1e-6), np.log(10.0)),
    'random_state': 71,
}

# Upper bound on boosting rounds, and the datasets evaluated after each round.
num_round = 500
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

# define classifier
def classifier(value_list):
    """Threshold predicted probabilities into hard 0/1 class labels.

    Args:
        value_list: Array-like of numeric scores (e.g. predicted
            probabilities).

    Returns:
        numpy.ndarray of integers with the same shape as the input: 1 where
        the score is strictly greater than 0.5, otherwise 0. An empty input
        yields an empty integer array.
    """
    # A single vectorised comparison replaces the per-element Python loop.
    return (np.asarray(value_list) > 0.5).astype(int)
    
def score(params):
    """Objective function for hyperopt: train xgboost and score it.

    Trains a booster on `dtrain` with early stopping on the watchlist,
    thresholds its predictions for `dvalid` at 0.5 and measures accuracy
    against `va_y`. The (params, accuracy) pair is appended to the global
    `history` list.

    Args:
        params: Parameter dict sampled by hyperopt. `max_depth` is cast to
            int in place because it is sampled as a float.

    Returns:
        dict with `loss` (the negated accuracy) and `status`.
    """
    # max_depth is sampled as a float; xgboost needs an integer.
    params['max_depth'] = int(params['max_depth'])
    model = xgb.train(params, dtrain, num_round, evals=watchlist, early_stopping_rounds=50)
    va_pred = model.predict(dvalid)
    print(type(va_pred), type(va_y))
    va_pred = classifier(va_pred)
    accuracy = accuracy_score(va_y, va_pred)
    print(f'params: {params}, AS: {accuracy:.4f}')

    # Record the trial so the best parameters can be read back afterwards.
    history.append((params, accuracy))

    # hyperopt MINIMISES the loss, so return the negated accuracy. Returning
    # the raw accuracy made the search look for the worst model.
    return {'loss': -accuracy, 'status': STATUS_OK}


# hyperopt에 의한 매개변수 탐색 실행
# Run the hyperopt parameter search; score() appends every trial to `history`.
history = []
trials = Trials()
max_evals = 15
fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

# Report the best recorded trial. `history` is used instead of `trials`
# because the parameters are easier to read back from it.
history.sort(key=lambda entry: entry[1], reverse=True)
best = history[0]
print(f'best params:{best[0]}, score:{best[1]:.4f}')

In [None]:
# Parameters copied from a previous hyperopt run.
params = {'alpha': 0.00015592994331247996, 'booster': 'gbtree', 'colsample_bytree': 0.65, 'eta': 0.1, 'eval_metric': 'error', 'gamma': 0.006801021591153265, 'lambda': 0.17461896494143567, 'max_depth': 7, 'min_child_weight': 2.0, 'objective': 'binary:logistic', 'random_state': 71, 'subsample': 0.8}
# Train on the fold's training part with early stopping on its validation part.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, dtrain, num_round,
                  evals=watchlist,
                  early_stopping_rounds=50)

# Predict with the trees up to and including the best early-stopping round.
# `iteration_range` replaces the `ntree_limit` / `best_ntree_limit` pair,
# which xgboost 2.0 removed.
pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
pred = classifier(pred)

In [None]:
# Store the xgboost predictions and write the submission CSV.
file_name = 'xgboost_result.csv'
output_csv = os.path.join(file_path, file_name)
sub_file['ProdTaken'] = pred
sub_file.to_csv(output_csv)

### 3. 전진선택법 + xgboost

## 결측치를 제거한 상태로 GBDT를 적용해보자

### iterative imputer를 사용하여 결측치를 채움

In [None]:
train_df.info()

In [None]:
from copy import deepcopy
# Work on deep copies so the encoding and imputation below leave train_df and
# test_df untouched.
train_df_iter = deepcopy(train_df)
test_df_iter = deepcopy(test_df)

In [None]:
# sklearn에서 제공하는 label encoder로 object형을 int형으로 labeling
from sklearn import preprocessing
# Label-encode the object (string) columns. Each encoder is fitted on the
# training column and reused for the test column, so a category maps to the
# same integer in both frames. A test value unseen in training raises a
# ValueError instead of silently receiving a different code.
for col in train_df_iter.columns:
    if train_df_iter[col].dtype == 'object':
        label_encoder = preprocessing.LabelEncoder()
        train_df_iter[col] = label_encoder.fit_transform(train_df_iter[col])
        if col in test_df_iter.columns:
            test_df_iter[col] = label_encoder.transform(test_df_iter[col])

# Object columns that exist only in the test frame still need encoding.
for col in test_df_iter.columns:
    if test_df_iter[col].dtype == 'object' and col not in train_df_iter.columns:
        label_encoder = preprocessing.LabelEncoder()
        test_df_iter[col] = label_encoder.fit_transform(test_df_iter[col])

In [None]:
train_df_iter.info()
test_df_iter.info()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(estimator=LinearRegression(),
                       tol=1e-10,
                       max_iter=30,
                       verbose=2,
                       initial_strategy='median',
                       imputation_order='ascending')

# Impute from the feature columns only. The target 'ProdTaken' is kept out of
# the imputer so it cannot leak into the filled-in feature values, and the
# imputer fitted on train is reused on test rather than being re-fitted there.
feature_cols = [c for c in train_df_iter.columns if c != 'ProdTaken']

train_imputed = pd.DataFrame(imp.fit_transform(train_df_iter[feature_cols]),
                             columns=feature_cols)
# Put the target back (as float, like the other imputed columns) and restore
# the original column order. The result has a fresh 0..n-1 index.
train_imputed['ProdTaken'] = train_df_iter['ProdTaken'].to_numpy(dtype=float)
train_df_iter2 = train_imputed[list(train_df_iter.columns)]

test_imputed = pd.DataFrame(imp.transform(test_df_iter[feature_cols]),
                            columns=feature_cols)
test_df_iter2 = test_imputed[list(test_df_iter.columns)]

In [None]:
test_df_iter2.head()

In [None]:
train_df_iter2.isnull().sum()

In [None]:
# Give the imputed frames the column names of the frames they were built from.
train_df_iter2.columns = train_df_iter.columns
test_df_iter2.columns = test_df_iter.columns

In [None]:
test_df_iter2.columns

In [None]:
# Histogram of every column of the imputed training data, four plots per row.
plot_cols = list(train_df_iter2.columns)
n_grid_cols = 4
# At least 5 rows (the original layout), more if there are over 20 columns.
n_grid_rows = max(5, -(-len(plot_cols) // n_grid_cols))

plt.figure(figsize=(15, 15))
for i, col in enumerate(plot_cols, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    plt.hist(train_df_iter2[col], bins=20)
    plt.title(col)
# Call show(); the bare name `plt.show` only references the function.
plt.show()

## 전진 선택법으로 필요없는 칼럼제거

### iterative imputer를 통해 결측치 값을 채운 데이터에 먼저 적용

In [None]:
train_x = train_df_iter2.drop(['ProdTaken'], axis = 1)
train_y = train_df_iter2['ProdTaken']

## Stepwise (forward with backward elimination) variable selection
import statsmodels.api as sm

# Candidate features and target.
variables = train_x.columns.tolist()
y = train_y

# Variables selected so far.
forward_variables = []

# p-value thresholds for entering and for removing a variable.
sl_enter = 0.05
sl_remove = 0.05

# Variables selected at each step.
sv_per_step = []
# Adjusted R-squared at each step.
adj_r_squared_list = []
# Step counter.
steps = []
step = 0

# Selected subsets seen so far, used to detect an add/remove cycle.
seen_subsets = set()

while len(variables) > 0:
    # Keep the column order so that ties between p-values are broken the same
    # way on every run (a set difference has no stable order).
    remainder = [v for v in variables if v not in forward_variables]
    if not remainder:
        break
    # Explicit float dtype: without it the Series is object-typed and the
    # min()/idxmin() reductions below are not reliable.
    pval = pd.Series(index=remainder, dtype=float)
    # Fit one linear model per candidate, each adding that candidate to the
    # variables already selected.
    for col in remainder:
        X = train_x[forward_variables + [col]]
        X = sm.add_constant(X)
        model = sm.OLS(y, X).fit(disp=0)
        pval[col] = model.pvalues[col]

    min_pval = pval.min()
    if min_pval < sl_enter:  # the best candidate is significant: add it
        forward_variables.append(pval.idxmin())
        # Backward step: drop selected variables that are no longer significant.
        while len(forward_variables) > 0:
            selected_X = train_x[forward_variables]
            selected_X = sm.add_constant(selected_X)
            # Skip the first entry, the intercept's p-value.
            selected_pval = sm.OLS(y, selected_X).fit(disp=0).pvalues.iloc[1:]
            max_pval = selected_pval.max()
            if max_pval >= sl_remove:
                remove_variable = selected_pval.idxmax()
                forward_variables.remove(remove_variable)
            else:
                break

        # The procedure is deterministic, so reaching a subset that was
        # already visited means it would loop forever; stop there.
        subset_key = frozenset(forward_variables)
        if subset_key in seen_subsets:
            break
        seen_subsets.add(subset_key)

        step += 1
        steps.append(step)
        adj_r_squared = sm.OLS(y, sm.add_constant(train_x[forward_variables])).fit(disp=0).rsquared_adj
        adj_r_squared_list.append(adj_r_squared)
        sv_per_step.append(forward_variables.copy())
    else:
        break

In [None]:
len(forward_variables)

In [None]:
# Keep only the columns chosen by the stepwise selection, in train and test.
train_x_forward = train_x[forward_variables]
test_df_iter2_forward = test_df_iter2[forward_variables]

In [None]:
print(len(train_x_forward.columns))
print(len(test_df_iter2_forward.columns))

#### lightGBM

In [None]:
import lightgbm as lgb

# Fit LightGBM on the imputed training data restricted to the selected columns.
lightgbm_model = lgb.LGBMClassifier(n_estimators=1000, random_state=777)
lightgbm_model.fit(train_x_forward, train_y)


In [None]:
# Predict on the imputed, column-selected test set and write the submission CSV.
file_name = 'lightgbm_with_iter_forward_result.csv'
output_csv = os.path.join(file_path, file_name)
sub_file['ProdTaken'] = lightgbm_model.predict(test_df_iter2_forward)
sub_file.to_csv(output_csv)

#### xgboost

In [None]:
# train_x, train_y, test_df만 수정하면 됨

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import KFold

# Use the first fold of a 5-way shuffled split as the validation set and the
# remaining rows as the training set.
kf = KFold(n_splits=5, shuffle=True, random_state=71)
tr_idx, va_idx = next(iter(kf.split(train_x_forward)))
tr_x, tr_y = train_x_forward.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x_forward.iloc[va_idx], train_y.iloc[va_idx]

# 특징과 목적변수를 xgboost의 데이터 구조로 변환
# Convert the features and the target into xgboost's DMatrix structure.
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
dtest = xgb.DMatrix(test_df_iter2_forward)


# Hyperparameter search space: plain values are fixed, hp.* entries are tuned.
param_space = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.1,
    'eval_metric': 'error',
    'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1),
    'max_depth': hp.quniform('max_depth', 3, 9, 1),
    'subsample': 0.8,
    # The hyperopt label now matches the parameter it samples. It used to be
    # 'subsample', which mislabelled this dimension in the trials record.
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),
    'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
    # Regularisation terms, tuned as well.
    'alpha': hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
    'lambda': hp.loguniform('lambda', np.log(1e-6), np.log(10.0)),
    'random_state': 71,
}

# Upper bound on boosting rounds, and the datasets evaluated after each round.
num_round = 500
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

# define classifier
def classifier(value_list):
    """Threshold predicted probabilities into hard 0/1 class labels.

    Args:
        value_list: Array-like of numeric scores (e.g. predicted
            probabilities).

    Returns:
        numpy.ndarray of integers with the same shape as the input: 1 where
        the score is strictly greater than 0.5, otherwise 0. An empty input
        yields an empty integer array.
    """
    # A single vectorised comparison replaces the per-element Python loop.
    return (np.asarray(value_list) > 0.5).astype(int)
    
def score(params):
    """Objective function for hyperopt: train xgboost and score it.

    Trains a booster on `dtrain` with early stopping on the watchlist,
    thresholds its predictions for `dvalid` at 0.5 and measures accuracy
    against `va_y`. The (params, accuracy) pair is appended to the global
    `history` list.

    Args:
        params: Parameter dict sampled by hyperopt. `max_depth` is cast to
            int in place because it is sampled as a float.

    Returns:
        dict with `loss` (the negated accuracy) and `status`.
    """
    # max_depth is sampled as a float; xgboost needs an integer.
    params['max_depth'] = int(params['max_depth'])
    model = xgb.train(params, dtrain, num_round, evals=watchlist, early_stopping_rounds=50)
    va_pred = model.predict(dvalid)
    print(type(va_pred), type(va_y))
    va_pred = classifier(va_pred)
    accuracy = accuracy_score(va_y, va_pred)
    print(f'params: {params}, AS: {accuracy:.4f}')

    # Record the trial so the best parameters can be read back afterwards.
    history.append((params, accuracy))

    # hyperopt MINIMISES the loss, so return the negated accuracy. Returning
    # the raw accuracy made the search look for the worst model.
    return {'loss': -accuracy, 'status': STATUS_OK}


# hyperopt에 의한 매개변수 탐색 실행
# Run the hyperopt parameter search; score() appends every trial to `history`.
history = []
trials = Trials()
max_evals = 15
fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

# Report the best recorded trial. `history` is used instead of `trials`
# because the parameters are easier to read back from it.
history.sort(key=lambda entry: entry[1], reverse=True)
best = history[0]
print(f'best params:{best[0]}, score:{best[1]:.4f}')

### 결측치를 제거하지 않은 데이터에 적용해보기

In [None]:
# Apply the selected column list to the label-encoded frames whose missing
# values were NOT filled in.
train_df_all_label_encoded_forward = train_df_all_label_encoded[forward_variables]
test_df_all_label_encoded_forward = test_df_all_label_encoded[forward_variables]

In [None]:
print(train_df_all_label_encoded_forward.shape[1])
print(test_df_all_label_encoded_forward.shape[1])

In [None]:
# Fit LightGBM on the un-imputed, column-selected training data. `train_y` is
# whatever the most recently executed cell bound to that name.
lightgbm_model = lgb.LGBMClassifier(n_estimators=1000, random_state=777)
lightgbm_model.fit(train_df_all_label_encoded_forward, train_y)

In [None]:
# Predict on the un-imputed, column-selected test set and write the submission CSV.
file_name = 'lightgbm_with_nan_forward_result.csv'
output_csv = os.path.join(file_path, file_name)
sub_file['ProdTaken'] = lightgbm_model.predict(test_df_all_label_encoded_forward)
sub_file.to_csv(output_csv)

## 결측치 하나씩 확인해보기
1. 결측치가 들어있는 열을 직접 확인해보기
2. 열마다 결측치를 처리할 적절한 방법 찾아보기

In [None]:
train_df

In [None]:
# Copy the data now, before the missing values are filled in below, so the
# iterative imputer can be applied to the untouched version later.
train_df_iter3 = deepcopy(train_df)
test_df_iter3 = deepcopy(test_df)

### 1. 결측치가 들어있는 열을 직접 확인해보기

In [None]:
# Split the column names into those with and without missing values, using
# the per-column counts collected in num_na.
col_names = train_df.columns
na_col = [name for name, count in zip(col_names, num_na) if count != 0]
no_na_col = [name for name, count in zip(col_names, num_na) if count == 0]

train_df[na_col]
        

In [None]:
train_df[na_col].info()

In [None]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

# Choose a grid that always has room for every plot. The previous
# floor-based column count could be too small, e.g. 7 columns gave a 2x3 grid.
n_plots = len(na_col)
n_grid_rows = max(1, int(np.sqrt(n_plots)))
n_grid_cols = int(np.ceil(n_plots / n_grid_rows))

plt.figure(figsize=(20, 10))
for i, col in enumerate(na_col, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    if col == 'TypeofContact':
        # This column holds strings; encode it to integers for the histogram.
        values = label_encoder.fit_transform(train_df[col])
    else:
        values = train_df[col]
    plt.hist(values, bins=20)
    plt.title(col)
# Call show(); the bare name `plt.show` only references the function.
plt.show()

In [None]:
train_df_all_label_encoded.describe()

In [None]:
train_df.head(10)

In [None]:
train_df_all_label_encoded.corr(method = 'pearson')

### 2. 열마다 결측치를 처리할 적절한 방법 찾아보기
1. 여행상품을 신청하는 여부를 결정하는데 중요하게 작용하는 특징 확인하기

In [None]:
# Rebuild the label-encoded copy of the training data: object (string)
# columns are label-encoded, numeric columns are carried over unchanged.
#
# The result starts as a copy of train_df so it always keeps train_df's index
# and column order. Filling an empty DataFrame column by column takes the
# index from whichever column is assigned first, which silently misaligns the
# later Series assignments if that first column is an encoded array.
train_df_all_label_encoded = train_df.copy()
for col in train_df.columns:
    if train_df[col].dtype == 'object':
        le = LabelEncoder()
        train_df_all_label_encoded[col] = le.fit_transform(train_df[col])

In [None]:
train_df_all_label_encoded

In [None]:
# Feature importances of the most recently fitted LightGBM model, largest
# first. The names come from the model itself, so they always match the
# columns it was trained on. The global `train_x` may by now hold a different
# set of columns than the model saw.
feature_im = pd.DataFrame({
    'importance': lightgbm_model.feature_importances_,
    'column': lightgbm_model.feature_name_,
})
feature_im = feature_im.sort_values('importance', ascending=False)
feature_im

2. monthlyincome, age, numberoftrips는 다른 변수로부터 결측값을 예측함 - LGBMRegressor

In [None]:
# Fill missing values of the continuous columns by predicting them from all
# other label-encoded columns with a LightGBM regressor.
predict_na_col = ['MonthlyIncome', 'Age', 'NumberOfTrips']
for col in predict_na_col:
    # Select rows with a positional boolean mask. The previous `id - 1`
    # arithmetic was only correct when the ids were exactly 1..N.
    is_na = train_df[col].isna().to_numpy()
    if not is_na.any():
        # Nothing to impute (e.g. the cell was run a second time).
        print(train_df[col].isnull().sum())
        continue
    features = train_df_all_label_encoded.drop([col], axis=1)
    train_y = train_df_all_label_encoded.loc[~is_na, col]
    train_x = features.loc[~is_na]
    feature_model = lgb.LGBMRegressor(random_state=777, n_estimators=1000)
    feature_model.fit(train_x, train_y)
    feature_pred = feature_model.predict(features.loc[is_na])
    train_df.loc[is_na, col] = feature_pred
    print(train_df[col].isnull().sum())

3. numberoffollowups, PreferredPropertyStar, NumberOfChildrenVisiting 는 다른 변수로 부터 값을 예측함 - LGBMClassifier

In [None]:
# Fill missing values of the discrete numeric columns by predicting them from
# all other label-encoded columns with a LightGBM classifier.
predict_na_col = ['NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfChildrenVisiting']
for col in predict_na_col:
    # Select rows with a positional boolean mask. The previous `id - 1`
    # arithmetic was only correct when the ids were exactly 1..N.
    is_na = train_df[col].isna().to_numpy()
    if not is_na.any():
        # Nothing to impute (e.g. the cell was run a second time).
        print(train_df[train_df[col].isna()].shape)
        continue
    features = train_df_all_label_encoded.drop([col], axis=1)
    train_y = train_df_all_label_encoded.loc[~is_na, col]
    train_x = features.loc[~is_na]
    feature_model = lgb.LGBMClassifier(random_state=777, n_estimators=1000)
    feature_model.fit(train_x, train_y)
    feature_pred = feature_model.predict(features.loc[is_na])
    train_df.loc[is_na, col] = feature_pred
    print(train_df[train_df[col].isna()].shape)

4. 'TypeofContact'는 다른 변수로 부터 값을 예측함 - LGBMClassifier

In [None]:
# Fill missing 'TypeofContact' values by predicting the label-encoded class
# from all other columns and mapping the prediction back to its string label.
predict_na_col = ['TypeofContact']
for col in predict_na_col:
    # Select rows with a positional boolean mask. The previous `id - 1`
    # arithmetic was only correct when the ids were exactly 1..N.
    is_na = train_df[col].isna().to_numpy()
    if not is_na.any():
        # Nothing to impute (e.g. the cell was run a second time).
        print(train_df[train_df[col].isna()].shape)
        continue
    features = train_df_all_label_encoded.drop([col], axis=1)
    train_y = train_df_all_label_encoded.loc[~is_na, col]
    train_x = features.loc[~is_na]
    feature_model = lgb.LGBMClassifier(random_state=777, n_estimators=1000)
    feature_model.fit(train_x, train_y)
    feature_pred = feature_model.predict(features.loc[is_na])
    # Encoded class 0 becomes 'Company Invited'; any other class becomes
    # 'Self Enquiry'.
    back2col = lambda x: 'Company Invited' if x==0 else 'Self Enquiry'
    feature_pred = [back2col(code) for code in feature_pred]
    train_df.loc[is_na, col] = feature_pred
    print(train_df[train_df[col].isna()].shape)
    

5. durationofpitch는 중앙값을 구함

In [None]:
# Replace missing 'DurationOfPitch' values with the median of the observed ones.
duration_missing = train_df['DurationOfPitch'].isna()
med_val = np.median(train_df.loc[~duration_missing, 'DurationOfPitch'])
train_df.loc[duration_missing, 'DurationOfPitch'] = med_val

In [None]:
train_df.isnull().sum()
# 모든 결측치 값 채워넣음

In [None]:
train_df

6. test data도 같은 방식으로 변환

In [None]:
test_df

In [None]:
# Fill missing values of the continuous test columns by predicting them from
# all other label-encoded test columns with a LightGBM regressor.
predict_na_col = ['MonthlyIncome', 'Age', 'NumberOfTrips']
for col in predict_na_col:
    # Select rows with a positional boolean mask. The previous `id - 1`
    # arithmetic was only correct when the ids were exactly 1..N.
    is_na = test_df[col].isna().to_numpy()
    if not is_na.any():
        # Nothing to impute (e.g. the cell was run a second time).
        print(test_df[col].isnull().sum())
        continue
    features = test_df_all_label_encoded.drop([col], axis=1)
    test_y = test_df_all_label_encoded.loc[~is_na, col]
    test_x = features.loc[~is_na]
    feature_model = lgb.LGBMRegressor(random_state=777, n_estimators=1000)
    feature_model.fit(test_x, test_y)
    feature_pred = feature_model.predict(features.loc[is_na])
    test_df.loc[is_na, col] = feature_pred
    print(test_df[col].isnull().sum())

In [None]:
# Fill missing values of the discrete numeric test columns by predicting them
# from all other label-encoded test columns with a LightGBM classifier.
predict_na_col = ['NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfChildrenVisiting']
for col in predict_na_col:
    # Select rows with a positional boolean mask. The previous `id - 1`
    # arithmetic was only correct when the ids were exactly 1..N.
    is_na = test_df[col].isna().to_numpy()
    if not is_na.any():
        # Nothing to impute (e.g. the cell was run a second time).
        print(test_df[test_df[col].isna()].shape)
        continue
    features = test_df_all_label_encoded.drop([col], axis=1)
    test_y = test_df_all_label_encoded.loc[~is_na, col]
    test_x = features.loc[~is_na]
    feature_model = lgb.LGBMClassifier(random_state=777, n_estimators=1000)
    feature_model.fit(test_x, test_y)
    feature_pred = feature_model.predict(features.loc[is_na])
    test_df.loc[is_na, col] = feature_pred
    print(test_df[test_df[col].isna()].shape)

In [None]:
# Fill missing 'TypeofContact' test values by predicting the label-encoded
# class from all other columns and mapping the prediction back to its string.
predict_na_col = ['TypeofContact']
for col in predict_na_col:
    # Select rows with a positional boolean mask. The previous `id - 1`
    # arithmetic was only correct when the ids were exactly 1..N.
    is_na = test_df[col].isna().to_numpy()
    if not is_na.any():
        # Nothing to impute (e.g. the cell was run a second time).
        print(test_df[test_df[col].isna()].shape)
        continue
    features = test_df_all_label_encoded.drop([col], axis=1)
    test_y = test_df_all_label_encoded.loc[~is_na, col]
    test_x = features.loc[~is_na]
    feature_model = lgb.LGBMClassifier(random_state=777, n_estimators=1000)
    feature_model.fit(test_x, test_y)
    feature_pred = feature_model.predict(features.loc[is_na])
    # Encoded class 0 becomes 'Company Invited'; any other class becomes
    # 'Self Enquiry'.
    back2col = lambda x: 'Company Invited' if x==0 else 'Self Enquiry'
    feature_pred = [back2col(code) for code in feature_pred]
    test_df.loc[is_na, col] = feature_pred
    print(test_df[test_df[col].isna()].shape)

In [None]:
# Replace missing 'DurationOfPitch' test values with the median of the
# observed test values.
duration_missing = test_df['DurationOfPitch'].isna()
med_val = np.median(test_df.loc[~duration_missing, 'DurationOfPitch'])
test_df.loc[duration_missing, 'DurationOfPitch'] = med_val

## 3. 범주형 데이터 변환
### 타깃 인코딩 적용

In [None]:
train_df.info()

In [None]:
train_df

In [None]:
train_df.describe()

In [None]:
# Columns to target-encode: every object-typed column of train_df plus the
# numeric columns listed below, which are treated as categorical.
obj_col = [col for col in train_df.columns if train_df[col].dtypes == 'object']

categorical_col = [
    'CityTier', 'NumberOfPersonVisiting', 'Passport', 'PitchSatisfactionScore',
    'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfChildrenVisiting',
    'OwnCar',
]
obj_col += categorical_col

In [None]:
from sklearn.model_selection import KFold


# Target-encode each categorical column. The splitter is built once; with a
# fixed integer seed every split() call yields the same folds.
kf = KFold(n_splits=4, shuffle=True, random_state=72)
n_train_rows = train_df.shape[0]

for c in obj_col:
    data_tmp = pd.DataFrame({c: train_df[c], 'target': train_df['ProdTaken']})

    # Test data: replace each category by its mean target over all training
    # rows. Categories absent from the training data become NaN.
    target_mean = data_tmp.groupby(c)['target'].mean()
    test_df[c] = test_df[c].map(target_mean)

    # Training data: out-of-fold encoding, so a row's own target never
    # contributes to its encoded value.
    tmp = np.full(n_train_rows, np.nan)
    for idx_1, idx_2 in kf.split(train_df):
        target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
        tmp[idx_2] = train_df[c].iloc[idx_2].map(target_mean)

    train_df[c] = tmp

In [None]:
test_df

In [None]:
test_df.isnull().sum()

## 4. 결측치 채움 + 타깃인코딩한 데이터에 GBDT 적용

In [None]:
import lightgbm as lgb

# Fit LightGBM on the imputed, target-encoded training data.
train_y = train_df['ProdTaken']
train_x = train_df.drop(columns=['ProdTaken'])
lightgbm_model = lgb.LGBMClassifier(n_estimators=1000, random_state=777)
lightgbm_model.fit(train_x, train_y)

In [None]:
# Predict on the imputed, target-encoded test set and write the submission CSV.
file_name = 'lightgbm_without_null_result.csv'
output_csv = os.path.join(file_path, file_name)
sub_file['ProdTaken'] = lightgbm_model.predict(test_df)
sub_file.to_csv(output_csv)

#### 결측치 채움 + 타깃인코딩을 적용한 데이터에서는 성능이 더 좋지 않음

## 4.5 결측치를 iterative imputer로 채움 + 전진선택법 + 타깃인코딩한 데이터에 GBDT 적용

In [None]:
# Columns to target-encode: every object-typed column of train_df_iter3 plus
# the numeric columns listed below, which are treated as categorical.
obj_col = [col for col in train_df_iter3.columns
           if train_df_iter3[col].dtypes == 'object']

categorical_col = [
    'CityTier', 'NumberOfPersonVisiting', 'Passport', 'PitchSatisfactionScore',
    'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfChildrenVisiting',
    'OwnCar',
]

obj_col += categorical_col

In [None]:
train_df_iter3

In [None]:
from sklearn.model_selection import KFold


# Target-encode each categorical column of the un-imputed copies. The splitter
# is built once; with a fixed integer seed every split() call yields the same
# folds.
kf = KFold(n_splits=4, shuffle=True, random_state=72)
n_train_rows = train_df_iter3.shape[0]

for c in obj_col:
    data_tmp = pd.DataFrame({c: train_df_iter3[c], 'target': train_df_iter3['ProdTaken']})

    # Test data: replace each category by its mean target over all training
    # rows. Categories absent from the training data become NaN.
    target_mean = data_tmp.groupby(c)['target'].mean()
    test_df_iter3[c] = test_df_iter3[c].map(target_mean)

    # Training data: out-of-fold encoding, so a row's own target never
    # contributes to its encoded value.
    tmp = np.full(n_train_rows, np.nan)
    for idx_1, idx_2 in kf.split(train_df_iter3):
        target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
        tmp[idx_2] = train_df_iter3[c].iloc[idx_2].map(target_mean)

    train_df_iter3[c] = tmp

In [None]:
test_df_iter3

In [None]:
imp = IterativeImputer(estimator=LinearRegression(),
                       tol=1e-10,
                       max_iter=30,
                       verbose=2,
                       initial_strategy='median',
                       imputation_order='ascending')

# Impute from the feature columns only. The target 'ProdTaken' is kept out of
# the imputer so it cannot leak into the filled-in feature values, and the
# imputer fitted on train is reused on test rather than being re-fitted there.
feature_cols = [c for c in train_df_iter3.columns if c != 'ProdTaken']

train_imputed = pd.DataFrame(imp.fit_transform(train_df_iter3[feature_cols]),
                             columns=feature_cols)
# Put the target back (as float, like the other imputed columns) and restore
# the original column order. The result has a fresh 0..n-1 index.
train_imputed['ProdTaken'] = train_df_iter3['ProdTaken'].to_numpy(dtype=float)
train_df_iter4 = train_imputed[list(train_df_iter3.columns)]

test_imputed = pd.DataFrame(imp.transform(test_df_iter3[feature_cols]),
                            columns=feature_cols)
test_df_iter4 = test_imputed[list(test_df_iter3.columns)]

In [None]:
# Give the imputed frames the column names of train_df and test_df.
train_df_iter4.columns = train_df.columns
test_df_iter4.columns = test_df.columns

In [None]:
train_df_iter4

In [None]:
train_x = train_df_iter4.drop(['ProdTaken'], axis = 1)
train_y = train_df_iter4['ProdTaken']

## Stepwise (forward with backward elimination) variable selection
import statsmodels.api as sm

# Candidate features and target.
variables = train_x.columns.tolist()
y = train_y

# Variables selected so far.
forward_variables = []

# p-value thresholds for entering and for removing a variable.
sl_enter = 0.05
sl_remove = 0.05

# Variables selected at each step.
sv_per_step = []
# Adjusted R-squared at each step.
adj_r_squared_list = []
# Step counter.
steps = []
step = 0

# Selected subsets seen so far, used to detect an add/remove cycle.
seen_subsets = set()

while len(variables) > 0:
    # Keep the column order so that ties between p-values are broken the same
    # way on every run (a set difference has no stable order).
    remainder = [v for v in variables if v not in forward_variables]
    if not remainder:
        break
    # Explicit float dtype: without it the Series is object-typed and the
    # min()/idxmin() reductions below are not reliable.
    pval = pd.Series(index=remainder, dtype=float)
    # Fit one linear model per candidate, each adding that candidate to the
    # variables already selected.
    for col in remainder:
        X = train_x[forward_variables + [col]]
        X = sm.add_constant(X)
        model = sm.OLS(y, X).fit(disp=0)
        pval[col] = model.pvalues[col]

    min_pval = pval.min()
    if min_pval < sl_enter:  # the best candidate is significant: add it
        forward_variables.append(pval.idxmin())
        # Backward step: drop selected variables that are no longer significant.
        while len(forward_variables) > 0:
            selected_X = train_x[forward_variables]
            selected_X = sm.add_constant(selected_X)
            # Skip the first entry, the intercept's p-value.
            selected_pval = sm.OLS(y, selected_X).fit(disp=0).pvalues.iloc[1:]
            max_pval = selected_pval.max()
            if max_pval >= sl_remove:
                remove_variable = selected_pval.idxmax()
                forward_variables.remove(remove_variable)
            else:
                break

        # The procedure is deterministic, so reaching a subset that was
        # already visited means it would loop forever; stop there.
        subset_key = frozenset(forward_variables)
        if subset_key in seen_subsets:
            break
        seen_subsets.add(subset_key)

        step += 1
        steps.append(step)
        adj_r_squared = sm.OLS(y, sm.add_constant(train_x[forward_variables])).fit(disp=0).rsquared_adj
        adj_r_squared_list.append(adj_r_squared)
        sv_per_step.append(forward_variables.copy())
    else:
        break

In [None]:
len(forward_variables)

In [None]:
# Keep only the columns chosen by the stepwise selection, in train and test.
train_x_forward = train_x[forward_variables]
test_df_iter4_forward = test_df_iter4[forward_variables]

In [None]:
import lightgbm as lgb

# Fit LightGBM on the target-encoded, imputed, column-selected training data,
# then predict on the matching test set and write the submission CSV.
lightgbm_model = lgb.LGBMClassifier(n_estimators=1000, random_state=777)
lightgbm_model.fit(train_x_forward, train_y)

file_name = 'lightgbm_with_targetencoded_iter_forward_result.csv'
output_csv = os.path.join(file_path, file_name)
sub_file['ProdTaken'] = lightgbm_model.predict(test_df_iter4_forward)
sub_file.to_csv(output_csv)

## 5. 신경망 

In [None]:
# Histogram of every train_df column before the numeric transformations.
plt.figure(figsize=(20, 15))
train_col = train_df.columns
# The grid shape does not change per plot, so compute it once.
row = int(np.sqrt(len(train_col)))
n_grid_cols = int(len(train_col) / row) + 1
for i, col in enumerate(train_col, 1):
    plt.subplot(row, n_grid_cols, i)
    plt.hist(train_df[col], bins=20)
    plt.title(col)
# Call show(); the bare name `plt.show` only references the function.
plt.show()

num_cols = ['Age', 'DurationOfPitch', 'NumberOfTrips', 'MonthlyIncome']
1. Age는 rankgauss 방법을 이용함

In [None]:
from sklearn.preprocessing import QuantileTransformer

# RankGauss for 'Age': fit a quantile transformer with a normal output
# distribution on train and test together, then apply it to each frame.
transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal')
age_all = pd.concat([train_df[['Age']], test_df[['Age']]])
transformer.fit(age_all)

for frame in (train_df, test_df):
    frame[['Age']] = transformer.transform(frame[['Age']])

2. DurationOfPitch는 박스-칵스 변환

In [None]:
from sklearn.preprocessing import PowerTransformer

# Box-Cox transform for 'DurationOfPitch': fit on train and test together,
# then apply to each frame.
pt = PowerTransformer(method='box-cox')
duration_all = pd.concat([train_df[['DurationOfPitch']], test_df[['DurationOfPitch']]])
pt.fit(duration_all)

for frame in (train_df, test_df):
    frame[['DurationOfPitch']] = pt.transform(frame[['DurationOfPitch']])

3. NumberOfTrips, MonthlyIncome는 클리핑

In [None]:
# Clip 'NumberOfTrips' and 'MonthlyIncome' to quantile bounds computed on
# train and test together.
clip_source = pd.concat([train_df[['NumberOfTrips', 'MonthlyIncome']],
                        test_df[['NumberOfTrips', 'MonthlyIncome']]])
p01 = clip_source.quantile(0.01)
# NOTE(review): the name says p99 but this is the 0.90 quantile. Confirm
# which upper bound is intended.
p99 = clip_source.quantile(0.90)

train_df[['NumberOfTrips', 'MonthlyIncome']] = train_df[['NumberOfTrips', 'MonthlyIncome']].clip(p01, p99, axis=1)
test_df[['NumberOfTrips', 'MonthlyIncome']] = test_df[['NumberOfTrips', 'MonthlyIncome']].clip(p01, p99, axis=1)

4. NumberOfTrips, MonthlyIncome는 rankgauss

In [None]:
# RankGauss for 'NumberOfTrips' and 'MonthlyIncome': fit a quantile
# transformer with a normal output distribution on train and test together,
# then apply it to each frame.
transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal')
trips_income_all = pd.concat([train_df[['NumberOfTrips', 'MonthlyIncome']],
                             test_df[['NumberOfTrips', 'MonthlyIncome']]])
transformer.fit(trips_income_all)

for frame in (train_df, test_df):
    frame[['NumberOfTrips', 'MonthlyIncome']] = transformer.transform(frame[['NumberOfTrips', 'MonthlyIncome']])

In [None]:
# Histogram of every train_df column after the numeric transformations.
plt.figure(figsize=(20, 15))
train_col = train_df.columns
# The grid shape does not change per plot, so compute it once.
row = int(np.sqrt(len(train_col)))
n_grid_cols = int(len(train_col) / row) + 1
for i, col in enumerate(train_col, 1):
    plt.subplot(row, n_grid_cols, i)
    plt.hist(train_df[col], bins=20)
    plt.title(col)
# Call show(); the bare name `plt.show` only references the function.
plt.show()

In [None]:
# Histogram of every test_df column after the numeric transformations.
plt.figure(figsize=(20, 15))
test_col = test_df.columns
# The grid shape does not change per plot, so compute it once.
row = int(np.sqrt(len(test_col)))
n_grid_cols = int(len(test_col) / row) + 1
for i, col in enumerate(test_col, 1):
    plt.subplot(row, n_grid_cols, i)
    plt.hist(test_df[col], bins=20)
    plt.title(col)
# Call show(); the bare name `plt.show` only references the function.
plt.show()

5. 신경망 생성

In [None]:
train_df.isnull().sum()

In [None]:
# Drop the rows whose 'Occupation' is missing. notna() states the intent
# directly; unary minus on a boolean mask is not a supported way to negate it
# in NumPy and only works through a pandas special case.
train_df = train_df[train_df['Occupation'].notna()]

In [None]:
# Drop the rows whose 'NumberOfPersonVisiting' is missing. notna() states the
# intent directly; unary minus on a boolean mask is not a supported way to
# negate it in NumPy and only works through a pandas special case.
train_df = train_df[train_df['NumberOfPersonVisiting'].notna()]

In [None]:
train_df.isnull().sum()

### pytorch 사용

In [None]:
import torch
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torch.nn.functional as F                       
from sklearn.metrics import mean_squared_error   
from torch.utils.data import TensorDataset, DataLoader, Dataset


# Feature matrix (every column except the target) and the target as a float64
# column vector of shape (n_rows, 1).
x = train_df.drop(columns='ProdTaken').to_numpy()
y = train_df['ProdTaken'].to_numpy(dtype='float64').reshape(-1, 1)

In [None]:
class TensorData(Dataset):
    """Dataset that serves (features, target) pairs as float tensors."""

    def __init__(self, x_data, y_data):
        """Convert both inputs to FloatTensor and record the sample count.

        Args:
            x_data: Array-like of features, one row per sample.
            y_data: Array-like of targets, one row per sample.
        """
        features = torch.FloatTensor(x_data)
        targets = torch.FloatTensor(y_data)
        self.x_data = features
        self.y_data = targets
        # The sample count is taken from the first dimension of the targets.
        self.len = targets.shape[0]

    def __getitem__(self, index):
        """Return the (features, target) pair stored at `index`."""
        sample = self.x_data[index]
        target = self.y_data[index]
        return sample, target

    def __len__(self):
        """Return the number of samples."""
        return self.len

In [None]:
# Hold out 10% of the rows for validation. This rebinds train_x / train_y to
# NumPy arrays.
# NOTE(review): no random_state is passed, so the split differs on every run.
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size = 0.1)

In [None]:
train_x.shape

In [None]:
# Wrap the training arrays in a Dataset and serve them in batches of 30.
# NOTE(review): shuffle=False feeds the batches in the same order every epoch.
# Confirm that is intended for training.
train_dataset = TensorData(train_x, train_y)
train_dataloader = DataLoader(train_dataset, batch_size=30, shuffle=False)

In [None]:
class travelNet(nn.Module):
    """Fully connected classifier: input dropout, then five Linear blocks.

    Every block is ``Linear -> BatchNorm1d -> LeakyReLU(0.2)`` with widths
    ``in_features -> 192 -> 150 -> 100 -> 50 -> num_classes``.

    ``forward`` returns raw class scores (logits), which is the input that
    ``nn.CrossEntropyLoss`` expects; use ``predict_proba`` for softmax
    probabilities. The arg-max over classes is the same for both.

    Args:
        in_features: number of input features (default 18).
        num_classes: number of output classes (default 2).
    """

    def __init__(self, in_features=18, num_classes=2):
        super().__init__()
        self.l1 = nn.Linear(in_features, 192, bias=True)
        self.l2 = nn.Linear(192, 150, bias=True)
        self.l3 = nn.Linear(150, 100, bias=True)
        self.l4 = nn.Linear(100, 50, bias=True)
        self.l5 = nn.Linear(50, num_classes, bias=True)
        # Batch-norm layers must be created once and registered as
        # submodules. Building a new nn.BatchNorm1d inside forward() throws
        # away its parameters and running statistics on every call and keeps
        # it in training mode even after model.eval().
        self.bn1 = nn.BatchNorm1d(192)
        self.bn2 = nn.BatchNorm1d(150)
        self.bn3 = nn.BatchNorm1d(100)
        self.bn4 = nn.BatchNorm1d(50)
        self.bn5 = nn.BatchNorm1d(num_classes)
        # Retained only so existing references to this attribute keep working.
        self.batchnorm = nn.BatchNorm1d
        self.dropout = nn.Dropout(0.05)
        self.activation = nn.LeakyReLU(0.2)
        self.classifier = torch.softmax

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``."""
        x = self.dropout(x)
        blocks = (
            (self.l1, self.bn1),
            (self.l2, self.bn2),
            (self.l3, self.bn3),
            (self.l4, self.bn4),
            (self.l5, self.bn5),
        )
        for linear, norm in blocks:
            x = self.activation(norm(linear(x)))
        # No softmax here: nn.CrossEntropyLoss applies log-softmax itself, so
        # feeding it probabilities would squash the gradients.
        return x

    def predict_proba(self, x):
        """Return softmax class probabilities for ``x``."""
        return self.classifier(self.forward(x), dim=1)

In [None]:
# Build the network, loss, optimizer and LR schedule for the hold-out run.
mlp_model = travelNet()
criterion = nn.CrossEntropyLoss()

lr1 = 0.01
lr2 = 0.001
lr3 = 0.0001

# Per-layer learning rates: smallest for the first layer, largest for l4.
layer_lrs = {'l1': lr3, 'l2': lr2, 'l3': lr2, 'l4': lr1}

params_ft = []
grouped_ids = set()
for layer_name, layer_lr in layer_lrs.items():
    layer_params = list(getattr(mlp_model, layer_name).parameters())
    grouped_ids.update(id(p) for p in layer_params)
    params_ft.append({'params': layer_params, 'lr': layer_lr})

# Every parameter not listed above (notably the output layer `l5`) must also
# be handed to the optimizer; otherwise it is never updated and stays at its
# random initialization.
remaining_params = [p for p in mlp_model.parameters() if id(p) not in grouped_ids]
if remaining_params:
    params_ft.append({'params': remaining_params, 'lr': lr1})

optimizer = optim.Adam(params_ft)
# Divide every group's learning rate by 10 after 300 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=300, gamma=0.1)
epoch_num = 600

In [None]:
len(train_dataloader)

In [None]:
# Train on `train_dataloader` for `epoch_num` epochs and report, per epoch,
# the mean batch loss and the training accuracy.
mlp_model.train(True)

# max(..., 1) guards the divisions below against an empty loader.
num_batches = max(len(train_dataloader), 1)
num_samples = max(len(train_dataloader.dataset), 1)

for epoch in range(epoch_num):
    print(f'Epoch {epoch+1}/{epoch_num} is running ---------------')

    running_loss = 0.0
    running_corrects = 0

    for data, label in train_dataloader:

        optimizer.zero_grad()
        # Call the module itself (not .forward) so that hooks are honoured.
        output = mlp_model(data)

        # Flatten the (batch, 1) float targets into a 1-D tensor of class
        # indices. view(-1) stays 1-D even when the batch has one sample,
        # whereas view([1, -1]).squeeze() would collapse it to 0-dim.
        label = label.view(-1).long()

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        preds = output.detach().argmax(dim=1)
        running_loss += loss.item()
        running_corrects += (preds == label).sum().item()

    # Average over the number of batches; the last loop index is one short of
    # that count (and is zero when there is a single batch).
    epoch_loss = running_loss / num_batches
    epoch_corrects = running_corrects / num_samples

    scheduler.step()

    print(f'loss:{epoch_loss}, accuracy:{epoch_corrects}')



In [None]:
# Wrap the hold-out split; one batch carries the entire validation set.
valid_dataset = TensorData(valid_x, valid_y)
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    shuffle=False,
    batch_size=len(valid_dataset),
)

In [None]:
# Measure accuracy on the validation loader.
# Evaluation mode turns dropout off, so no manual layer swap is needed.
mlp_model.train(False)

correct = 0
# Gradients are not needed for evaluation.
with torch.no_grad():
    for data, label in valid_dataloader:
        output = mlp_model(data)
        label = label.view(-1).long()
        preds = output.argmax(dim=1)
        # Accumulate over every batch so the result stays correct even if the
        # loader is configured with more than one batch.
        correct += (preds == label).sum().item()

accuracy = correct / max(len(valid_dataloader.dataset), 1)
print(f'accuracy:{accuracy}')

In [None]:
# Wrap the full training data (train + validation rows) for the final fit.
total_train_dataset = TensorData(x, y)
# Reshuffle every epoch so the mini-batches (and the batch statistics used by
# batch normalization) are not identical from epoch to epoch.
total_train_dataloader = DataLoader(total_train_dataset, batch_size=64, shuffle=True)

In [None]:
# Build a fresh network, loss, optimizer and LR schedule for the final fit on
# the full training data, using a single learning rate for every layer.
mlp_model = travelNet()
criterion = nn.CrossEntropyLoss()

lr1 = 0.01
lr2 = 0.001
lr3 = 0.0001
best_lr = 7.9094022518724e-05

# One parameter group covering the whole model. Listing only l1..l4 would
# leave the output layer `l5` out of the optimizer, so it would never be
# updated from its random initialization.
params_ft = [{'params': list(mlp_model.parameters()), 'lr': best_lr}]

optimizer = optim.Adam(params_ft)
# Divide the learning rate by 10 after 300 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=300, gamma=0.1)
epoch_num = 600

hyperopt로 탐색한 최적 매개변수 (위 셀의 `best_lr`는 이 결과의 `lr` 값과 같다):

`{'batch_norm': 'before_act', 'batch_size': 64.0, 'hidden_activation': 'relu', 'hidden_dropout': 0.0, 'hidden_layers': 4.0, 'hidden_units': 192.0, 'input_dropout': 0.05, 'optimizer': {'lr': 7.9094022518724e-05, 'type': 'adam'}}`

In [None]:
# Train on `total_train_dataloader` for `epoch_num` epochs and report, per
# epoch, the mean batch loss and the training accuracy.
mlp_model.train(True)

# Normalize by the loader that is actually iterated. Dividing by the size of
# the smaller hold-out training split would inflate the accuracy (it could
# exceed 1), and the summed per-batch mean losses belong to the batch count.
# max(..., 1) guards the divisions against an empty loader.
num_batches = max(len(total_train_dataloader), 1)
num_samples = max(len(total_train_dataloader.dataset), 1)

for epoch in range(epoch_num):
    print(f'Epoch {epoch+1}/{epoch_num} is running ---------------')

    running_loss = 0.0
    running_corrects = 0

    for data, label in total_train_dataloader:

        optimizer.zero_grad()
        # Call the module itself (not .forward) so that hooks are honoured.
        output = mlp_model(data)

        # Flatten the (batch, 1) float targets into a 1-D tensor of class
        # indices. view(-1) stays 1-D even when the batch has one sample,
        # whereas view([1, -1]).squeeze() would collapse it to 0-dim.
        label = label.view(-1).long()

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        preds = output.detach().argmax(dim=1)
        running_loss += loss.item()
        running_corrects += (preds == label).sum().item()

    epoch_loss = running_loss / num_batches
    epoch_corrects = running_corrects / num_samples

    scheduler.step()

    print(f'loss:{epoch_loss}, accuracy:{epoch_corrects}')


In [None]:
# The test set has no labels, so pair its features with an all-zero
# placeholder target just to satisfy TensorData's (x, y) constructor.
empty_torch = np.zeros(len(test_df), dtype=float)
test_features = test_df.to_numpy()
test_dataset = TensorData(test_features, empty_torch)
# One batch carries the entire test set.
test_dataloader = DataLoader(
    dataset=test_dataset,
    shuffle=False,
    batch_size=len(test_dataset),
)

In [None]:
# Predict a class for every test row and write the submission file.
mlp_model.train(False)

batch_preds = []
# Gradients are not needed for inference.
with torch.no_grad():
    for data, _ in test_dataloader:
        output = mlp_model(data)
        batch_preds.append(output.argmax(dim=1))

# Concatenate the per-batch predictions so that every row is covered even if
# the loader is configured with more than one batch.
preds = torch.cat(batch_preds)

sub_file['ProdTaken'] = preds.detach().numpy()
file_name = 'mlp_result.csv'
sub_file.to_csv(os.path.join(file_path, file_name))

In [None]:
len(test_df)

### keras 사용

In [None]:
import tensorflow as tf
from hyperopt import hp
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import ReLU, PReLU
from keras.layers.core import Dense, Dropout
from keras.layers import BatchNormalization
from keras.models import Sequential
from tensorflow.keras.optimizers import SGD, Adam
from sklearn.preprocessing import StandardScaler

# Baseline hyperparameters for the Keras MLP.
base_param = dict(
    input_dropout=0.0,
    hidden_layers=3,
    hidden_units=96,
    hidden_activation='relu',
    hidden_dropout=0.2,
    batch_norm='before_act',
    optimizer=dict(type='adam', lr=0.001),
    batch_size=64,
)

# Search space explored by hyperopt; the keys mirror those of `base_param`.
# The layer count, unit count and batch size are drawn with hp.quniform and
# appear to arrive as floats (e.g. 64.0), which is why MLP.fit casts them
# with int().
# The optimizer is a choice between Adam and SGD, each with its own
# log-uniform learning rate between 1e-5 and 1e-2.
param_space = {
    'input_dropout': hp.quniform('input_dropout', 0, 0.2, 0.05),
    'hidden_layers': hp.quniform('hidden_layers', 2, 4, 1),
    'hidden_units': hp.quniform('hidden_units', 32, 256, 32),
    'hidden_activation': hp.choice('hidden_activation', ['prelu', 'relu']),
    'hidden_dropout': hp.quniform('hidden_dropout', 0, 0.3, 0.05),
    'batch_norm': hp.choice('batch_norm', ['before_act', 'no']),
    'optimizer': hp.choice('optimizer',
                           [{'type': 'adam',
                             'lr': hp.loguniform('adam_lr', np.log(0.00001), np.log(0.01))},
                            {'type': 'sgd',
                             'lr': hp.loguniform('sgd_lr', np.log(0.00001), np.log(0.01))}]),
    'batch_size': hp.quniform('batch_size', 32, 128, 32),
}


class MLP:
    """Keras multilayer perceptron for binary classification.

    The architecture and optimizer are driven by a ``params`` dict with the
    keys 'input_dropout', 'hidden_layers', 'hidden_units',
    'hidden_activation' ('prelu' or 'relu'), 'hidden_dropout', 'batch_norm'
    ('before_act' enables batch normalization), 'optimizer' (a dict with
    'type' in {'sgd', 'adam'} and 'lr') and 'batch_size'.

    Inputs are fed to the network as-is; no standardization is applied here.
    """

    # Supported option values, checked before any layer is built so that an
    # unsupported choice fails immediately.
    _ACTIVATIONS = ('prelu', 'relu')
    _OPTIMIZERS = ('sgd', 'adam')

    def __init__(self, params):
        self.params = params
        # Unused while standardization is disabled; kept for compatibility.
        self.scaler = None
        self.model = None
        # History object returned by the most recent Keras fit call.
        self.history = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Build the network from ``self.params`` and train it.

        Trains for at most 200 epochs with early stopping (patience 20,
        best weights restored) evaluated on ``(va_x, va_y)``.

        Raises:
            NotImplementedError: if 'hidden_activation' or the optimizer
                'type' is not one of the supported values.
        """
        # Hyperparameters; integer-valued ones are cast because they may
        # arrive as floats.
        input_dropout = self.params['input_dropout']
        hidden_layers = int(self.params['hidden_layers'])
        hidden_units = int(self.params['hidden_units'])
        hidden_activation = self.params['hidden_activation']
        hidden_dropout = self.params['hidden_dropout']
        batch_norm = self.params['batch_norm']
        optimizer_type = self.params['optimizer']['type']
        optimizer_lr = self.params['optimizer']['lr']
        batch_size = int(self.params['batch_size'])

        if hidden_activation not in self._ACTIVATIONS:
            raise NotImplementedError(
                f'unsupported hidden_activation: {hidden_activation!r}')
        if optimizer_type not in self._OPTIMIZERS:
            raise NotImplementedError(
                f'unsupported optimizer type: {optimizer_type!r}')

        self.model = self._build_model(
            tr_x.shape[1], input_dropout, hidden_layers, hidden_units,
            hidden_activation, hidden_dropout, batch_norm)
        optimizer = self._build_optimizer(optimizer_type, optimizer_lr)

        # Objective and evaluation metric.
        self.model.compile(loss='binary_crossentropy',
                           optimizer=optimizer, metrics=['accuracy'])

        # Epoch budget and early stopping. Too large an epoch count may never
        # finish when the learning rate is small.
        nb_epoch = 200
        patience = 20
        early_stopping = EarlyStopping(patience=patience, restore_best_weights=True)

        self.history = self.model.fit(tr_x, tr_y,
                                      epochs=nb_epoch,
                                      batch_size=batch_size, verbose=1,
                                      validation_data=(va_x, va_y),
                                      callbacks=[early_stopping])

    @staticmethod
    def _build_model(n_features, input_dropout, hidden_layers, hidden_units,
                     hidden_activation, hidden_dropout, batch_norm):
        """Assemble the Sequential network (input dropout, hidden blocks, sigmoid output)."""
        activation_layers = {'prelu': PReLU, 'relu': ReLU}

        model = Sequential()

        # Input layer
        model.add(Dropout(input_dropout, input_shape=(n_features,)))

        # Hidden layers: Dense -> [BatchNorm] -> activation -> Dropout
        for _ in range(hidden_layers):
            model.add(Dense(hidden_units))
            if batch_norm == 'before_act':
                model.add(BatchNormalization())
            model.add(activation_layers[hidden_activation]())
            model.add(Dropout(hidden_dropout))

        # Output layer: a single sigmoid unit for the binary target
        model.add(Dense(1, activation='sigmoid'))
        return model

    @staticmethod
    def _build_optimizer(optimizer_type, optimizer_lr):
        """Create the SGD or Adam optimizer with the given learning rate."""
        # `learning_rate` is the current keyword; `lr` is a deprecated alias.
        # NOTE(review): `decay` is only accepted by the legacy Keras
        # optimizers -- confirm against the installed TensorFlow version.
        if optimizer_type == 'sgd':
            return SGD(learning_rate=optimizer_lr, decay=1e-6, momentum=0.9, nesterov=True)
        return Adam(learning_rate=optimizer_lr, beta_1=0.9, beta_2=0.999, decay=0.)

    def predict(self, x):
        """Return the predicted probabilities for ``x`` as a flat 1-D array."""
        y_pred = self.model.predict(x)
        y_pred = y_pred.flatten()
        return y_pred


# -----------------------------------
# 매개변수 튜닝의 실행
# -----------------------------------
from hyperopt import fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import log_loss


def score(params):
    """Objective minimized by hyperopt for one hyperparameter set.

    Trains an ``MLP`` with ``params`` on the training split, computes the
    log loss of its predictions on the validation split, appends
    ``(params, loss)`` to the module-level ``history`` list and returns the
    result dict that hyperopt expects.
    """
    candidate = MLP(params)
    candidate.fit(train_x, train_y, valid_x, valid_y)
    valid_loss = log_loss(valid_y, candidate.predict(valid_x))
    print(f'params: {params}, logloss: {valid_loss:.4f}')

    # Keep a record of every trial
    history.append((params, valid_loss))

    return {'loss': valid_loss, 'status': STATUS_OK}


# Run the hyperopt parameter search
history = []
trials = Trials()
max_evals = 10
fmin(score, param_space, algo=tpe.suggest, max_evals=max_evals, trials=trials)

# Report the best parameters and score from the recorded trials.
# `trials` holds the same information, but the parameters are harder to
# extract from it.
history.sort(key=lambda record: record[1])
best = history[0]
best_params, best_loss = best
print(f'best params:{best_params}, score:{best_loss:.4f}')

In [None]:
tr_x

In [None]:
train_df_2 = pd.read_csv(os.path.join(file_path, train_path))

In [None]:
# Show the distinct values of every column of the raw training data.
for col, values in train_df_2.items():
    print(f'col:{col}, uniuqe: {values.unique()}')