# 패키지 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings(action='ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Folder (mounted Google Drive path) that holds the competition CSV files.
data_dir = '/content/drive/MyDrive/앙상블 강의자료/과제/'

# Read the three tables in one pass; data_dir already ends with '/', so plain
# concatenation produces valid file paths.
train, test, submission = (
    pd.read_csv(data_dir + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# ID 변수 생성

본 변수를 생성하여 제출하면 점수가 0.06502까지 개선됨

In [3]:
# def get_id(x):
#   return str(x['Total_Trans_Ct']) + '_' + str(x['Total_Relationship_Count']) + '_' + str(x['Months_Inactive_12_mon']) \
#   + '_' + str(x['Gender']) + '_' + str(x['Customer_Age']) + '_' + str(x['Contacts_Count_12_mon']) \
#   + '_' + str(x['Months_on_book'])

# train['ID'] = train.apply(lambda x : get_id(x), axis = 1)
# test['ID'] = test.apply(lambda x : get_id(x), axis = 1)

# 라벨인코딩

저는 단순히 라벨인코딩을 하고 파라미터 튜닝을 통해서 모델의 성능을 올렸습니다

In [4]:
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook

# Work on copies so the raw train/test frames are never modified.
temp_train, temp_test = train.copy(), test.copy()

# Object-dtype (categorical) columns, detected on the test frame.
cat_cols = [name for name, dtype in test.dtypes.items() if dtype == 'O']

lbl = LabelEncoder()

for col in tqdm_notebook(cat_cols):
  # Fit on the train and test values together so both frames share one mapping.
  temp_df = pd.concat([temp_train[[col]], temp_test[[col]]], axis=0)
  lbl.fit(temp_df[col])

  # Replace the raw categories with their integer codes in both copies.
  for frame in (temp_train, temp_test):
    frame[col] = lbl.transform(frame[col])

# Every column of the test frame is used as a model input.
features = list(test.columns)
target = 'Attrition_Flag'

X_train, y_train = temp_train[features], temp_train[target]
X_test = temp_test[features]

  0%|          | 0/5 [00:00<?, ?it/s]

# 파라미터 튜닝 X / OOF X

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation, stratified on the target.
train_X, val_X, train_y, val_y = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=16,
)

In [6]:
from xgboost import XGBClassifier

# Untuned baseline: only the seed and the number of boosting rounds are set.
model = XGBClassifier(n_estimators = 500, random_state = 16)
model.fit(train_X, train_y)

# Class probabilities for the hold-out split (scored below) and for the test
# set (written to the submission), computed in that order.
y_val_pred, y_pred = (model.predict_proba(frame) for frame in (val_X, X_test))

In [7]:
from sklearn.metrics import log_loss

# Log loss of the baseline model on the 20% hold-out split.
holdout_logloss = log_loss(val_y, y_val_pred)
print("파라미터 튜닝 X / OOF X Score :{} ".format(holdout_logloss))

파라미터 튜닝 X / OOF X Score :0.06932638344101702 


In [8]:
# Overwrite every column after the first one with the baseline model's test-set probabilities.
# NOTE(review): assumes the column order of predict_proba matches the submission's
# probability columns — confirm against model.classes_.
submission.iloc[:, 1:] = y_pred

In [9]:
# Write the baseline submission into the data folder, then preview its first rows.
output_path = data_dir + 'tuning_X_oof_X.csv'
submission.to_csv(output_path, index = False)
submission.head()

Unnamed: 0,id,Existing Customer,Attrited Customer
0,0,0.996366,0.003634
1,1,0.998413,0.001587
2,2,0.724303,0.275697
3,3,0.998183,0.001817
4,4,0.999942,5.8e-05


In [10]:
# Tabulate the fitted model's importance score per feature and show it sorted, highest first.
importance_table = pd.DataFrame(model.feature_importances_, index = train_X.columns)
importance_table.sort_values(by = 0, ascending = False)

Unnamed: 0,0
Total_Trans_Ct,0.195833
Total_Revolving_Bal,0.178432
Total_Relationship_Count,0.128161
Total_Ct_Chng_Q4_Q1,0.081185
Total_Trans_Amt,0.080688
Months_Inactive_12_mon,0.072554
Total_Amt_Chng_Q4_Q1,0.043227
Gender,0.038911
Customer_Age,0.029999
Contacts_Count_12_mon,0.029445


# 파라미터 튜닝 X / OOF O

In [11]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

# Untuned XGBoost; the same estimator object is fitted again on every fold.
model = XGBClassifier(random_state = 16, n_estimators = 500)

n_split = 5
skf = StratifiedKFold(n_splits = n_split, shuffle=True, random_state=22)

# Out-of-fold class probabilities for the training rows (one row per sample, two columns)
oof_val = np.zeros((y_train.shape[0], 2))

# Test-set class probabilities, accumulated as the average over all folds
oof_test = np.zeros((X_test.shape[0], 2))

# Out-of-fold prediction loop.
# skf.split() is a generator without a length, so the fold count is passed as
# `total`; otherwise the progress bar cannot show progress and prints "0it".
for trn_idx, val_idx in tqdm_notebook(skf.split(X_train, y_train), total = n_split):
  trn_data, trn_label = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
  val_data, val_label = X_train.iloc[val_idx], y_train.iloc[val_idx]

  # NOTE(review): the patience (500) equals n_estimators (500), so training
  # presumably can never stop early here — confirm this is intended.
  # NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric
  # on the constructor instead of fit() — confirm against the installed version.
  model.fit(trn_data, trn_label, eval_set=[(val_data, val_label)], early_stopping_rounds = 500, verbose=False, eval_metric = 'logloss')

  # Store this fold's predictions at the positions of its validation rows
  model_valid_pred = model.predict_proba(val_data)
  oof_val[val_idx] = model_valid_pred

  # Each fold contributes 1/n_split of the final test prediction
  model_test_pred = model.predict_proba(X_test)
  oof_test += model_test_pred / n_split

0it [00:00, ?it/s]

In [12]:
from sklearn.metrics import log_loss

# Log loss of the out-of-fold predictions over the whole training set.
oof_logloss = log_loss(y_train, oof_val)
print("파라미터 튜닝 X / OOF O Score :{} ".format(oof_logloss))

파라미터 튜닝 X / OOF O Score :0.07274058189378227 


In [13]:
# Overwrite every column after the first one with the fold-averaged test-set probabilities.
# NOTE(review): assumes the column order of predict_proba matches the submission's
# probability columns — confirm against model.classes_.
submission.iloc[:, 1:] = oof_test

In [14]:
# Write the untuned out-of-fold submission into the data folder, then preview its first rows.
output_path = data_dir + 'tuning_X_oof_O.csv'
submission.to_csv(output_path, index = False)
submission.head()

Unnamed: 0,id,Existing Customer,Attrited Customer
0,0,0.988864,0.011136
1,1,0.995646,0.004354
2,2,0.840674,0.159326
3,3,0.996305,0.003695
4,4,0.99979,0.00021


# 파라미터 튜닝 O / OOF O

In [None]:
# # 파라미터 튜닝용 패키지 다운로드
# !pip install optuna

In [16]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

In [17]:
# def objective(trial, data = X_train, target = y_train):
    
#     train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=16, stratify = target)
#     param = {
#         'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
#         'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
#         'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
#         'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
#         'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
#         'n_estimators': 500,
#         'max_depth': trial.suggest_int('max_depth', 2, 20),
#         'random_state': 16,
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
#         'objective': 'binary:logistic',
#     }

#     model = xgb.XGBClassifier(**param)
    
#     model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds = 500, verbose=False, eval_metric = 'logloss')
    
#     preds = model.predict_proba(test_x)
    
#     logloss = log_loss(test_y, preds)
    
#     return logloss

In [18]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [19]:
n_split = 5
skf = StratifiedKFold(n_splits = n_split, shuffle=True, random_state=22)

# Out-of-fold class probabilities for the training rows (one row per sample, two columns)
oof_val = np.zeros((y_train.shape[0], 2))

# Test-set class probabilities, accumulated as the average over all folds
oof_test = np.zeros((X_test.shape[0], 2))

# Tuned hyper-parameters.
# NOTE(review): presumably copied from the Optuna study in the commented-out
# cells above, with n_estimators raised to 10000 — confirm.
# NOTE(review): 'gpu_hist' presumably needs a GPU runtime — confirm before running on CPU.
params = {
    'lambda': 0.0010360599058905956, 
    'alpha': 0.011567539976325336, 
    'colsample_bytree': 0.9, 
    'subsample': 0.8, 
    'learning_rate': 0.03655776032101411, 
    'max_depth': 5, 
    'min_child_weight': 1, 
    'n_estimators': 10000,
    'random_state': 16,
    'tree_method':'gpu_hist',}

# The same estimator object is fitted again on every fold.
model = xgb.XGBClassifier(**params)

# Out-of-fold prediction loop.
# skf.split() is a generator without a length, so the fold count is passed as
# `total`; otherwise the progress bar cannot show progress and prints "0it".
for trn_idx, val_idx in tqdm_notebook(skf.split(X_train, y_train), total = n_split):
  trn_data, trn_label = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
  val_data, val_label = X_train.iloc[val_idx], y_train.iloc[val_idx]

  # The validation fold doubles as the early-stopping monitor (patience of 500 rounds).
  # NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric
  # on the constructor instead of fit() — confirm against the installed version.
  model.fit(trn_data, trn_label, eval_set=[(val_data, val_label)], early_stopping_rounds = 500, verbose=False, eval_metric = 'logloss')

  # Store this fold's predictions at the positions of its validation rows
  model_valid_pred = model.predict_proba(val_data)
  oof_val[val_idx] = model_valid_pred

  # Each fold contributes 1/n_split of the final test prediction
  model_test_pred = model.predict_proba(X_test)
  oof_test += model_test_pred / n_split

0it [00:00, ?it/s]

In [20]:
from sklearn.metrics import log_loss

# Log loss of the tuned model's out-of-fold predictions over the whole training set.
tuned_oof_logloss = log_loss(y_train, oof_val)
print("파라미터 튜닝 O / OOF O Score :{} ".format(tuned_oof_logloss))

파라미터 튜닝 O / OOF O Score :0.07436080205196653 


In [21]:
# Overwrite every column after the first one with the tuned model's fold-averaged test-set probabilities.
# NOTE(review): assumes the column order of predict_proba matches the submission's
# probability columns — confirm against model.classes_.
submission.iloc[:, 1:] = oof_test

In [22]:
# Write the tuned out-of-fold submission into the data folder, then preview its first rows.
output_path = data_dir + 'tuning_O_oof_O.csv'
submission.to_csv(output_path, index = False)
submission.head()

Unnamed: 0,id,Existing Customer,Attrited Customer
0,0,0.997294,0.002706
1,1,0.997453,0.002547
2,2,0.956214,0.043786
3,3,0.99454,0.00546
4,4,0.999869,0.000131
