# **필요한 라이브러리 불러오기**

In [1]:
!pip install catboost -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import datetime
import copy
import numpy as np
import joblib
from functools import reduce
from tqdm import tqdm
from itertools import product
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# **데이터 불러오기**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

train = pd.read_parquet(path = '/content/drive/MyDrive/Colab Notebooks/머신러닝_ZBDS/modeling/sample_train (ratio = 0.5).parquet')
transactions = pd.read_parquet(path = '/content/drive/MyDrive/Colab Notebooks/머신러닝_ZBDS/modeling/sample_transactions (ratio = 0.5).parquet')
members = pd.read_parquet(path = '/content/drive/MyDrive/Colab Notebooks/머신러닝_ZBDS/modeling/sample_members (ratio = 0.5).parquet')
user_logs = pd.read_parquet(path = '/content/drive/MyDrive/Colab Notebooks/머신러닝_ZBDS/modeling/sample_user_logs (ratio = 0.5).parquet')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **전처리**

* 수행한 전처리
  * 구매 기록
    * 거래 일자가 중복되는 데이터는 가장 마지막만 남김
    * 멤버십 만료일자가 KKBox 서비스 시작연도 이전이거나 <br>데이터 공개시점인 2017년 3월 30일로부터 1년 1개월※이후인 경우는 제외 <br>(최대 1년 단위 구독 가능 + 30일 무료 체험기간)
  * 인적 정보
    * 아래의 사유로 인해 정보의 신뢰도가 낮다고 판단되는 성별, 나이 데이터는 제외
      * 결측치 혹은 이상치의 비중이 높음
      * 고객이 직접 입력할 뿐 별도의 인증절차가 요구되지 않음
  * 사용 기록
    * 일일 총 재생시간이 음수이거나 하루(24시간 * 60분 * 60초)를 넘어서는 데이터는 삭제
    * Boxplot 전개 시 확인되는 재생 음원 수의 이상치 데이터 삭제 (IQR 활용)  

In [None]:
# 전처리 함수 정의
def preprocess(transactions, members, user_logs):

    # transactions
    transactions = transactions.drop_duplicates(subset = ['msno_num', 'transaction_date'], keep ='last')
    con_1 = transactions['transaction_date'] <= transactions['membership_expire_date']
    transactions = transactions[con_1]

    min_timestamp = pd.Timestamp(datetime.date(2005, 1, 1))
    max_timestamp = pd.Timestamp(datetime.date(2018, 4, 30))
    con_2 = transactions['membership_expire_date'] >= min_timestamp
    con_3 = transactions['membership_expire_date'] <= max_timestamp
    transactions = transactions[con_2 & con_3]

    # members
    members = members.drop(['gender', 'bd'], axis = 1)

    # user_logs
    con_1 = user_logs['total_secs'] >= 0
    con_2 = user_logs['total_secs'] < 86400
    user_logs = user_logs[con_1 & con_2]

    target_columns = user_logs.columns[2:8].tolist()
    for column in target_columns:
        con_3 = user_logs[column] >= 0
        user_logs = user_logs[con_3]
        q1 = user_logs[column].quantile(0.25)
        q3 = user_logs[column].quantile(0.75)
        iqr = q3 - q1
        con_4 = user_logs[column] <= q3 + 1.5*iqr
        con_5 = user_logs[column] >= q1 - 1.5*iqr
        user_logs = user_logs[con_4 & con_5]

    return transactions, members, user_logs

In [None]:
# 전처리 실행
transactions, members, user_logs = preprocess(transactions, members, user_logs)

  con_2 = transactions['membership_expire_date'] >= min_timestamp
  con_3 = transactions['membership_expire_date'] <= max_timestamp


# **파생 변수 생성 및 학습용 데이터 완성**

<table align = 'center'>
  <tr>
    <th valign = 'center' align = 'center'>종류</th>
    <th valign = 'center' align = 'center'>변수명</th>
    <th valign = 'center' align = 'center'>내용</th>
  </tr>
  <tr>
    <td rowspan = '6' valign = 'center' align = 'center'>구매 기록</td>
    <td>in_membership_days</td>
    <td>멤버십을 유지한 기간</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>구매 기록</td>-->
    <td>is_cancel</td>
    <td>구독을 취소한 경험의 유무</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>구매 기록</td>-->
    <td>is_auto_renew</td>
    <td>자동 구독 이용 여부 <br> (전체 구매 중에서 자동 구독의 비율이 50% 이상인 경우 참)</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>구매 기록</td>-->
    <td>discount</td>
    <td>할인 받은 구독 거래의 횟수</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>구매 기록</td>-->
    <td>is_method_change</td>
    <td>가입 후 결제수단을 변경한 이력의 유무</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>구매 기록</td>-->
    <td>pri_payment_method</td>
    <td>가입 후 가장 자주 사용한 결제수단</td>
  </tr>
  <tr>
    <td rowspan = '4' valign = 'center' align = 'center'>인적 정보</td>
    <td>city</td>
    <td>이용자 거주 도시</td>
  </tr>
  <tr>
    <!--<td rowspan = '4' valign = 'center' align = 'center'>인적 정보</td>-->
    <td>register_via</td>
    <td>서비스 가입 경로</td>
  </tr>
  <tr>
    <!--<td rowspan = '4' valign = 'center' align = 'center'>인적 정보</td>-->
    <td>register_init_time</td>
    <td>최초 가입 연도</td>
  </tr>
  <tr>
    <!--<td rowspan = '4' valign = 'center' align = 'center'>인적 정보</td>-->
    <td>after_regit_to_buy</td>
    <td>가입 후 최초 구입까지 걸린 시간 (일)</td>
  </tr>
  <tr>
    <td rowspan = '6' valign = 'center' align = 'center'>사용 기록</td>
    <td>per_25</td>
    <td>25% 이하 감상한 노래의 비율</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>사용 기록</td>-->
    <td>per_25_75</td>
    <td>25~75% 감상한 노래의 비율</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>사용 기록</td>-->
    <td>per_100</td>
    <td>75% 이상 감상한 노래의 비율</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>사용 기록</td>-->
    <td>seconds_per_song</td>
    <td>총 재생시간 / 사용자가 재생한 음악의 수</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>사용 기록</td>-->
    <td>mean_seconds</td>
    <td>평균 노래 재생시간</td>
  </tr>
  <tr>
    <!--<td rowspan = '6' valign = 'center' align = 'center'>사용 기록</td>-->
    <td>max_log_term</td>
    <td>최장 미접속 기간</td>
  </tr>
</table>

In [None]:
def gen_data(train, transactions, members, user_logs):
  # transactions
  transactions.sort_values(['msno_num', 'transaction_date'], ascending = [1, 1], inplace = True)

  ## 멤버십 유지기간
  tmp_0 = []
  grouped = transactions.groupby('msno_num')
  for key, group in tqdm(grouped):
    group['section_num'] = [0 for _ in range(group.shape[0])]
    section_num = 0
    for row_num in range(group.shape[0]):
      if group.iloc[row_num, 8] == 0:
        group.iloc[row_num, 9] = section_num
      else:
        group.iloc[row_num, 9] = section_num
        section_num += 1
    agg_format = {'transaction_date' : 'min', 'membership_expire_date' : 'max'}
    group = group.groupby('section_num').agg(agg_format)
    group['in_membership_days'] = (group['membership_expire_date'] - group['transaction_date']).dt.days
    tmp_0.append([key, np.sum(group['in_membership_days'])])
  tmp_0 = pd.DataFrame(tmp_0, columns = ['msno_num', 'in_membership_days'])

  ## 구독 취소 경험 유무
  tmp_1 = pd.DataFrame(transactions.groupby('msno_num', as_index = False)['is_cancel'].sum())
  tmp_1['is_cancel'] = [1 if data >= 1 else 0 for data in tmp_1['is_cancel'].tolist()]

  ## 구독 자동 갱신 여부
  agg_format = {'is_auto_renew' : 'sum', 'transaction_date' : 'count'}
  tmp_2 = pd.DataFrame(transactions.groupby('msno_num', as_index = False).agg(agg_format))
  tmp_2.rename({'transaction_date' : 'total_nums'}, axis = 1, inplace = True)
  tmp_2['is_auto_renew'] = [1 if data_1 >= data_2 * 0.5 else 0 for data_1, data_2 in zip(tmp_2['is_auto_renew'], tmp_2['total_nums'])]
  tmp_2.drop('total_nums', axis = 1, inplace = True)

  ## 할인 받은 구독 거래의 횟수
  tmp_3 = copy.deepcopy(transactions)
  tmp_3['discount'] = [1 if data_1 > data_2 else 0 for data_1, data_2 in zip(tmp_3['plan_list_price'], tmp_3['actual_amount_paid'])]
  tmp_3 = pd.DataFrame(tmp_3.groupby('msno_num', as_index = False)['discount'].sum())

  ## 최초 거래일
  tmp_4 = pd.DataFrame(transactions.groupby('msno_num', as_index = False)['transaction_date'].min())
  tmp_4.rename({'transaction_date' : 'transaction_start_date'}, axis = 1, inplace = True)

  ## 거래 수단 변경 여부
  tmp_5 = pd.DataFrame(transactions.groupby('msno_num', as_index = False)['payment_method_id'].nunique())
  tmp_5.rename({'payment_method_id' : 'is_method_change'}, axis = 1, inplace = True)
  tmp_5['is_method_change'] = [1 if data >= 2 else 0 for data in tmp_5['is_method_change'].tolist()]

  ## 가장 자주 사용한 거래 수단
  tmp_6 = pd.DataFrame(transactions.groupby('msno_num', as_index = False)['payment_method_id'].\
                      agg({'pri_payment_method':lambda x:x.mode()[0]}))

  ## tmp_0부터 tmp_6까지 합치기
  dfs = [tmp_0, tmp_1, tmp_2, tmp_3, tmp_4, tmp_5, tmp_6]
  data = reduce(lambda left, right: pd.merge(left, right, on = 'msno_num', how = 'inner'), dfs)

  print('transactions 병합 완료')

  # members
  data = data.merge(members, on = 'msno_num', how = 'inner')

  ## 가입 후 최초 거래까지의 기간
  data['after_regit_to_buy'] = [(start_date - regist_date).days for start_date, regist_date in zip(data['transaction_start_date'], data['registration_init_time'])]
  data.drop('transaction_start_date', axis = 1, inplace = True)

  ## 가입 날짜는 연도만 사용
  data['registration_init_time'] = pd.to_datetime(data['registration_init_time']).dt.year

  print('members 병합 완료')

  # user_logs
  user_logs.sort_values(['msno_num', 'date'], ascending = [1, 1], inplace = True)

  ## 최장 미접속 기간
  diff_df = []
  grouped = user_logs.groupby('msno_num')
  for key, group in tqdm(grouped):
    cond_df = group['date']
    if cond_df.shape[0] == 1:
      diff_df.append([key, 0])
    else:
      diff_df.append([key, np.max(cond_df.diff().dt.days[1:])])
  diff_df = pd.DataFrame(data = diff_df, columns = ['msno_num', 'max_log_term'])

  ## n% 감상한 음원의 비율, 음원당 재생시간, 평균 음원 재생시간
  target_columns = user_logs.columns[2:7].tolist()
  for idx, column in enumerate(target_columns):
    if idx == 0:
      user_logs['num_total'] = user_logs[column]
    else:
      user_logs['num_total'] += user_logs[column]
  user_logs['per_25'] = user_logs['num_25'] / user_logs['num_total']
  user_logs['per_25_75'] = (user_logs['num_50'] + user_logs['num_75']) / user_logs['num_total']
  user_logs['per_100'] = (user_logs['num_985'] + user_logs['num_100']) / user_logs['num_total']
  user_logs['seconds_per_song'] = user_logs['total_secs'] / user_logs['num_total']
  mean_vars = ['per_25', 'per_25_75', 'per_100', 'seconds_per_song', 'total_secs']
  mean_df = pd.DataFrame(user_logs.groupby('msno_num', as_index = False)[mean_vars].mean())
  mean_df.rename({'total_secs' : 'mean_seconds'}, axis = 1, inplace = True)

  final_df = diff_df.merge(mean_df, on = 'msno_num', how = 'inner')
  data = data.merge(final_df, on = 'msno_num', how = 'inner')

  print('user_logs 병합 완료')

  # 자주 사용하는 결제수단, 거주 도시, 가입경로, 가입연도는 더미변수로 변환
  data = pd.get_dummies(data, columns = ['pri_payment_method', 'city', 'registered_via', 'registration_init_time'])

  # train
  data = data.merge(train, on = 'msno_num', how = 'inner')

  print('train 병합 완료')

  return data

In [None]:
data_target = gen_data(train = train,
                       transactions = transactions,
                       members = members,
                       user_logs = user_logs)

100%|██████████| 477576/477576 [36:07<00:00, 220.35it/s]


transactions 병합 완료
members 병합 완료


100%|██████████| 476479/476479 [07:39<00:00, 1037.78it/s]


user_logs 병합 완료
train 병합 완료


# **훈련/테스트 데이터 분리, 스케일링, 오버 샘플링**

In [8]:
scale_columns = ['in_membership_days', 'discount', 'max_log_term',
                'per_25', 'per_25_75', 'per_100', 'seconds_per_song',
                'mean_seconds', 'after_regit_to_buy']
ct = ColumnTransformer([('scaling', StandardScaler(), scale_columns)],
                      remainder = 'passthrough')

def split_rescale_resample(data_target, ct, scaling = False, sel_dec = 'plain', resampling = 'plain'):
  # 훈련/테스트 데이터 분리
  input = data_target.iloc[:, 1:-1]
  target = data_target.iloc[:, -1]
  train_input, test_input, train_target, test_target = train_test_split(input, target, stratify = target, random_state = 42)

  # 스케일링 (훈련용 데이터에는 fit_transform, 테스트 데이터에는 transform)
  if scaling == False:
    pass
  elif scaling == True:
    train_input = pd.DataFrame(ct.fit_transform(train_input), columns = ct.get_feature_names_out())
    test_input = pd.DataFrame(ct.transform(test_input), columns = ct.get_feature_names_out())

  # sel_dec의 값에 따라 PCA 혹은 SelectKBest를 반영
  if sel_dec == 'plain':
    pass
  elif sel_dec == 'pca':
    pca = PCA(n_components = 0.90)
    train_input = pca.fit_transform(train_input)
    test_input = pca.transform(test_input)
  elif sel_dec == 'select':
    selector = SelectKBest(score_func = f_classif, k = 10)
    train_input = selector.fit_transform(train_input, train_target)
    test_input = selector.transform(test_input)

  # 리샘플링 (훈련용 데이터에만 적용)
  if resampling == 'plain':
    pass
  elif resampling == 'over':
    smote = SMOTE(random_state = 42)
    train_input, train_target = smote.fit_resample(train_input, train_target)
  elif resampling == 'mixed':
    smotetomek = SMOTETomek(random_state = 42)
    train_input, train_target = smotetomek.fit_resample(train_input, train_target)
  elif resampling == 'under':
    tomek = TomekLinks()
    train_input, train_target = tomek.fit_resample(train_input, train_target)

  return train_input, test_input, train_target, test_target

# **모델링 및 평가**

* 분류 알고리즘, 스케일링 여부, 리샘플링 여부 등 경우의 수를 총 72가지로 나누어 각각 모델링을 수행하고 평가함
* Precision 기준 과적합의 정도가 가장 적으면서, Test Precision이 높았던 아래 조합을 기준으로 하이퍼파라미터 튜닝을 진행함
  * Standard Scaler 사용
  * Feature Selection이나 PCA를 사용하지 않음
  * 오버샘플링, 언더샘플링, 복합샘플링 사용하지 않음
  * LGBM 모델 사용

In [None]:
out_lgbm = lgb.LGBMClassifier(application = 'binary', metric = 'binary_logloss')
out_cat = CatBoostClassifier(logging_level = 'Silent')
out_xgb = XGBClassifier()
out_list = [[out_lgbm, 'lightgbm'], [out_cat, 'catboost'], [out_xgb, 'xgboost']]

params = [[True, False],['plain', 'pca', 'select'], ['plain', 'over', 'mixed', 'under']]
case_list = list(product(*list(params)))
result_df = []

for c in tqdm(case_list):
  train_input, test_input, train_target, test_target = split_rescale_resample(data_target = data_target,
                                                                              ct = ct,
                                                                              scaling = c[0],
                                                                              sel_dec = c[1],
                                                                              resampling = c[2])
  for out in out_list:
    model = out[0].fit(train_input, train_target)
    train_pred = model.predict(train_input)
    test_pred = model.predict(test_input)
    result_df.append([c[0], c[1], c[2], out[1],
                      np.round(accuracy_score(train_target, train_pred), 4),
                      np.round(accuracy_score(test_target, test_pred), 4),
                      np.round(recall_score(train_target, train_pred), 4),
                      np.round(recall_score(test_target, test_pred), 4),
                      np.round(precision_score(train_target, train_pred), 4),
                      np.round(precision_score(test_target, test_pred), 4),
                      np.round(f1_score(train_target, train_pred), 4),
                      np.round(f1_score(test_target, test_pred), 4)])

result_df = pd.DataFrame(result_df, columns = ['scaling', 'sel_dec', 'resampling','model_name',
                                               'train_accuracy', 'test_accuracy',
                                               'train_recall', 'test_recall',
                                               'train_precision', 'test_precision',
                                               'train_f1_score', 'test_f1_score'])
result_df['precision_gap'] = abs(result_df['train_precision'] - result_df['test_precision'])

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 24/24 [1:35:49<00:00, 239.55s/it]


In [None]:
final_result = result_df[(result_df['precision_gap'] <= 0.1) & (result_df['test_f1_score'] >= 0.5)]
final_result.sort_values(['test_precision'], ascending = [0], inplace = True)
final_result.reset_index(drop = True, inplace = True)
final_result.head(72)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_result.sort_values(['test_precision'], ascending = [0], inplace = True)


Unnamed: 0,scaling,sel_dec,resampling,model_name,train_accuracy,test_accuracy,train_recall,test_recall,train_precision,test_precision,train_f1_score,test_f1_score,precision_gap
0,True,plain,plain,lightgbm,0.9115,0.9091,0.4561,0.4449,0.7632,0.7496,0.571,0.5583,0.0136
1,False,plain,plain,lightgbm,0.9117,0.9091,0.4569,0.4453,0.7645,0.7487,0.5719,0.5585,0.0158
2,False,plain,plain,catboost,0.9224,0.9112,0.5353,0.4877,0.7971,0.735,0.6405,0.5864,0.0621
3,True,plain,plain,xgboost,0.9194,0.9098,0.5154,0.4748,0.7863,0.7321,0.6227,0.576,0.0542
4,False,plain,plain,xgboost,0.9194,0.9098,0.5154,0.4748,0.7863,0.7321,0.6227,0.576,0.0542
5,True,plain,plain,catboost,0.9222,0.9105,0.534,0.4859,0.7961,0.7308,0.6392,0.5837,0.0653
6,False,plain,under,lightgbm,0.9111,0.909,0.4919,0.4803,0.7669,0.7222,0.5994,0.5769,0.0447
7,False,plain,under,catboost,0.9221,0.9107,0.5644,0.5164,0.8007,0.7131,0.6621,0.5991,0.0876
8,True,plain,under,lightgbm,0.915,0.9092,0.5158,0.5042,0.771,0.7088,0.6181,0.5892,0.0622
9,False,plain,under,xgboost,0.9185,0.9088,0.5446,0.5042,0.7876,0.7056,0.6439,0.5881,0.082


# **하이퍼파라미터 튜닝**

* Average-Precision을 기준하여 모델을 선정, 아래와 같은 성능 변화가 확인됨
  * 하이퍼 파라미터
    * learning_rate : 0.1,
    * min_child_samples : 20
    * n_estimators : 50,
    * num_leaves : 248
  * 성능 변화
    * Test Recall : 0.4449 -> 0.4912 (+ 0.0463, 기존 대비 약 10% 상승)
    * Test Precision : 0.7496 -> 0.7389 (- 0.0107, 기존 대비 약 1% 하락)

In [None]:
from google.colab import drive
drive.mount('/content/drive')
data_target = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/머신러닝_ZBDS/modeling/data_target.parquet')

Mounted at /content/drive


In [None]:
# 하이퍼 파리미터 튜닝을 위한 Grid Search 실행
out_lgbm = lgb.LGBMClassifier(application = 'binary', metric = 'binary_logloss')
train_input, test_input, train_target, test_target = split_rescale_resample(data_target = data_target,
                                                                            scaling = True,
                                                                            sel_dec = 'plain',
                                                                            resampling = 'plain')
params = {'n_estimators' : [50, 100, 200],
          'learning_rate' : [0.001, 0.01, 0.1],
          'min_child_samples' : [10, 20, 40],
          'num_leaves' : [15, 31, 62, 124, 248]}

gs = GridSearchCV(out_lgbm, params, scoring = 'average_precision', verbose = 4)
gs.fit(train_input, train_target)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV 1/5] END learning_rate=0.001, min_child_samples=10, n_estimators=50, num_leaves=15;, score=0.566 total time=   1.2s
[CV 2/5] END learning_rate=0.001, min_child_samples=10, n_estimators=50, num_leaves=15;, score=0.578 total time=   0.9s
[CV 3/5] END learning_rate=0.001, min_child_samples=10, n_estimators=50, num_leaves=15;, score=0.560 total time=   0.9s
[CV 4/5] END learning_rate=0.001, min_child_samples=10, n_estimators=50, num_leaves=15;, score=0.570 total time=   0.9s
[CV 5/5] END learning_rate=0.001, min_child_samples=10, n_estimators=50, num_leaves=15;, score=0.560 total time=   0.9s
[CV 1/5] END learning_rate=0.001, min_child_samples=10, n_estimators=50, num_leaves=31;, score=0.610 total time=   1.0s
[CV 2/5] END learning_rate=0.001, min_child_samples=10, n_estimators=50, num_leaves=31;, score=0.610 total time=   1.0s
[CV 3/5] END learning_rate=0.001, min_child_samples=10, n_estimators=50, num_leaves=31;, score=0.

In [None]:
# 하이퍼 파라미터 튜닝을 마친 최적의 모델
dt = gs.best_estimator_
train_pred = dt.predict(train_input)
test_pred = dt.predict(test_input)

print('Best Params : ', gs.best_params_)

print('Train accuracy : ', round(accuracy_score(train_target, train_pred), 4))
print('Test accuracy : ', round(accuracy_score(test_target, test_pred), 4))

print('Train recall : ', round(recall_score(train_target, train_pred), 4))
print('Test recall : ', round(recall_score(test_target, test_pred), 4))

print('Train precision : ', round(precision_score(train_target, train_pred), 4))
print('Test precision : ', round(precision_score(test_target, test_pred), 4))

Best Params :  {'learning_rate': 0.1, 'min_child_samples': 20, 'n_estimators': 50, 'num_leaves': 248}
Train accuracy :  0.9206
Test accuracy :  0.9119
Train recall :  0.528
Test recall :  0.4912
Train precision :  0.7865
Test precision :  0.7389


In [None]:
# 모델 및 스케일러 저장
joblib.dump(dt, './churn_predict_best_lgbm.pkl')
joblib.dump(ct, '/content/column_transformer.pkl')

['./churn_predict_best_lgbm.pkl']