In [1]:
# Notebook 출력설정
# 주요 라이브러리 임포트

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns',2000)
pd.set_option('display.width', 2000)

%matplotlib inline
import matplotlib.pylab as plt
plt.rcParams["figure.figsize"] = (15,5)
plt.rcParams['lines.linewidth'] = 1
plt.rcParams['axes.grid'] = True
import seaborn as sns

In [2]:
# 원본 데이터셋 로드
DIR_DATASET = "C:/Users/0stix/Datasets/"
NAME_PROJECT = "2203-kaggle-netflix_appetency"

In [3]:
df_train = pd.read_csv(DIR_DATASET + NAME_PROJECT + '/train.csv', index_col='id')
df_test = pd.read_csv(DIR_DATASET + NAME_PROJECT + '/test.csv', index_col='id')

len_train = len(df_train)
df_all = pd.concat([df_train, df_test], axis=0)
target_ = 'target'

In [4]:
df_v0 = pd.DataFrame()
df_v0[target_] = df_all[target_]
lst_p = [
    'feature_269',
]
set_p = set(lst_p)

lst_ = []
for idx_, attrib_ in enumerate(df_all.columns):
    # if attrib_ not in set_p:
    v_ = df_all[attrib_].value_counts()
    s_ = 'v_' + str(idx_).zfill(3)
    if sum(v_) == 100000:
        dtype_= df_all[attrib_].dtype
        len_ = len(v_)
        
        if dtype_ == 'int64':
            if len_ < 2:
                continue
            elif 2 <= len_:
                df_v0[s_] = df_all[attrib_].astype('float32')

        elif dtype_ == 'float64':
            if len_ < 2:
                continue
            elif 2 <= len_:
                df_v0[s_] = df_all[attrib_].astype('float32')
        
        elif dtype_ == 'object':
            if len_ < 2:
                continue
            elif len_ == 2:
                df_v0[s_] = df_all[attrib_].astype('category').cat.codes
            elif 2 < len_ < 1000:
                df_v0 = pd.concat([df_v0, pd.get_dummies(df_all[attrib_].astype('category').cat.codes, prefix=s_)], axis=1)
            else:
                continue


In [5]:
# df_v0.head().T
df_v0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99995
Columns: 2715 entries, target to v_507
dtypes: float32(299), float64(1), int8(25), uint8(2390)
memory usage: 345.9 MB


In [6]:
from sklearn.model_selection import train_test_split


X_v0, y_v0 = df_v0.drop(target_, axis=1), df_v0[target_]
# X_v0_tr, X_v0_te, y_v0_tr = X_v0[:len_train], X_v0[len_train:], y_v0[:len_train]
X_train, X_test, y_train = X_v0[:len_train], X_v0[len_train:], y_v0[:len_train]

# X_v0_tr, X_v0_va, y_v0_tr, y_v0_va = train_test_split(X_v0_tr, y_v0_tr, test_size=.3)
# X_v0_tr.shape, X_v0_te.shape, y_v0_tr.shape

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

import time


def objective_(trial_):
    lst_param = {
        "random_state":trial_.suggest_categorical("random_state", [1]),
        'learning_rate' : trial_.suggest_loguniform('learning_rate', 0.0001, 0.3),
        'bagging_temperature' :trial_.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        "n_estimators": 1000,
        "max_depth":trial_.suggest_int("max_depth", 2, 10),
        'random_strength' :trial_.suggest_int('random_strength', 0, 100),
        "l2_leaf_reg":trial_.suggest_float("l2_leaf_reg",1e-8,3e-5),
        "min_child_samples": trial_.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial_.suggest_int("max_bin", 50, 1000),
        'od_type': trial_.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        # 'task_type': trial_.suggest_categorical('task_type', ['GPU']),
        # 'loss_function': trial_.suggest_categorical('loss_function', ['Accuracy']),
        'eval_metric': trial_.suggest_categorical('eval_metric', ['AUC'])
    }
    
    X_tr_t, X_va_t, y_tr_t, y_va_t = train_test_split(X_train, y_train, test_size=.3, random_state=1)
    
    model_ = CatBoostRegressor(**lst_param)
    model_.fit(
        X_tr_t,
        y_tr_t,
        eval_set=[(X_va_t, y_va_t)],
        early_stopping_rounds=15,
        verbose=0
    )
    
    y_tr_p = model_.predict(X_tr_t)
    y_va_p = model_.predict(X_va_t)
    
    fpr, tpr, thresholds = roc_curve(y_tr_t, y_tr_p)
    score_tr = auc(fpr, tpr)
    
    fpr, tpr, thresholds = roc_curve(y_va_t, y_va_p)
    score_va = auc(fpr, tpr)
    
    # score_tr = auc(y_tr_t, y_tr_p)
    # score_va = auc(y_va_t, y_va_p)
    print(score_tr, score_va)
    
    return score_va if score_va else 0

In [8]:
import optuna
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

TRIALS = 500
TIMEOUT = 3600 * 6

sampler_ = TPESampler(seed=1)

study_ = optuna.create_study(
    direction='maximize',
    sampler=sampler_,
)

study_.optimize(
    objective_, 
    n_trials=TRIALS, 
    timeout=TIMEOUT, 
    n_jobs=1, 
    callbacks=None,
    show_progress_bar=True,
    )

print(study_.best_value)
print(study_.best_trial.params)

[32m[I 2022-03-28 04:28:32,192][0m A new study created in memory with name: no-name-6345aab4-ecd5-45d0-9c22-845f6effb047[0m


  0%|          | 0/500 [00:00<?, ?it/s]

0.7462053013043867 0.7440637564471527
[32m[I 2022-03-28 04:28:34,852][0m Trial 0 finished with value: 0.7440637564471527 and parameters: {'random_state': 1, 'learning_rate': 0.0028186170601294433, 'bagging_temperature': 7.608481233714797, 'max_depth': 2, 'random_strength': 30, 'l2_leaf_reg': 4.41120916560522e-06, 'min_child_samples': 13, 'max_bin': 227, 'od_type': 'Iter', 'eval_metric': 'AUC'}. Best is trial 0 with value: 0.7440637564471527.[0m
0.7575220522073746 0.7522007483016081
[32m[I 2022-03-28 04:28:44,020][0m Trial 1 finished with value: 0.7522007483016081 and parameters: {'random_state': 1, 'learning_rate': 0.007473621824976351, 'bagging_temperature': 0.47509237210306143, 'max_depth': 8, 'random_strength': 20, 'l2_leaf_reg': 2.6344741917364455e-05, 'min_child_samples': 7, 'max_bin': 687, 'od_type': 'Iter', 'eval_metric': 'AUC'}. Best is trial 1 with value: 0.7522007483016081.[0m
0.6826544013477811 0.6758956367867994
[32m[I 2022-03-28 04:28:54,238][0m Trial 2 finished wi

In [9]:
from catboost import CatBoostClassifier

best_params = study_.best_params

X_tr_t, X_va_t, y_tr_t, y_va_t = train_test_split(X_train, y_train, test_size=0.3, random_state=1)
model_tmp = CatBoostClassifier(**best_params, n_estimators=30000, verbose=1000)
model_tmp.fit(X_tr_t, y_tr_t, eval_set=[(X_va_t, y_va_t)], early_stopping_rounds=100)

0:	test: 0.5696956	best: 0.5696956 (0)	total: 70.2ms	remaining: 35m 6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7715292128
bestIteration = 517

Shrink model to first 518 iterations.


<catboost.core.CatBoostClassifier at 0x1af19d71a00>

In [10]:
model_final = CatBoostClassifier(**best_params, 
                                 n_estimators=model_tmp.get_best_iteration(), 
                                 verbose=1000)
model_final.fit(X_train, y_train)
y_pred = model_final.predict(X_test)

0:	total: 81.8ms	remaining: 42.2s
516:	total: 39.1s	remaining: 0us


In [14]:
p_v0_te = model_final.predict(X_test)

from collections import Counter
Counter(p_v0_te)

Counter({0.0: 23907, 1.0: 6093})

In [15]:
df_sub = pd.read_csv(DIR_DATASET + NAME_PROJECT + '/sample_submission.csv', index_col='id')
df_sub[target_] = p_v0_te.astype(int)

In [16]:
import datetime
now = datetime.datetime.now()
str_datetime = now.strftime("%y%m%d_%H%M%S")
df_sub.to_csv(DIR_DATASET + 'submission-' + NAME_PROJECT + '-'+str_datetime+'.csv', index = 1)

In [95]:
df_train[target_].value_counts()

0    49127
1    20873
Name: target, dtype: int64

In [30]:
nda_ = np.asarray(lst_)

array([    2,     2,     6,     6,    12,     4,   104, 18432,   298,
        5970,  3313,  3325,     2,    26,     2,     2,     2,     2,
           2,     2,     2,     2,     2,     2,   298,     7,    33,
        9085,     3,    13,     2,     4,     4,    12,   359,    14,
          21,  2521,    10,     2,     3,     3,     2,    15,     4,
         153,    87,     2,     2,     6,     2,    83,    75,    30,
          17,     6,     3,     2,     2,   138,   112,    37,    21,
           9,     6,     5,     3,   138,   117,    40,    22,    10,
           7,     6,     2,     2,     2,     2])

In [34]:
lst_sub = []

tot_= 0
for idx_, num_ in enumerate(sorted(lst_)):
    tot_ += num_
    print(idx_, tot_)

0 2
1 4
2 6
3 8
4 10
5 12
6 14
7 16
8 18
9 20
10 22
11 24
12 26
13 28
14 30
15 32
16 34
17 36
18 38
19 40
20 42
21 44
22 46
23 48
24 50
25 53
26 56
27 59
28 62
29 65
30 69
31 73
32 77
33 81
34 86
35 92
36 98
37 104
38 110
39 116
40 122
41 129
42 136
43 145
44 155
45 165
46 177
47 189
48 202
49 216
50 231
51 248
52 269
53 290
54 312
55 338
56 368
57 401
58 438
59 478
60 553
61 636
62 723
63 827
64 939
65 1056
66 1194
67 1332
68 1485
69 1783
70 2081
71 2440
72 4961
73 8274
74 11599
75 17569
76 26654
77 45086


In [70]:
sum(nda_[nda_<1000])

2440

In [48]:
df_all['feature_229'].value_counts()

0        97345
6        36   
4        31   
5        30   
12       26   
3        26   
13       23   
11       21   
8        18   
10       18   
7        17   
16       16   
17       15   
43       15   
15       15   
9        14   
44       14   
20       14   
35       12   
34       12   
25       12   
23       11   
30       11   
22       10   
63       10   
27       10   
37       10   
21       10   
31       9    
32       9    
67       8    
57       8    
61       8    
90       8    
41       8    
71       8    
26       8    
29       8    
48       8    
85       8    
46       8    
109      7    
65       7    
73       7    
40       7    
28       7    
1        7    
50       7    
14       7    
36       7    
18       7    
24       7    
45       7    
51       7    
49       6    
103      6    
19       6    
58       6    
56       5    
42       5    
74       5    
47       5    
258      5    
222      5    
2        5    
92       5    
39       5

In [67]:
for attrib_ in df_train.columns[1:]:
    v_tr = df_train[attrib_].value_counts()
    v_te = df_test[attrib_].value_counts()
    dtype_ = df_all[attrib_].dtype
    if dtype_ == 'object':
        print("{:>12} {:>9}: {:>7d} {:>7d}, {:>7d} {:>7d}".format(attrib_, str(dtype_), sum(v_tr), len(v_tr), sum(v_te), len(v_te)))
        # if  len(v_tr) == 2 :
        #     print(v_tr)


   feature_0    object:   70000       2,   30000       2
   feature_1    object:   70000       2,   30000       2
   feature_2    object:   70000       6,   30000       6
   feature_3    object:   70000       6,   30000       6
   feature_4    object:   70000      12,   30000      12
  feature_16    object:   70000       4,   30000       4
  feature_17    object:   70000     103,   30000     103
  feature_18    object:   70000   15858,   30000   10212
  feature_19    object:   70000     260,   30000     200
  feature_20    object:   70000    5841,   30000    5205
  feature_21    object:   70000    2327,   30000     990
  feature_22    object:   70000    2339,   30000     991
  feature_23    object:   70000       2,   30000       2
  feature_24    object:   70000      25,   30000      25
  feature_27    object:   70000       2,   30000       2
  feature_28    object:   70000       2,   30000       2
  feature_29    object:   70000       2,   30000       2
  feature_30    object:   70000

In [None]:
from pycaret.classification import *

df_v0_tr = df_v0[:len_train]
df_v0_te = df_v0[len_train:]

reg = setup(
    data=df_v0_tr,
    target=target_,
    session_id=999,
    data_split_shuffle=True,
    create_clusters=False,
    use_gpu=True,
    silent=False,
    fold=4,
    n_jobs=-1
)

IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
feature_0,Categorical
feature_1,Categorical
feature_2,Categorical
feature_3,Categorical
feature_4,Categorical
...,...
feature_501,Categorical
feature_503,Categorical
feature_504,Categorical
feature_506,Categorical


 
