# Make ctgr catboost

【内容】
- data_cleaningで精緻化したデータを使用
- 数値データをカテゴリ化して検証
- 学習はCatBoostを使用

In [1]:
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.model_selection import train_test_split, StratifiedKFold
import sklearn.metrics

## Setting

In [2]:
# Relative locations of the preprocessed inputs, the sample submission file,
# and the stacking outputs written at the end of this notebook.
path_train = "../../data/input/preprocess_train.csv"
path_test = "../../data/input/preprocess_test.csv"
path_sample_submit = "../../data/input/sample_submit.csv"
# Plain literals: nothing is interpolated into these paths, so no f-string is needed.
path_output_train = "../../data/stacking/catboost_stacking_train.csv"
path_output_test = "../../data/stacking/catboost_stacking_test.csv"

## Data read

In [3]:
# Read the preprocessed train and test CSV files into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))

In [4]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,...,Designation,MonthlyIncome,ProdTaken,customer_marriage,customer_car,customer_child,age_cls,DurationOfPitch_cls,NumberOfFollowups_cls,MonthlyIncome_cls
0,0,50.0,Self Enquiry,2,900.0,Large Business,Male,1.0,4.0,basic,...,executive,253905.0,1,未婚,自動車未所有,子供なし,46~50歳,801~900s,4回,25~30万円
1,1,56.0,Company Invited,1,840.0,Salaried,Male,1.0,4.0,standard,...,senior manager,404475.0,0,離婚済み,自動車所有,子供なし,56歳以上,801~900s,4回,40万円以上
2,2,,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,basic,...,executive,278145.0,1,結婚済み,自動車未所有,子供なし,不明,501~600s,3回,25~30万円
3,3,37.0,Self Enquiry,2,1080.0,Small Business,Female,1.0,3.0,standard,...,senior manager,326805.0,0,離婚済み,自動車所有,子供なし,36~40歳,1001~1100s,3回,30~35万円
4,4,48.0,Company Invited,3,1020.0,Small Business,Female,1.0,3.0,basic,...,executive,258435.0,1,独身,自動車所有,子供なし,46~50歳,1001~1100s,3回,25~30万円


## データ確認

In [5]:
# Print the training frame's summary: column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3489 non-null   int64  
 1   Age                     3389 non-null   float64
 2   TypeofContact           3483 non-null   object 
 3   CityTier                3489 non-null   int64  
 4   DurationOfPitch         3368 non-null   float64
 5   Occupation              3489 non-null   object 
 6   Gender                  3489 non-null   object 
 7   NumberOfPersonVisiting  3489 non-null   float64
 8   NumberOfFollowups       3456 non-null   float64
 9   ProductPitched          3489 non-null   object 
 10  PreferredPropertyStar   3489 non-null   float64
 11  NumberOfTrips           3467 non-null   float64
 12  Passport                3489 non-null   int64  
 13  PitchSatisfactionScore  3489 non-null   int64  
 14  Designation             3489 non-null   

In [6]:
# Print the same summary for the test frame, to compare its columns and missing values with train.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3489 non-null   int64  
 1   Age                     3396 non-null   float64
 2   TypeofContact           3477 non-null   object 
 3   CityTier                3489 non-null   int64  
 4   DurationOfPitch         3358 non-null   float64
 5   Occupation              3489 non-null   object 
 6   Gender                  3489 non-null   object 
 7   NumberOfPersonVisiting  3489 non-null   float64
 8   NumberOfFollowups       3465 non-null   float64
 9   ProductPitched          3489 non-null   object 
 10  PreferredPropertyStar   3489 non-null   float64
 11  NumberOfTrips           3447 non-null   float64
 12  Passport                3489 non-null   int64  
 13  PitchSatisfactionScore  3489 non-null   int64  
 14  Designation             3489 non-null   

In [7]:
# For every training column, print the number of rows per distinct value,
# followed by an empty line that separates one column's listing from the next.
for feature in train_df.columns:
    rows_per_value = train_df.groupby(feature).size()
    print(rows_per_value, "", sep="\n")

id
0       1
1       1
2       1
3       1
4       1
       ..
3484    1
3485    1
3486    1
3487    1
3488    1
Length: 3489, dtype: int64

Age
18.0      6
19.0     12
20.0     28
21.0     34
22.0     43
23.0     38
24.0     38
25.0     99
26.0     67
27.0     56
28.0     74
29.0     74
30.0     87
31.0    109
32.0    108
33.0    127
34.0    112
35.0    232
36.0    134
37.0    118
38.0    107
39.0    105
40.0     95
41.0     94
42.0     96
43.0     91
44.0     62
45.0    193
46.0     75
47.0     69
48.0     68
49.0     68
50.0     76
51.0     83
52.0     83
53.0     71
54.0     61
55.0    128
56.0     49
57.0     36
58.0     33
59.0     25
60.0     16
61.0      9
dtype: int64

TypeofContact
Company Invited    1289
Self Enquiry       2194
dtype: int64

CityTier
1    1475
2    1596
3     418
dtype: int64

DurationOfPitch
240.0       2
300.0      18
360.0      80
420.0     239
480.0     338
540.0     327
600.0     231
660.0     160
720.0     158
780.0     203
840.0     266
900.0     284


## カテゴリ化した数値カラム削除

In [8]:
# Raw numeric columns that also exist in binned form
# (age_cls, DurationOfPitch_cls, NumberOfFollowups_cls, MonthlyIncome_cls);
# only the binned versions are kept as model inputs.
binned_source_columns = ["Age", "DurationOfPitch", "NumberOfFollowups", "MonthlyIncome"]
train_df = train_df.drop(columns=binned_source_columns)
test_df = test_df.drop(columns=binned_source_columns)

## 欠損値補完 (最頻値補完)

In [9]:
# Re-check which of the remaining columns still contain missing values before imputing.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3489 non-null   int64  
 1   TypeofContact           3483 non-null   object 
 2   CityTier                3489 non-null   int64  
 3   Occupation              3489 non-null   object 
 4   Gender                  3489 non-null   object 
 5   NumberOfPersonVisiting  3489 non-null   float64
 6   ProductPitched          3489 non-null   object 
 7   PreferredPropertyStar   3489 non-null   float64
 8   NumberOfTrips           3467 non-null   float64
 9   Passport                3489 non-null   int64  
 10  PitchSatisfactionScore  3489 non-null   int64  
 11  Designation             3489 non-null   object 
 12  ProdTaken               3489 non-null   int64  
 13  customer_marriage       3489 non-null   object 
 14  customer_car            3489 non-null   

In [10]:
# Fill missing values in each column with that column's most frequent value
# (the first row of DataFrame.mode()). Each frame is filled from its own modes.
# NOTE(review): test_df is imputed from test-set statistics rather than from the
# training modes - confirm this is intended.
train_df = train_df.fillna(train_df.mode().iloc[0])
test_df = test_df.fillna(test_df.mode().iloc[0])

In [11]:
# Count the remaining missing values per training column (all zeros expected after imputation).
train_df.isna().sum()

id                        0
TypeofContact             0
CityTier                  0
Occupation                0
Gender                    0
NumberOfPersonVisiting    0
ProductPitched            0
PreferredPropertyStar     0
NumberOfTrips             0
Passport                  0
PitchSatisfactionScore    0
Designation               0
ProdTaken                 0
customer_marriage         0
customer_car              0
customer_child            0
age_cls                   0
DurationOfPitch_cls       0
NumberOfFollowups_cls     0
MonthlyIncome_cls         0
dtype: int64

In [12]:
# Count the remaining missing values per test column (all zeros expected after imputation).
test_df.isna().sum()

id                        0
TypeofContact             0
CityTier                  0
Occupation                0
Gender                    0
NumberOfPersonVisiting    0
ProductPitched            0
PreferredPropertyStar     0
NumberOfTrips             0
Passport                  0
PitchSatisfactionScore    0
Designation               0
customer_marriage         0
customer_car              0
customer_child            0
age_cls                   0
DurationOfPitch_cls       0
NumberOfFollowups_cls     0
MonthlyIncome_cls         0
dtype: int64

## データ分割

In [13]:
# Feature matrix: everything except the row identifier and the target.
X = train_df.drop(columns=["id", "ProdTaken"])
# Target: the ProdTaken label column.
y = train_df.loc[:, "ProdTaken"]

In [14]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

## 学習

### パラメータ最適化

In [15]:
# Names of the object-dtype feature columns; these are passed to CatBoost as cat_features.
categorical_features = list(X.select_dtypes(include="object").columns)

In [16]:
def objective(trial):
    """Optuna objective: mean validation AUC of CatBoost over a stratified 7-fold CV.

    Reads the notebook-level ``X``, ``y`` and ``categorical_features``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the CatBoost hyperparameters are sampled.

    Returns
    -------
    float
        Average, over the 7 folds, of each fold's best AUC on its validation
        split as reported by ``get_best_score()``.
    """
    # Hyperparameter search space.
    params = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 4, 10),
        # suggest_float(..., log=True) is the non-deprecated form of suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'eval_metric': 'AUC',
    }

    # Fixed random_state so every trial is scored on the same fold assignment.
    cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=32)
    auc_score = []

    for train_idx, val_idx in cv.split(X, y):
        # cv.split yields positional indices, so index positionally on both X and y.
        X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]

        cb_model = CatBoostClassifier(**params)
        # The validation fold is also the eval_set used for best-iteration selection.
        cb_model.fit(
            X_train, y_train,
            cat_features=categorical_features,
            eval_set=(X_val, y_val),
            use_best_model=True,
            verbose=False,
        )

        # Best validation AUC reached on this fold.
        auc_score.append(cb_model.get_best_score()["validation"]["AUC"])

    return sum(auc_score) / len(auc_score)

In [None]:
# Maximise the mean CV AUC returned by `objective` over 100 trials.
# The sampler is seeded so the hyperparameter search can be reproduced
# after a kernel restart instead of drawing different trials on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=32),
)
study.optimize(objective, n_trials=100)

In [18]:
# Show the best trial record and, on the next line, its hyperparameters.
print(study.best_trial, study.best_params, sep="\n")

FrozenTrial(number=52, state=TrialState.COMPLETE, values=[0.844296920307989], datetime_start=datetime.datetime(2024, 9, 1, 13, 26, 36, 964594), datetime_complete=datetime.datetime(2024, 9, 1, 13, 27, 3, 699493), params={'iterations': 171, 'depth': 6, 'learning_rate': 0.29397858515878483, 'random_strength': 68, 'bagging_temperature': 1.809748091233205, 'od_type': 'IncToDec', 'od_wait': 30}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'iterations': IntDistribution(high=300, log=False, low=50, step=1), 'depth': IntDistribution(high=10, log=False, low=4, step=1), 'learning_rate': FloatDistribution(high=0.3, log=True, low=0.01, step=None), 'random_strength': IntDistribution(high=100, log=False, low=0, step=1), 'bagging_temperature': FloatDistribution(high=100.0, log=True, low=0.01, step=None), 'od_type': CategoricalDistribution(choices=('IncToDec', 'Iter')), 'od_wait': IntDistribution(high=50, log=False, low=10, step=1)}, trial_id=52, value=None)
{'iterations': 17

In [19]:
# Keep the best trial's hyperparameters for the final cross-validated fit.
best_params = study.best_params

In [20]:
# eval_metric was a fixed setting rather than a sampled one, so it is not in
# study.best_params and has to be added back before refitting.
best_params.update(eval_metric='AUC')

In [21]:
# Display the full parameter set that the final models are built with.
best_params

{'iterations': 171,
 'depth': 6,
 'learning_rate': 0.29397858515878483,
 'random_strength': 68,
 'bagging_temperature': 1.809748091233205,
 'od_type': 'IncToDec',
 'od_wait': 30,
 'eval_metric': 'AUC'}

### KFold

In [22]:
# Refit CatBoost with the tuned parameters on each of the 7 stratified folds and collect:
#   * auc_score     - best validation AUC of every fold,
#   * pred_score    - positive-class probabilities on the test set, one array per fold,
#   * train_pred_df - out-of-fold positive-class probabilities for every training row.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=32)
auc_score = []
pred_score = []
fold_frames = []  # one out-of-fold frame per fold, concatenated once after the loop
pred_test_df = test_df.drop(["id"], axis=1)

for train_idx, val_idx in cv.split(X, y):
    # cv.split yields positional indices, so index positionally on both X and y.
    X_train = X.iloc[train_idx, :]
    y_train = y.iloc[train_idx]
    X_val = X.iloc[val_idx, :]
    y_val = y.iloc[val_idx]

    cb_model = CatBoostClassifier(**best_params)

    cb_model.fit(
        X_train, y_train,
        cat_features=categorical_features,
        eval_set=(X_val, y_val),
        use_best_model=True,
        verbose=True,
    )

    # Best validation AUC of this fold.
    auc_score.append(cb_model.get_best_score()["validation"]["AUC"])
    # Positive-class probability for every test row from this fold's model.
    pred_score.append(cb_model.predict_proba(pred_test_df)[:, 1])
    # Out-of-fold predictions, keyed by the original row index of the validation rows.
    pred_valid = cb_model.predict_proba(X_val)[:, 1]
    fold_frames.append(
        pd.DataFrame(
            {
                "catboost_Score": pred_valid,
                "true": y_val,
            },
            index=X_val.index,
        )
    )

# Single concatenation, then restore the original row order.
train_pred_df = pd.concat(fold_frames, axis=0).sort_index()

0:	test: 0.5999901	best: 0.5999901 (0)	total: 12.4ms	remaining: 2.1s
1:	test: 0.5999901	best: 0.5999901 (0)	total: 17.3ms	remaining: 1.46s
2:	test: 0.6179742	best: 0.6179742 (2)	total: 25.6ms	remaining: 1.43s
3:	test: 0.6193563	best: 0.6193563 (3)	total: 37ms	remaining: 1.54s
4:	test: 0.6193563	best: 0.6193563 (3)	total: 48.2ms	remaining: 1.6s
5:	test: 0.6193563	best: 0.6193563 (3)	total: 59.2ms	remaining: 1.63s
6:	test: 0.6193563	best: 0.6193563 (3)	total: 66.8ms	remaining: 1.56s
7:	test: 0.7816737	best: 0.7816737 (7)	total: 78.2ms	remaining: 1.59s
8:	test: 0.7816737	best: 0.7816737 (7)	total: 83.1ms	remaining: 1.5s
9:	test: 0.7816737	best: 0.7816737 (7)	total: 87.7ms	remaining: 1.41s
10:	test: 0.7816737	best: 0.7816737 (7)	total: 92ms	remaining: 1.34s
11:	test: 0.7735948	best: 0.7816737 (7)	total: 103ms	remaining: 1.37s
12:	test: 0.7712255	best: 0.7816737 (7)	total: 115ms	remaining: 1.39s
13:	test: 0.7620113	best: 0.7816737 (7)	total: 127ms	remaining: 1.42s
14:	test: 0.7861491	best: 

In [23]:
# Best validation AUC of each of the 7 folds.
auc_score

[0.8587929445833882,
 0.8544820323812031,
 0.8240752928787679,
 0.8407494145199064,
 0.849622324108586,
 0.8452023617112512,
 0.8371540719728205]

In [24]:
# Mean validation AUC across the folds.
sum(auc_score)/len(auc_score)

0.844296920307989

## CVモデルの結果の平均値で出力

In [25]:
# Average the per-fold test probabilities row by row (axis 0 runs over the folds).
average_pred = np.asarray(pred_score).mean(axis=0)

In [26]:
# Show the per-fold test predictions and, on the next line, their average.
print(pred_score, average_pred, sep="\n")

[array([0.10459695, 0.17743113, 0.27470821, ..., 0.683403  , 0.24457109,
       0.01461685]), array([0.08796416, 0.14000424, 0.28080759, ..., 0.72818812, 0.30797898,
       0.00443203]), array([0.09913188, 0.20622362, 0.28943189, ..., 0.57686552, 0.27126574,
       0.0024867 ]), array([0.05253394, 0.52175776, 0.20843719, ..., 0.72949162, 0.33342381,
       0.00526467]), array([0.07860216, 0.2478917 , 0.23527592, ..., 0.64622942, 0.23815092,
       0.02001418]), array([0.10784412, 0.31400049, 0.2084629 , ..., 0.73046858, 0.23289193,
       0.02230413]), array([0.10814342, 0.25016057, 0.1453782 , ..., 0.77598913, 0.24995842,
       0.016364  ])]
[0.09125952 0.26535279 0.23464313 ... 0.69580506 0.26832013 0.0122118 ]


In [27]:
# The sample submission has no header row, so its columns are labelled 0 and 1.
submit_df = pd.read_csv(path_sample_submit, header=None)
# Preview its first five rows.
submit_df.head(n=5)

Unnamed: 0,0,1
0,3489,0.119451
1,3490,0.343575
2,3491,0.119451
3,3492,0.119451
4,3493,0.119451


In [28]:
# Overwrite the sample scores in column 1 with the fold-averaged predictions.
# NOTE(review): assumes the sample submission rows are in the same order as test_df - confirm.
submit_df[1] = average_pred

In [29]:
# Check the first five rows after the predictions were written in.
submit_df.head(n=5)

Unnamed: 0,0,1
0,3489,0.09126
1,3490,0.265353
2,3491,0.234643
3,3492,0.383299
4,3493,0.28554


In [30]:
# Write the test predictions without index or header row, mirroring the
# header-less layout the sample submission was read with.
# header=False is the documented way to omit the header (None only worked by being falsy).
submit_df.to_csv(path_output_test, index=False, header=False)

## Trainの結果出力


In [31]:
# Out-of-fold predictions next to the true labels, in original row order.
train_pred_df

Unnamed: 0,catboost_Score,true
0,0.695894,1
1,0.221657,0
2,0.250077,1
3,0.069968,0
4,0.483216,1
...,...,...
3484,0.257938,1
3485,0.055181,0
3486,0.054758,0
3487,0.374946,1


In [32]:
# Remove the label column so that only the out-of-fold score remains.
train_pred_df = train_pred_df.drop(["true"], axis=1)

In [33]:
# Confirm that only the catboost_Score column is left.
train_pred_df

Unnamed: 0,catboost_Score
0,0.695894
1,0.221657
2,0.250077
3,0.069968
4,0.483216
...,...
3484,0.257938
3485,0.055181
3486,0.054758
3487,0.374946


In [34]:
# Write the out-of-fold scores with a header row and without the index column.
# index=False is the documented way to omit the index (None only worked by being falsy).
train_pred_df.to_csv(path_output_train, index=False, header=True)