In [1]:
import os
os.chdir("../")

In [16]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [3]:
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
import catboost as cb
from sklearn.pipeline import Pipeline

# Data Set

In [9]:
feature = [
    'Age',
    'TypeofContact',
    'CityTier',
    'DurationOfPitch',
    'Occupation',
    'Gender',
    'NumberOfPersonVisiting',
    'NumberOfFollowups',
    'ProductPitched',
    'PreferredPropertyStar',
    'NumberOfTrips',
    'Passport',
    'PitchSatisfactionScore',
    'Designation',
    'MonthlyIncome',
    'Marry',
    'Car',
    'Child'
]

In [29]:
df = pd.read_csv('data/collect/labeled/train_gbdt_labeled.csv')
X_submit = pd.read_csv('data/collect/labeled/test_gbdt_labeled.csv')

X = df[feature]
y = df['ProdTaken']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [44]:
# X_train = pd.read_csv('data/preprocessed/X_train_scaled.csv')
# X_test = pd.read_csv('data/preprocessed/X_test_scaled.csv')
# y_train = pd.read_csv('data/preprocessed/y_train.csv')
# y_test = pd.read_csv('data/preprocessed/y_test.csv')

# X = pd.concat([X_train, X_test], ignore_index=True)
# y = pd.concat([y_train, y_test], ignore_index=True)

# X_submit = pd.read_csv('data/preprocessed/X_submission_scaled.csv')

# Gradient Boosting Decision Tree

## xgboost

In [35]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.1,
    'gamma': 0.0,
    'alpha': 0.0,
    'lambda': 1.0,
    'min_child_weight': 1,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

num_round = 100

evals = [(dtrain, 'train'), (dtest, 'eval')]

model = xgb.train(params, dtrain, num_round, evals=evals, early_stopping_rounds=10)

y_pred = model.predict(dtest)

auc_score = roc_auc_score(y_test, y_pred)
print(f"AUC Score: {auc_score:.4f}")

[0]	train-logloss:0.39944	eval-logloss:0.39880
[1]	train-logloss:0.38384	eval-logloss:0.38478
[2]	train-logloss:0.37057	eval-logloss:0.37385


[3]	train-logloss:0.35844	eval-logloss:0.36391
[4]	train-logloss:0.34863	eval-logloss:0.35704
[5]	train-logloss:0.33959	eval-logloss:0.34971
[6]	train-logloss:0.33175	eval-logloss:0.34364
[7]	train-logloss:0.32456	eval-logloss:0.33831
[8]	train-logloss:0.31776	eval-logloss:0.33353
[9]	train-logloss:0.31146	eval-logloss:0.32926
[10]	train-logloss:0.30589	eval-logloss:0.32495
[11]	train-logloss:0.30107	eval-logloss:0.32250
[12]	train-logloss:0.29689	eval-logloss:0.31937
[13]	train-logloss:0.29258	eval-logloss:0.31693
[14]	train-logloss:0.28900	eval-logloss:0.31593
[15]	train-logloss:0.28546	eval-logloss:0.31477
[16]	train-logloss:0.28231	eval-logloss:0.31373
[17]	train-logloss:0.27909	eval-logloss:0.31273
[18]	train-logloss:0.27556	eval-logloss:0.31217
[19]	train-logloss:0.27245	eval-logloss:0.31186
[20]	train-logloss:0.26954	eval-logloss:0.31134
[21]	train-logloss:0.26661	eval-logloss:0.31058
[22]	train-logloss:0.26414	eval-logloss:0.31003
[23]	train-logloss:0.26204	eval-logloss:0.30929

## xgboost cross validation

In [27]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# パラメータ設定
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.1,
    'gamma': 0.0,
    'alpha': 0.0,
    'lambda': 1.0,
    'min_child_weight': 1,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# 交差検証
cv_results = xgb.cv(dtrain=dtrain, params=params, nfold=5,
                    num_boost_round=50, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=42)

best_rounds = cv_results.shape[0]
final_model = xgb.train(params, dtrain, num_boost_round=best_rounds)

# 結果の表示
y_pred = final_model.predict(dtest)

auc_score = roc_auc_score(y_test, y_pred)
print(f"AUC Score: {auc_score:.4f}")

AUC Score: 0.8264


In [12]:
X_submit = xgb.DMatrix(X_submit)

y_submit = final_model.predict(X_submit)

## lightgbm

In [38]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# パラメータの設定
params = {
    'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
    'objective': 'binary',  # 二値分類問題
    'metric': 'auc',  # 評価指標としてAUCを使用
    'learning_rate': 0.1,
    'num_leaves': 31,  # 葉の数（XGBoostのmax_depthと異なり、より大きい値）
    'min_child_weight': 1,
    'lambda_l1': 0.0,  # L1正則化
    'lambda_l2': 1.0,  # L2正則化
    'subsample': 0.8,  # データのサブサンプリング
    'colsample_bytree': 0.8,  # 特徴量のサブサンプリング
    'seed': 42
}

# モデルの訓練
model = lgb.train(
    params, 
    train_data, 
    num_boost_round=100, 
    valid_sets=[train_data, test_data]
)

# 予測
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# AUCスコアの計算
auc_score = roc_auc_score(y_test, y_pred)
print(f"AUC Score: {auc_score:.4f}")

[LightGBM] [Info] Number of positive: 398, number of negative: 2393
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 410
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142601 -> initscore=-1.793851
[LightGBM] [Info] Start training from score -1.793851
AUC Score: 0.8245


## catboost

In [40]:
N_SPLITS = 5
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
cv_list = list(skf.split(X, y))

In [41]:
def train_catboost(X, y, cv, params: dict = None):
    if params is None:
        params = {}

    models = []
    n_records = len(X)
    oof_pred = np.zeros((n_records, ))
    for i, (tr_idx, va_idx) in enumerate(cv):
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]
        model = cb.CatBoostRegressor(**params)
        model.fit(tr_x, tr_y, eval_set=(va_x, va_y), use_best_model=True, verbose=100)
        oof_pred[va_idx] = model.predict(va_x)
        models.append(model)
    return oof_pred, models

In [42]:
params = {
    'loss_function': 'RMSE',
    'eval_metric': 'AUC',
    'iterations': 50,
    'learning_rate': 0.005,
    'depth': 6,
    'verbose': 200,
    'random_seed': 42,
}

oof, models = train_catboost(X, y, cv_list, params)

0:	test: 0.7626338	best: 0.7626338 (0)	total: 228ms	remaining: 11.2s
49:	test: 0.8049498	best: 0.8073746 (31)	total: 452ms	remaining: 0us

bestTest = 0.8073745819
bestIteration = 31

Shrink model to first 32 iterations.
0:	test: 0.7909030	best: 0.7909030 (0)	total: 6.19ms	remaining: 303ms
49:	test: 0.8343144	best: 0.8344314 (48)	total: 211ms	remaining: 0us

bestTest = 0.8344314381
bestIteration = 48

Shrink model to first 49 iterations.
0:	test: 0.7329556	best: 0.7329556 (0)	total: 4.89ms	remaining: 240ms
49:	test: 0.7784692	best: 0.7787390 (46)	total: 177ms	remaining: 0us

bestTest = 0.7787389757
bestIteration = 46

Shrink model to first 47 iterations.
0:	test: 0.7867321	best: 0.7867321 (0)	total: 4.53ms	remaining: 222ms
49:	test: 0.8207787	best: 0.8228782 (3)	total: 225ms	remaining: 0us

bestTest = 0.8228781977
bestIteration = 3

Shrink model to first 4 iterations.
0:	test: 0.7822455	best: 0.7822455 (0)	total: 6.78ms	remaining: 332ms
49:	test: 0.8119658	best: 0.8127428 (27)	total: 24

In [46]:
def predict_with_average(models, X_submit):
    # 各モデルによる予測値を格納する配列
    preds = np.zeros((len(X_submit), len(models)))
    
    # 各モデルを利用して予測を行う
    for i, model in enumerate(models):
        preds[:, i] = model.predict(X_submit)
    
    # 予測値の平均を求める
    final_preds = preds.mean(axis=1)
    return final_preds

# X_submitに対する予測値を計算
y_submit = predict_with_average(models, X_submit)

# Nural Network

# 提出

In [13]:
index = pd.read_csv('data/test.csv')['id'].values

df_submit = pd.DataFrame({
    "id": index,
    "prediction": y_submit
})

df_submit.head()

Unnamed: 0,id,prediction
0,3489,0.073447
1,3490,0.14893
2,3491,0.36234
3,3492,0.353634
4,3493,0.282913


In [14]:
path = 'submission/submit_7_xgboost_crossval_labeled.csv'

In [15]:
df_submit.to_csv(path, index=False, header=None)