In [2]:
import os

# Move the working directory two levels up; the cells below read `data/...`
# with paths relative to that location.
# The starting directory is remembered the first time this cell runs so that
# re-running the cell does not climb two more levels each time (a bare
# relative chdir is not idempotent).
if '_NOTEBOOK_DIR' not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, '../../'))

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from scr.util import *

In [15]:
# Directory containing the under-sampled training CSV files.
train_dir = 'data/sampling/under_sampling/mean_gb'

# os.listdir returns entries in arbitrary order, so sort them to make
# df_trains[i] refer to the same file on every run and machine. Entries that
# are not CSV files (e.g. checkpoint folders, OS metadata files) are skipped
# because pd.read_csv cannot load them.
files = sorted(
    name for name in os.listdir(train_dir) if name.lower().endswith('.csv')
)

# Load every training file into its own DataFrame, in sorted file-name order.
df_trains = []
for file in files:
    file_path = os.path.join(train_dir, file)
    df = pd.read_csv(file_path)
    df_trains.append(df)

In [19]:
# Work on a copy of the first loaded training frame so that the column
# assignments made on df_train further down cannot modify the frame stored in
# df_trains, and re-running this cell always starts from the data as loaded.
df_train = df_trains[0].copy()
df_test = pd.read_csv('data/feature_engineered/null_representative/test_null_mean.csv')

# Positional slice of the columns that are label-encoded later on.
# NOTE(review): 44:110 depends on the column order of the training CSV --
# confirm it still selects the intended columns if the feature set changes.
combination_columns = df_train.columns[44:110]

In [20]:
# There are many features, so list the ones to drop from the training frame.
drop = ['EconomicSegment']
drop += [
    'ContractRate_FM',
    'ContractRate_G1', 'ContractRate_G2', 'ContractRate_G3',
    'ContractRate_G4', 'ContractRate_G5', 'ContractRate_G6',
]

# List used to specify the categorical variables for CatBoost.
category_columns = [
    'TypeofContact',
    'CityTier',
    'Occupation',
    'Gender',
    'NumberOfPersonVisiting',
    'NumberOfFollowups',
    'ProductPitched',
    'PreferredPropertyStar',
    'Passport',
    'PitchSatisfactionScore',
    'Designation',
    'Marry',
    'Car',
    'Child',
    'EconomicSegment',
    'IsFamily',
    'FreaqencySeg',
    'MonetarySeg',
    'ContractRate_FM',
    'ContractRate_G1',
    'ContractRate_G2',
    'ContractRate_G3',
    'ContractRate_G4',
    'ContractRate_G5',
    'ContractRate_G6',
]

##----------------------------------------------------------------------------------
# Run the project's column-mapping helper on the training frame first, then on
# the test frame.
df_train, df_test = map(mapping_columns_if_exist, (df_train, df_test))

def handle_unknown_label(train_series, test_series):
    """Integer-encode a column using the labels seen in the training data.

    Each distinct value of ``train_series`` is assigned a code equal to its
    position in order of first appearance. ``test_series`` is encoded with
    the same mapping; a test value that is not a key of that mapping is
    encoded as -1.

    Args:
        train_series: pandas Series that defines the label -> code mapping.
        test_series: pandas Series to encode with the training mapping.

    Returns:
        A tuple ``(train_encoded, test_encoded)`` of encoded Series.
    """
    seen_labels = train_series.unique()
    label_map = dict(zip(seen_labels, range(len(seen_labels))))

    def encode_or_unknown(value):
        # Labels never observed in training share the sentinel code -1.
        return label_map.get(value, -1)

    return train_series.map(label_map), test_series.map(encode_or_unknown)

# Apply the custom label encoding to each combination column; the mapping is
# built from the training column and reused for the test column.
for col in combination_columns:
    df_train[col], df_test[col] = handle_unknown_label(df_train[col], df_test[col])
##-----------------------------------------------------------------------------------

# X still contains the target column 'ProdTaken' on purpose: the
# cross-validation cell recomputes the target encoding from it inside each
# fold and drops it right before fitting.
X = df_train.drop(columns=drop)
y = df_train['ProdTaken']

# The test set keeps exactly the training features, minus the target.
test_feature = X.columns.drop('ProdTaken')
df_test = df_test[test_feature]

# Target-encode (AgeGroup, ProductPitched) for the test set using the mean of
# 'ProdTaken' over all training rows of each group.
tmp = X.groupby(by=['AgeGroup', 'ProductPitched'], as_index=False)['ProdTaken'].mean()
tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G4'})
df_test = df_test.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')

# A left merge leaves NaN for any (AgeGroup, ProductPitched) pair that does
# not occur in the training data; fall back to the overall training rate so
# the feature is always defined at prediction time.
df_test['ContractRate_G4'] = df_test['ContractRate_G4'].fillna(X['ProdTaken'].mean())

In [21]:
# Fitted model and validation AUC for each fold, in fold order.
models = []
scores = []

params = {
    'n_estimators': 1000,  # number of trees
    'max_depth': 6,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'bootstrap': True,
    'random_state': 42
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for tr_idx, va_idx in skf.split(X, y):
    tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
    tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]

    # Redo the target encoding inside each fold, using only this fold's
    # training rows, so the validation rows' targets never enter the feature.
    tmp = tr_x.groupby(by=['AgeGroup', 'ProductPitched'], as_index=False)['ProdTaken'].mean()
    tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G4'})
    # Overall positive rate of the training fold, used as the fallback value.
    prior = tr_x['ProdTaken'].mean()
    tr_x = tr_x.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')
    va_x = va_x.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')

    # A validation row whose (AgeGroup, ProductPitched) pair is absent from
    # the training fold gets NaN from the left merge; replace it with the
    # fold prior so the feature is always defined.
    tr_x['ContractRate_G4'] = tr_x['ContractRate_G4'].fillna(prior)
    va_x['ContractRate_G4'] = va_x['ContractRate_G4'].fillna(prior)

    # The target was only kept in X to compute the encoding above.
    tr_x = tr_x.drop(columns='ProdTaken')
    va_x = va_x.drop(columns='ProdTaken')

    model = RandomForestClassifier(**params)
    model.fit(tr_x, tr_y)

    # Probability of the positive class for the validation rows.
    pred = model.predict_proba(va_x)[:, 1]
    models.append(model)
    score = roc_auc_score(va_y, pred)
    scores.append(score)

print('AUC : ', scores)

AUC :  [0.8600000000000001, 0.8278151260504202, 0.7861952861952862, 0.8254208754208754, 0.7740740740740741]
