# Install the `interpret` package

In [None]:
!pip install interpret

# Data Loading

In [4]:
import pandas as pd
import numpy as np
from itertools import combinations
from copy import deepcopy
from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor

import os
import warnings
warnings.filterwarnings('ignore')

# Load the train/test splits, using the "ID" column as the row index.
train = pd.read_csv('train.csv', index_col='ID')
test = pd.read_csv('test.csv', index_col='ID')

# Missing-value check: print the total number of NaN cells per split
# (train first, then test).
for frame in (train, test):
    print(frame.isna().sum().sum())

# From here on the target is log(1 + Income).
log_income = np.log1p(train['Income'])
train['Income'] = log_income

0
1


# mean encoding (categorical variables)

In [5]:
# Mean (target) encoding: every level of an object-dtype column is replaced
# by the mean of the target observed for that level in the training data.
obj_columns = train.select_dtypes(include='object').columns
target_columns = 'Income'

encode_dicts = {}
for col in obj_columns:
    level_means = train.groupby(col)[target_columns].mean()
    encode_dicts[col] = level_means.to_dict()
    # The mapping is learned on train only, then applied to both splits.
    train[col] = train[col].map(encode_dicts[col])
    test[col] = test[col].map(encode_dicts[col])

# Any NaN left in test (levels missing from the mapping, or cells that were
# already empty) is replaced by 0.
test = test.fillna(0)

# Binary flag for the classifier: 1 where Income is exactly 0, else 0.
zero_income_mask = train['Income'] == 0
train['Income_Class'] = np.where(zero_income_mask, 1, 0)

# Pairwise interaction features (degree-2 products of distinct columns, no squared terms)

In [6]:
# Pairwise interaction features: the product of every pair of distinct
# feature columns (no squared terms). A product is kept only when its
# Pearson correlation with the target is strong enough.
#
# Base features are selected by name rather than by position, so the target
# and the class flag can never end up inside a product, whatever the column
# order of the input file is.
excluded = {'Income', 'Income_Class'}
col_lst = [name for name in train.columns if name not in excluded]
deg_2 = list(combinations(col_lst, 2))
for tup in deg_2:
    tup_lst = list(tup)
    feat_name = '_'.join(tup_lst)
    new_cols = train[tup_lst].product(axis=1)
    # Explicit copy: adding a column to a bare selection of `train` is
    # chained assignment and triggers pandas' SettingWithCopyWarning.
    temp = train[['Income']].copy()
    temp[feat_name] = new_cols
    # Row 0 is 'Income', the last column is the candidate feature.
    corrs = temp.corr().iloc[0, -1]
    # Filter on |Pearson r|; a NaN correlation (e.g. a constant product)
    # compares False and is therefore dropped.
    if abs(corrs) >= 0.5:
        train[feat_name] = new_cols
        test[feat_name] = test[tup_lst].product(axis=1)

# Model Training

In [7]:
# Stage 1 - classifier for the Income_Class flag (EBM classifier;
# random_state is not passed, so the library default is used).
# The target column itself is removed from the classifier's inputs.
train_classify = train.drop(columns=target_columns).copy(deep=True)
clf_inputs = train_classify.drop(columns='Income_Class')
clf_labels = train_classify['Income_Class']
model = ExplainableBoostingClassifier(validation_size=0.025)
model = model.fit(clf_inputs, clf_labels)
train = train.drop(columns='Income_Class')

# Stage 2 - regressor for the income value (EBM regressor; random_state is
# likewise left at the library default).
# Only samples with a positive income are used to fit the regression model.
has_income = train['Income'] > 0
ov_0_sets = train.loc[has_income, :]
reg_inputs = ov_0_sets.drop(columns='Income')
reg_target = ov_0_sets['Income']
models_ov_0 = ExplainableBoostingRegressor(validation_size=0.025)
models_ov_0 = models_ov_0.fit(reg_inputs, reg_target)

# prediction

In [9]:
# Two-stage prediction: the regressor's income estimate is scaled by the
# classifier's output so that likely zero-income samples are pulled to 0.
df_pred = pd.DataFrame(
    {
        # Column 1 of predict_proba; presumably P(Income_Class == 1), i.e.
        # the probability of zero income - confirm against model.classes_.
        'probs': model.predict_proba(test)[:, 1],
        # Regressor output, on the log1p(Income) scale it was trained on.
        'test': models_ov_0.predict(test),
    },
    index=test.index,
)

# Undo the log transform. expm1 is the exact inverse of the log1p applied to
# the training target and avoids the cancellation in exp(x) - 1 near zero.
df_pred['result'] = np.expm1(df_pred['test'])

# Correct the regression estimate with the classifier result.
df_pred['result'] = df_pred['result'] * (1 - df_pred['probs'])

# Keep only the index column and the final prediction, named 'Income'.
df_pred = (
    df_pred.reset_index()
    .rename(columns={'result': 'Income'})
    .drop(columns=['probs', 'test'])
)
df_pred.to_csv('submission_test_6 (1).csv', encoding='utf-8-sig', index=False)