In [1]:
import pandas as pd
import numpy as np
import random
import os

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [3]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Applies ``seed`` to Python's built-in ``random`` module and to NumPy's
    global generator, and exports it as the ``PYTHONHASHSEED`` environment
    variable.

    Note: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so the
    assignment made here is only picked up by processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(42)  # fix the seed

In [4]:
# Candidate columns to exclude from the feature set:
# 'Industry_Status',
# 'Hispanic_Origin',
# 'Household_Status',
# 'Birth_Country',
# 'Birth_Country (Father)',
# 'Birth_Country (Mother)'

In [5]:
# Load the raw train / test tables.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Feature columns removed from BOTH splits. Declaring the list once keeps the
# train and test feature matrices in agreement: previously the same names were
# spelled out twice and could silently drift apart.
# 'Industry_Status' is deliberately not listed (it was commented out of the
# drop lists), so it remains a feature.
DROP_FEATURES = [
    'Hispanic_Origin',
    'Household_Status',
    'Birth_Country',
    'Birth_Country (Father)',
    'Birth_Country (Mother)',
]

# 'ID' is the row identifier and 'Income' is the regression target, so neither
# belongs in the feature matrix.
trainval_x = train.drop(columns=['ID', 'Income', *DROP_FEATURES])
trainval_y = train['Income']

# Only 'ID' is stripped from the test table in addition to the shared list
# (presumably test.csv carries no 'Income' column - verify against the data).
test_x = test.drop(columns=['ID', *DROP_FEATURES])

In [6]:
# Columns with dtype "object" are the ones that get label-encoded.
is_object_col = trainval_x.dtypes == "object"
encoding_target = list(is_object_col[is_object_col].index)

for col in encoding_target:
    # Cast every value of this column to str in both the train and test sets.
    trainval_x[col] = trainval_x[col].astype(str)
    test_x[col] = test_x[col].astype(str)

    # The category -> integer mapping is learned from the training data only.
    le = LabelEncoder()
    trainval_x[col] = le.fit_transform(trainval_x[col])

    # Categories that occur only in the test data are appended to le.classes_
    # so that transform() does not reject them as unseen labels.
    unseen = [case for case in np.unique(test_x[col]) if case not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)

    test_x[col] = le.transform(test_x[col])

In [7]:
# 5-fold cross-validation splitter. `shuffle` is left at its default (False),
# so each fold is a contiguous range of rows in file order.
kf = KFold(n_splits=5)

In [8]:
# Running sums of the per-fold RMSE, averaged in a later cell.
total_train_error = 0
total_val_error = 0
# One test-set prediction vector per fold, averaged in a later cell.
test_preds = []

for num_fold, (train_idx, val_idx) in enumerate(kf.split(trainval_x), start=1):
    # Positional split of features and target for this fold.
    train_x, train_y = trainval_x.iloc[train_idx], trainval_y.iloc[train_idx]
    val_x, val_y = trainval_x.iloc[val_idx], trainval_y.iloc[val_idx]

    # A fresh model is fitted from scratch on every fold.
    model = GradientBoostingRegressor(max_depth=7)
    model.fit(train_x, train_y)

    print(f'{num_fold} fold')

    # RMSE = square root of the mean squared error.
    train_error = mean_squared_error(train_y, model.predict(train_x)) ** 0.5
    val_error = mean_squared_error(val_y, model.predict(val_x)) ** 0.5

    # Keep this fold's test predictions for the ensemble average.
    test_preds.append(model.predict(test_x))

    total_train_error += train_error
    total_val_error += val_error

    print(f'train error : {train_error:.2f}')
    print(f'val error : {val_error:.2f}')

    print('-'*30)

1 fold
train error : 438.96
val error : 662.13
------------------------------
2 fold
train error : 447.28
val error : 622.59
------------------------------
3 fold
train error : 460.00
val error : 616.50
------------------------------
4 fold
train error : 450.56
val error : 576.63
------------------------------
5 fold
train error : 453.76
val error : 590.34
------------------------------


In [9]:
# Average the accumulated per-fold RMSE over the number of folds configured on
# `kf`, rather than a hard-coded 5 that goes stale if n_splits is changed.
n_folds = kf.get_n_splits()
print(f'final train error : {total_train_error / n_folds:.2f}')
print(f'final val error : {total_val_error / n_folds:.2f}')

final train error : 450.11
final val error : 613.64


In [10]:
# Ensemble the folds: element-wise mean of the per-fold test predictions.
final_pred = np.mean(test_preds, axis=0)

In [11]:
# Load the sample submission file to use as the output template.
submission = pd.read_csv('data/sample_submission.csv')
submission

Unnamed: 0,ID,Income
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
...,...,...
9995,TEST_9995,0
9996,TEST_9996,0
9997,TEST_9997,0
9998,TEST_9998,0


In [12]:
# Overwrite the template's 'Income' column with the fold-averaged predictions.
# final_pred is a plain array, so values are assigned by row position.
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# data/test.csv - confirm.
submission['Income'] = final_pred
submission

Unnamed: 0,ID,Income
0,TEST_0000,-6.154207
1,TEST_0001,-8.271913
2,TEST_0002,405.154783
3,TEST_0003,722.505777
4,TEST_0004,1.549360
...,...,...
9995,TEST_9995,860.164394
9996,TEST_9996,807.807321
9997,TEST_9997,367.145699
9998,TEST_9998,3.705860


In [13]:
# Create the output directory if it does not exist yet.
os.makedirs('submission', exist_ok=True)
# Alternative output name (presumably from an earlier baseline run), kept for reference:
# submission.to_csv('submission/baseline_0314.csv', index=False)
# index=False keeps the DataFrame's row index out of the written CSV.
submission.to_csv('submission/feat_engineering_0314.csv', index=False)