In [25]:
import pandas as pd
import numpy as np
import random
import os

In [26]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [27]:
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
def seed_everything(seed):
    """Seed every global random source this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Seed value handed to ``random.seed`` and ``np.random.seed``.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [29]:
# 'Industry_Status',
# 'Hispanic_Origin',
# 'Household_Status',
# 'Birth_Country',
# 'Birth_Country (Father)', 
# 'Birth_Country (Mother)'

In [30]:
# Load the labelled training split and the unlabelled test split.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [31]:
# Show the available columns. Note the dataset spells the marital-status
# column 'Martial_Status'; downstream code must use that exact name.
train.columns

Index(['ID', 'Age', 'Gender', 'Education_Status', 'Employment_Status',
       'Working_Week (Yearly)', 'Industry_Status', 'Occupation_Status', 'Race',
       'Hispanic_Origin', 'Martial_Status', 'Household_Status',
       'Household_Summary', 'Citizenship', 'Birth_Country',
       'Birth_Country (Father)', 'Birth_Country (Mother)', 'Tax_Status',
       'Gains', 'Losses', 'Dividends', 'Income_Status', 'Income'],
      dtype='object')

### modify education

In [32]:
def modify_education(education):
    """Collapse a detailed education level into a coarser bucket.

    Args:
        education: A single education-status value.

    Returns:
        ``'dropout'`` for every level up to and including 'High Senior',
        ``'CommunityCollege'`` for 'College' and both associates degrees,
        ``'Masters degree'`` for 'Professional degree' and 'Masters degree',
        and the input unchanged for anything else.
    """
    buckets = (
        (('Children', 'Kindergarten', 'Elementary (1-4)', 'Elementary (5-6)',
          'Middle (7-8)', 'High Freshman', 'High Sophomore',
          'High Junior', 'High Senior'), 'dropout'),
        (('College', 'Associates degree (Academic)',
          'Associates degree (Vocational)'), 'CommunityCollege'),
        (('Professional degree', 'Masters degree'), 'Masters degree'),
    )
    for members, bucket in buckets:
        if education in members:
            return bucket
    # Levels outside the table are kept as they are.
    return education

In [33]:
# Derive the coarse 'Education' feature for the training rows from 'Education_Status'.
train['Education'] = train['Education_Status'].apply(modify_education)

In [34]:
# Derive the same coarse 'Education' feature for the test rows.
test['Education'] = test['Education_Status'].apply(modify_education)

### modify income status

In [35]:
def modify_income_status(status):
    """Merge the 'Unknown' income status into 'Under Median'.

    Args:
        status: A single income-status value.

    Returns:
        ``'Under Median'`` when ``status`` is 'Unknown' or 'Under Median';
        otherwise ``status`` unchanged.
    """
    return 'Under Median' if status in ('Unknown', 'Under Median') else status

In [36]:
# Fold 'Unknown' into 'Under Median' in both splits (train first, then test).
for _split in (train, test):
    _split['Income_Status'] = _split['Income_Status'].apply(modify_income_status)

### main

In [37]:
# Columns removed from the feature matrix of BOTH splits. Keeping them in one
# list guarantees train and test always end up with the same feature set.
# 'Industry_Status', 'Gains', 'Losses' and 'Dividends' were considered for
# removal as well but are currently kept as features.
unused_columns = [
    'ID',
    'Hispanic_Origin',
    'Household_Status',
    'Birth_Country',
    'Birth_Country (Father)', 'Birth_Country (Mother)',
    'Education_Status',
]

# The training frame additionally loses the target column.
trainval_x = train.drop(columns=unused_columns + ['Income'])

# Raw target; a log(Income + 1) transform was tried as an alternative.
trainval_y = train['Income']

test_x = test.drop(columns=unused_columns)

In [38]:
# Sanity check: list the remaining feature columns with their non-null counts and dtypes.
trainval_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Age                    20000 non-null  int64 
 1   Gender                 20000 non-null  object
 2   Employment_Status      20000 non-null  object
 3   Working_Week (Yearly)  20000 non-null  int64 
 4   Industry_Status        20000 non-null  object
 5   Occupation_Status      20000 non-null  object
 6   Race                   20000 non-null  object
 7   Martial_Status         20000 non-null  object
 8   Household_Summary      20000 non-null  object
 9   Citizenship            20000 non-null  object
 10  Tax_Status             20000 non-null  object
 11  Gains                  20000 non-null  int64 
 12  Losses                 20000 non-null  int64 
 13  Dividends              20000 non-null  int64 
 14  Income_Status          20000 non-null  object
 15  Education          

In [39]:
# Integer-encode every object-typed (categorical) feature column.
encoding_target = list(trainval_x.dtypes[trainval_x.dtypes == "object"].index)

for col in encoding_target:
    # Cast the column to str in both splits so every value is comparable.
    trainval_x[col] = trainval_x[col].astype(str)
    test_x[col] = test_x[col].astype(str)

    # Codes are learned from the training split only.
    le = LabelEncoder()
    le.fit(trainval_x[col])

    # Explicit label -> code table. The fitted encoder's `classes_` is left
    # untouched, so nothing here depends on how the encoder handles labels
    # appended after fitting.
    code_of = {label: code for code, label in enumerate(le.classes_)}

    # Categories that only occur in the test split receive fresh codes after
    # the training ones. Report the offending value and its column.
    for case in np.unique(test_x[col]):
        if case not in code_of:
            print(f'unseen category in test column {col!r}: {case!r}')
            code_of[case] = len(code_of)

    trainval_x[col] = trainval_x[col].map(code_of)
    test_x[col] = test_x[col].map(code_of)

In [40]:
# 5-fold cross-validation splitter shared by the training loop.
# NOTE(review): no `shuffle` argument is passed, so the library default applies
# (presumably unshuffled, contiguous folds -- confirm this suits the row order).
kf = KFold(n_splits=5)

In [41]:
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    return mean_squared_error(y_true, y_pred) ** 0.5


# Running sums of the per-fold RMSE, raw and after clipping negatives to 0.
total_train_error, total_val_error = 0, 0
total_train_error_post, total_val_error_post = 0, 0
test_preds = []  # one test-set prediction vector per fold

# enumerate(start=1) numbers the folds for the log, replacing a manual counter.
for num_fold, (train_idx, val_idx) in enumerate(kf.split(trainval_x), start=1):
    train_x, train_y = trainval_x.iloc[train_idx], trainval_y.iloc[train_idx]
    val_x, val_y = trainval_x.iloc[val_idx], trainval_y.iloc[val_idx]

    # Pin the estimator's RNG (same value as seed_everything) so re-running
    # this cell alone gives the same model, independent of global RNG state.
    model = GradientBoostingRegressor(max_depth=4, random_state=42)
    model.fit(train_x, train_y)

    print(f'{num_fold} fold')

    train_y_hat = model.predict(train_x)
    val_y_hat = model.predict(val_x)

    # Keep this fold's test prediction for averaging after the loop.
    test_preds.append(model.predict(test_x))

    train_error = _rmse(train_y, train_y_hat)
    val_error = _rmse(val_y, val_y_hat)

    total_train_error += train_error
    total_val_error += val_error

    # Post-processing: negative predictions are clipped to 0.
    train_y_hat_post = np.clip(train_y_hat, 0, None)
    val_y_hat_post = np.clip(val_y_hat, 0, None)

    train_error_post = _rmse(train_y, train_y_hat_post)
    val_error_post = _rmse(val_y, val_y_hat_post)

    total_train_error_post += train_error_post
    total_val_error_post += val_error_post

    print(f'train error : {train_error:.2f} | val error : {val_error:.2f}')
    print(f'[post] train error : {train_error_post:.2f} | val error : {val_error_post:.2f}')

    print('-'*30)

1 fold
train error : 543.39 | val error : 650.39
[post] train error : 543.33 | val error : 650.35
------------------------------
2 fold
train error : 550.07 | val error : 618.70
[post] train error : 550.02 | val error : 618.65
------------------------------
3 fold
train error : 561.39 | val error : 597.72
[post] train error : 561.35 | val error : 597.18
------------------------------
4 fold
train error : 567.42 | val error : 555.94
[post] train error : 567.36 | val error : 555.89
------------------------------
5 fold
train error : 567.07 | val error : 565.85
[post] train error : 567.03 | val error : 565.78
------------------------------


In [42]:
# Average the accumulated per-fold errors over the actual number of folds,
# so the report stays correct if the KFold configuration changes.
n_folds = kf.get_n_splits()

print(f'final train error : {total_train_error/n_folds:.2f}')
print(f'final val error : {total_val_error/n_folds:.2f}')

print(f'[post] final train error : {total_train_error_post/n_folds:.2f}')
print(f'[post] final val error : {total_val_error_post/n_folds:.2f}')

final train error : 557.87
final val error : 597.72
[post] final train error : 557.82
[post] final val error : 597.57


### post processing

### records (5-fold mean RMSE of the GradientBoostingRegressor, by `max_depth`)

#### max depth 7
final train error : 459.07
final val error : 608.42

#### max depth 5
final train error : 530.45
final val error : 599.85

#### max depth 4
final train error : 557.87
final val error : 597.72

#### max depth 3
final train error : 579.80
final val error : 597.66

In [43]:
# Ensemble: element-wise mean of the per-fold test predictions.
final_pred = np.mean(test_preds, axis=0)

In [44]:
# Post-processing: replace negative predictions with 0, leaving everything else as is.
final_pred_post = final_pred.copy()
final_pred_post[final_pred_post < 0] = 0

In [45]:
# final_pred_exp = np.exp(final_pred)-1

In [46]:
# Load the submission template; its 'Income' column is overwritten with the model predictions.
submission = pd.read_csv('data/sample_submission.csv')

In [47]:
# Submit the post-processed (negatives clipped to 0) ensemble prediction.
# The previous assignment of the raw `final_pred` was dead code: it was
# overwritten on the very next line, so it has been removed.
submission['Income'] = final_pred_post
# submission['Income'] = final_pred_exp
submission

Unnamed: 0,ID,Income
0,TEST_0000,0.000000
1,TEST_0001,9.955983
2,TEST_0002,413.401847
3,TEST_0003,613.009230
4,TEST_0004,3.861380
...,...,...
9995,TEST_9995,813.293201
9996,TEST_9996,781.530637
9997,TEST_9997,363.059176
9998,TEST_9998,3.861380


In [48]:
# Make sure the output directory exists (no error if it is already there).
os.makedirs('submission', exist_ok=True)
# NOTE: both writes below are commented out, so running this cell does not
# save a submission file; uncomment the variant that matches the current run.
# submission.to_csv('submission/feat_engineering_0319_2.csv', index=False)
# submission.to_csv('submission/feat_engineering_0319_log.csv', index=False)