In [1]:
import pandas as pd
import numpy as np
import random
import os

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
def seed_everything(seed):
    """Seed the RNGs this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global RNG with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [5]:
# 'Industry_Status',
# 'Hispanic_Origin',
# 'Household_Status',
# 'Birth_Country',
# 'Birth_Country (Father)', 
# 'Birth_Country (Mother)'

In [6]:
# Load the training and test tables from CSV files under the relative data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [7]:
# Display the column labels of the training frame.
train.columns

Index(['ID', 'Age', 'Gender', 'Education_Status', 'Employment_Status',
       'Working_Week (Yearly)', 'Industry_Status', 'Occupation_Status', 'Race',
       'Hispanic_Origin', 'Martial_Status', 'Household_Status',
       'Household_Summary', 'Citizenship', 'Birth_Country',
       'Birth_Country (Father)', 'Birth_Country (Mother)', 'Tax_Status',
       'Gains', 'Losses', 'Dividends', 'Income_Status', 'Income'],
      dtype='object')

### modify education

In [8]:
def modify_education(education):
    """Collapse fine-grained education levels into coarser groups.

    Returns 'dropout' for every level from 'Children' through 'High Senior',
    'CommunityCollege' for 'College' and both associates degrees, and
    'Masters degree' for 'Professional degree' / 'Masters degree'.
    Any other value is returned unchanged.
    """
    groups = (
        ('dropout', ('Children', 'Kindergarten', 'Elementary (1-4)', 'Elementary (5-6)',
                     'Middle (7-8)', 'High Freshman', 'High Sophomore',
                     'High Junior', 'High Senior')),
        ('CommunityCollege', ('College', 'Associates degree (Academic)',
                              'Associates degree (Vocational)')),
        ('Masters degree', ('Professional degree', 'Masters degree')),
    )
    for label, members in groups:
        if education in members:
            return label
    # Not part of any merged group: keep the original label.
    return education

In [9]:
# Derive the coarse 'Education' feature from the raw 'Education_Status' column.
train['Education'] = train['Education_Status'].apply(modify_education)

In [10]:
# Same derivation for the test frame, so both frames carry an 'Education' column.
test['Education'] = test['Education_Status'].apply(modify_education)

### modify income status

In [11]:
def modify_income_status(status):
    """Fold 'Unknown' into 'Under Median'; every other status is returned as-is."""
    merged = ('Unknown', 'Under Median')
    return 'Under Median' if status in merged else status

In [12]:
# Merge the 'Unknown' income status into 'Under Median' in both frames.
for _frame in (train, test):
    _frame['Income_Status'] = _frame['Income_Status'].apply(modify_income_status)

### income==0 예측 (predict whether income is zero)

In [13]:
# Boolean target for the first-stage classifier: True where Income is strictly positive.
train['Income_nonzero'] = train['Income'].gt(0)

In [14]:
# Preview the first rows, now including the derived 'Education' and 'Income_nonzero' columns.
train.head()

Unnamed: 0,ID,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,...,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status,Income,Education,Income_nonzero
0,TRAIN_00000,63,M,Middle (7-8),Full-Time,4,Social Services,Services,White,All other,...,US,US,Nonfiler,0,0,0,Under Median,425,dropout,True
1,TRAIN_00001,37,M,Associates degree (Vocational),Full-Time,52,Entertainment,Services,White,All other,...,US,US,Single,0,0,0,Under Median,0,CommunityCollege,False
2,TRAIN_00002,58,F,High graduate,Full-Time,52,Manufacturing (Non-durable),Admin Support (include Clerical),Black,All other,...,US,US,Married Filling Jointly both under 65 (MFJ),3411,0,0,Under Median,860,High graduate,True
3,TRAIN_00003,44,M,High graduate,Full-Time,52,Retail,Technicians & Support,White,All other,...,US,US,Single,0,0,0,Under Median,850,High graduate,True
4,TRAIN_00004,37,F,High graduate,Full-Time,52,Retail,Sales,White,All other,...,US,US,Head of Household (HOH),0,0,0,Under Median,570,High graduate,True


### main

In [15]:
# Columns removed from BOTH train and test before modelling. Keeping a single
# list prevents the two feature sets from drifting apart.
DROP_COLS = ['ID',
             'Hispanic_Origin',
             'Household_Status',
             'Birth_Country',
             'Birth_Country (Father)', 'Birth_Country (Mother)',
             'Education_Status',
             ]

# Training features additionally exclude the regression target and the
# derived classification target.
trainval_x = train.drop(columns=DROP_COLS + ['Income', 'Income_nonzero'])
trainval_nonzero = train['Income_nonzero']  # first-stage target (Income > 0)
trainval_y = train['Income']                # second-stage target

# Select the test columns in the training order so that feature positions
# line up at predict time; a missing column fails here with a KeyError.
# .copy() gives an independent frame that can be assigned to column by column.
test_x = test.drop(columns=DROP_COLS)[list(trainval_x.columns)].copy()

In [16]:
# Summarise dtypes and non-null counts of the training feature frame.
trainval_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Age                    20000 non-null  int64 
 1   Gender                 20000 non-null  object
 2   Employment_Status      20000 non-null  object
 3   Working_Week (Yearly)  20000 non-null  int64 
 4   Industry_Status        20000 non-null  object
 5   Occupation_Status      20000 non-null  object
 6   Race                   20000 non-null  object
 7   Martial_Status         20000 non-null  object
 8   Household_Summary      20000 non-null  object
 9   Citizenship            20000 non-null  object
 10  Tax_Status             20000 non-null  object
 11  Gains                  20000 non-null  int64 
 12  Losses                 20000 non-null  int64 
 13  Dividends              20000 non-null  int64 
 14  Income_Status          20000 non-null  object
 15  Education          

In [17]:
# Label-encode every object-dtype column: fit on train, reuse the same mapping on test.
encoding_target = list(trainval_x.dtypes[trainval_x.dtypes == "object"].index)

for col in encoding_target:
    le = LabelEncoder()

    # Cast the column to str in both frames so values are compared as the same type.
    trainval_x[col] = trainval_x[col].astype(str)
    test_x[col] = test_x[col].astype(str)

    trainval_x[col] = le.fit_transform(trainval_x[col])

    # Categories that occur only in test are appended to le.classes_ so that
    # transform() below can encode them instead of failing on an unseen label.
    for case in np.unique(test_x[col]):
        if case not in le.classes_:
            # Report the actual unseen value rather than a fixed placeholder string.
            print(f'{col}: unseen test category {case!r}')
            le.classes_ = np.append(le.classes_, case)

    # Keep the fitted Occupation_Status encoder available after the loop.
    if col == 'Occupation_Status':
        save_le = le
        print(f'save {col}')

    test_x[col] = le.transform(test_x[col])

save Occupation_Status


In [18]:
# Fix random_state so the shuffled fold assignment is identical on every run of
# the CV loop, independent of how much of the global NumPy RNG has been consumed.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [19]:
def _predict_income(reg, x, nonzero_hat):
    """Combine the two stages into one income prediction per row of ``x``.

    Rows where ``nonzero_hat`` is zero/False get 0.0; the remaining rows get
    ``reg``'s prediction. The result is always a float array, so regressor
    outputs are never truncated to integers.
    """
    y_hat = np.zeros(len(x), dtype=float)
    nonzero_idx = np.where(nonzero_hat != 0)[0]
    if len(nonzero_idx) > 0:  # skip the regressor when nothing was flagged
        y_hat[nonzero_idx] = reg.predict(x.iloc[nonzero_idx])
    return y_hat


total_train_error, total_val_error = 0, 0
test_preds = []  # one test-set prediction vector per fold
for num_fold, (train_idx, val_idx) in enumerate(kf.split(trainval_x, trainval_nonzero), start=1):
    print(f'{num_fold} fold')

    # kf.split yields positional indices, so select with .iloc everywhere.
    train_x = trainval_x.iloc[train_idx].reset_index(drop=True)
    train_nonzero = trainval_nonzero.iloc[train_idx].reset_index(drop=True)
    train_y = trainval_y.iloc[train_idx].reset_index(drop=True)

    val_x = trainval_x.iloc[val_idx].reset_index(drop=True)
    val_nonzero = trainval_nonzero.iloc[val_idx].reset_index(drop=True)
    val_y = trainval_y.iloc[val_idx].reset_index(drop=True)

    #### step 1 - classification
    clf = GradientBoostingClassifier(max_depth=3)
    clf.fit(train_x, train_nonzero)

    train_nonzero_hat = clf.predict(train_x)
    val_nonzero_hat = clf.predict(val_x)

    train_acc = accuracy_score(train_nonzero, train_nonzero_hat)
    val_acc = accuracy_score(val_nonzero, val_nonzero_hat)

    print(f'train acc : {train_acc*100:.2f} | val acc : {val_acc*100:.2f}')
    ####

    #### step 2 - regression
    # The regressor is fit on the rows the classifier flags as non-zero income.
    train_nonzero_idx = np.where(train_nonzero_hat != 0)[0]
    reg = GradientBoostingRegressor(max_depth=3)
    reg.fit(train_x.iloc[train_nonzero_idx], train_y.iloc[train_nonzero_idx])

    train_y_hat = _predict_income(reg, train_x, train_nonzero_hat)
    val_y_hat = _predict_income(reg, val_x, val_nonzero_hat)

    # RMSE over all rows, including those predicted as exactly zero.
    train_error = mean_squared_error(train_y, train_y_hat) ** 0.5
    val_error = mean_squared_error(val_y, val_y_hat) ** 0.5

    print(f'train error : {train_error:.2f} | val error : {val_error:.2f}')
    ####

    total_train_error += train_error
    total_val_error += val_error

    #### for test set
    test_nonzero_hat = clf.predict(test_x)
    test_y_hat = _predict_income(reg, test_x, test_nonzero_hat)
    test_preds.append(test_y_hat)

    print('-'*30)

1 fold


train acc : 82.45 | val acc : 81.50
train error : 574.59 | val error : 640.29
------------------------------
2 fold
train acc : 82.50 | val acc : 82.42
train error : 586.90 | val error : 576.20
------------------------------
3 fold
train acc : 82.61 | val acc : 81.80
train error : 582.17 | val error : 594.71
------------------------------
4 fold
train acc : 82.44 | val acc : 82.15
train error : 581.12 | val error : 615.00
------------------------------
5 fold
train acc : 82.53 | val acc : 82.10
train error : 579.33 | val error : 612.97
------------------------------


In [20]:
# Average the accumulated per-fold RMSEs over the splitter's actual fold count
# instead of a hard-coded 5, so changing n_splits cannot skew the mean.
n_folds = kf.get_n_splits()
print(f'final train error : {total_train_error/n_folds:.2f}')
print(f'final val error : {total_val_error/n_folds:.2f}')

final train error : 580.82
final val error : 607.83


### depth 3/3 (classifier max_depth / regressor max_depth)
final train error : 580.82
final val error : 607.83

### depth 4/4
final train error : 553.90
final val error : 613.42

### depth 5/5
final train error : 518.09
final val error : 622.05

### depth 7/7
final train error : 427.02
final val error : 643.59

In [21]:
# Ensemble: element-wise mean of the per-fold test predictions.
final_pred = np.mean(test_preds, axis=0)

In [22]:
# Load the sample submission file to use as the output template, then display it.
submission = pd.read_csv('data/sample_submission.csv')
submission

Unnamed: 0,ID,Income
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
...,...,...
9995,TEST_9995,0
9996,TEST_9996,0
9997,TEST_9997,0
9998,TEST_9998,0


In [23]:
# Overwrite the template's 'Income' column with the fold-averaged predictions.
# NOTE(review): assumes submission rows are in the same order as test.csv — confirm.
submission['Income'] = final_pred
submission

Unnamed: 0,ID,Income
0,TEST_0000,0.000000
1,TEST_0001,0.000000
2,TEST_0002,396.495658
3,TEST_0003,613.660784
4,TEST_0004,0.000000
...,...,...
9995,TEST_9995,896.896521
9996,TEST_9996,761.892146
9997,TEST_9997,346.917442
9998,TEST_9998,0.000000


In [24]:
# Create the output directory if it does not exist, then write the submission CSV
# without the index column.
os.makedirs('submission', exist_ok=True)
# submission.to_csv('submission/baseline_0314.csv', index=False)
submission.to_csv('submission/feat_engineering_0321.csv', index=False)