In [1]:
import torch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)

cuda:0


In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
from catboost import CatBoostClassifier

import pandas as pd
from category_encoders import OneHotEncoder, TargetEncoder

import warnings

# 경고 무시
warnings.filterwarnings("ignore", category=FutureWarning)

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# 한글 폰트 설정
plt.rcParams['font.family'] = 'Malgun Gothic'  # 예시로 'NanumGothic'을 사용

In [3]:
import random
import os

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [4]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [5]:
train = train.drop(['ID'], axis=1)
test = test.drop(['ID'], axis=1)

In [6]:
train

Unnamed: 0,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,Martial_Status,...,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status,Income
0,63,M,Middle (7-8),Full-Time,4,Social Services,Services,White,All other,Married,...,Native,US,US,US,Nonfiler,0,0,0,Unknown,425
1,37,M,Associates degree (Vocational),Full-Time,52,Entertainment,Services,White,All other,Separated,...,Native,US,US,US,Single,0,0,0,Under Median,0
2,58,F,High graduate,Full-Time,52,Manufacturing (Non-durable),Admin Support (include Clerical),Black,All other,Married,...,Native,US,US,US,Married Filling Jointly both under 65 (MFJ),3411,0,0,Under Median,860
3,44,M,High graduate,Full-Time,52,Retail,Technicians & Support,White,All other,Divorced,...,Native,US,US,US,Single,0,0,0,Under Median,850
4,37,F,High graduate,Full-Time,52,Retail,Sales,White,All other,Divorced,...,Native,US,US,US,Head of Household (HOH),0,0,0,Unknown,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,33,M,High graduate,Children or Armed Forces,52,Manufacturing (Durable),Handlers/Cleaners,White,All other,Single,...,Native,US,US,US,Single,0,0,0,Under Median,1300
19996,20,F,College,Full-Time,12,Education,Admin Support (include Clerical),White,Mexican-American,Single,...,Native,US,Mexico,Mexico,Nonfiler,0,0,0,Under Median,850
19997,22,M,College,Children or Armed Forces,52,Transportation,Technicians & Support,White,All other,Single,...,Native,US,US,US,Single,0,0,0,Unknown,999
19998,76,F,High graduate,Not Working,0,Not in universe or children,Unknown,White,All other,Widowed,...,Native,US,Scotland,England,Single,0,0,0,Under Median,0


In [7]:
test['Household_Status'].fillna('Nonfamily householder', inplace=True)

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     20000 non-null  int64 
 1   Gender                  20000 non-null  object
 2   Education_Status        20000 non-null  object
 3   Employment_Status       20000 non-null  object
 4   Working_Week (Yearly)   20000 non-null  int64 
 5   Industry_Status         20000 non-null  object
 6   Occupation_Status       20000 non-null  object
 7   Race                    20000 non-null  object
 8   Hispanic_Origin         20000 non-null  object
 9   Martial_Status          20000 non-null  object
 10  Household_Status        20000 non-null  object
 11  Household_Summary       20000 non-null  object
 12  Citizenship             20000 non-null  object
 13  Birth_Country           20000 non-null  object
 14  Birth_Country (Father)  20000 non-null  object
 15  Bi

In [9]:
# 원-핫 인코딩을 적용할 열 지정
one_hot_cols = ['Gender', 'Income_Status']

# 원-핫 인코더 생성 및 train 데이터에 적용
one_hot_encoder = OneHotEncoder(cols=one_hot_cols, use_cat_names=True)
train_encoded = one_hot_encoder.fit_transform(train[one_hot_cols])

# 동일한 원-핫 인코더를 test 데이터에 적용
test_encoded = one_hot_encoder.transform(test[one_hot_cols])

In [10]:
# 레이블 인코더 객체 생성
label_encoder = LabelEncoder()

# one_hot_cols에 명시된 열을 제외한 카테고리형 변수를 인코딩 대상으로 설정
encoding_target = [col for col in train.columns if train[col].dtype == 'object' and col not in one_hot_cols]

# 인코딩 대상 열에 대해 레이블 인코딩 적용
for col in encoding_target:
    # train과 test 데이터셋에서 해당 열의 모든 값을 문자열로 변환
    test_encoded[col] = train[col].astype(str)
    test_encoded[col] = test[col].astype(str)

    # 레이블 인코더를 train 데이터에 적합시키고 변환
    label_encoder.fit(train[col])
    train_encoded[col] = label_encoder.transform(train[col])

    # test 데이터에도 동일한 인코더를 적용
    test_encoded[col] = label_encoder.transform(test[col])

In [11]:
train_encoded = pd.concat([train_encoded, train[['Working_Week (Yearly)','Age','Gains','Losses','Income']]], axis=1)
test_encoded = pd.concat([test_encoded, test[['Working_Week (Yearly)','Age','Gains','Losses']]], axis=1)

train = train_encoded
test = test_encoded

In [12]:
train = train.drop(train[train['Gains'] > 60000].index)

In [406]:
# # 로그 변환을 적용할 열 목록
# columns_to_log_transform = ['Working_Week (Yearly)', 'Age', 'Gains', 'Losses', 'Income']
# 
# # 각 열에 대해 로그 변환 적용
# for column in columns_to_log_transform:
#     train[column] = np.log1p(train[column])
# 
# columns_to_log_transform_test = ['Working_Week (Yearly)', 'Age', 'Gains', 'Losses']
# 
# for column in columns_to_log_transform_test:
#     test[column] = np.log1p(test[column])



In [13]:
train

Unnamed: 0,Gender_M,Gender_F,Income_Status_Unknown,Income_Status_Under Median,Income_Status_Over Median,Education_Status,Employment_Status,Industry_Status,Occupation_Status,Race,...,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Working_Week (Yearly),Age,Gains,Losses,Income
0,1,0,1,0,0,15,2,20,11,4,...,2,39,39,39,4,4,63,0,0,425
1,1,0,0,1,0,1,2,6,11,4,...,2,39,39,39,5,52,37,0,0,0
2,0,1,0,1,0,12,2,11,0,1,...,2,39,39,39,2,52,58,3411,0,860
3,1,0,0,1,0,12,2,19,12,4,...,2,39,39,39,5,52,44,0,0,850
4,0,1,1,0,0,12,2,19,10,4,...,2,39,39,39,0,52,37,0,0,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1,0,0,1,0,12,0,10,4,4,...,2,39,39,39,5,52,33,0,0,1300
19996,0,1,0,1,0,4,2,5,0,4,...,2,39,25,25,4,12,20,0,0,850
19997,1,0,1,0,0,4,0,21,12,4,...,2,39,39,39,5,52,22,0,0,999
19998,0,1,0,1,0,12,3,14,14,4,...,2,39,34,8,5,0,76,0,0,0


In [14]:
test

Unnamed: 0,Gender_M,Gender_F,Income_Status_Unknown,Income_Status_Under Median,Income_Status_Over Median,Education_Status,Employment_Status,Industry_Status,Occupation_Status,Race,...,Household_Summary,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Working_Week (Yearly),Age,Gains,Losses
0,1,0,0,1,0,9,0,14,14,4,...,4,2,39,40,40,5,0,79,0,0
1,1,0,0,1,0,7,0,14,14,4,...,0,2,39,39,39,4,0,47,0,0
2,0,1,0,1,0,9,0,19,11,4,...,0,2,39,39,39,5,52,18,0,0
3,0,1,1,0,0,1,2,12,11,4,...,7,2,39,39,39,2,30,39,0,0
4,1,0,1,0,0,3,0,14,14,4,...,2,2,39,39,39,4,0,6,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,0,0,1,0,2,0,15,12,4,...,4,2,39,39,39,5,52,31,0,0
9996,1,0,0,1,0,4,0,19,4,4,...,4,2,39,39,39,2,52,27,0,0
9997,1,0,0,1,0,12,0,19,11,1,...,0,2,39,39,39,4,7,18,0,0
9998,1,0,0,1,0,3,0,14,14,4,...,2,2,39,39,39,4,0,9,0,0


In [409]:
# train['Income'] = np.log1p(train['Income'])

In [15]:
X = train[train.columns.drop('Income')]
Y = train['Income']

In [16]:
# X와 Y로 나누기
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42, shuffle=True)

In [29]:
import optuna
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# 데이터셋을 미리 준비하세요
# X_train, X_test, y_train, y_test = train_test_split(...)

def objective(trial):
    # Optuna가 탐색할 파라미터 공간 정의
    param = {
        'objective': 'reg:squarederror',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 9),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # XGBoost 모델 초기화 및 훈련
    model = XGBRegressor(**param)
    model.fit(X_train, y_train)

    # 예측 및 RMSE 계산
    preds = model.predict(X_test)
    preds[preds < 100] = 0
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    return rmse

# Optuna study 객체 생성 및 최적화 실행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100) # n_trials: 시도할 횟수

# 최적의 파라미터 출력
print(f"Best trial: {study.best_trial.params}")

[I 2024-03-28 11:11:41,652] A new study created in memory with name: no-name-bcd3bb18-c7f8-41f4-afd6-2736f372c2d0
[I 2024-03-28 11:11:44,442] Trial 0 finished with value: 614.1662625955245 and parameters: {'learning_rate': 0.10674538767591064, 'n_estimators': 934, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.9458949744866842, 'colsample_bytree': 0.9136686559326455}. Best is trial 0 with value: 614.1662625955245.
[I 2024-03-28 11:11:44,830] Trial 1 finished with value: 560.7078961020974 and parameters: {'learning_rate': 0.2395486145127404, 'n_estimators': 183, 'max_depth': 6, 'min_child_weight': 6, 'subsample': 0.7976796372393959, 'colsample_bytree': 0.6435252540838661}. Best is trial 1 with value: 560.7078961020974.
[I 2024-03-28 11:11:46,716] Trial 2 finished with value: 627.8605420814539 and parameters: {'learning_rate': 0.14855839170128185, 'n_estimators': 609, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.6822301785177693, 'colsample_bytree': 0.9193086738869833}. B

KeyboardInterrupt: 

In [30]:
from xgboost import XGBRegressor

# XGBRegressor 모델 초기화
xgb_model = XGBRegressor(objective='reg:squarederror',
                         learning_rate =0.022685065748377198,
                         n_estimators= 390,
                         max_depth= 6,
                         min_child_weight= 9,
                         subsample= 0.659653636923127,
                         colsample_bytree= 0.5006822194941765,
                         
)

# 모델 훈련
xgb_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
y_pred_xgb = xgb_model.predict(X_test)
y_pred_xgb[y_pred_xgb<100] = 0
# RMSE 계산
rmse_xgb = mean_squared_error(y_test, y_pred_xgb, squared=False)
print(f"XGB RMSE: {rmse_xgb}")

XGB RMSE: 524.8821780389111


In [31]:
pred_final = xgb_model.predict(test)
# pred_final = np.expm1(pred_final)
pred_final[pred_final<100] = 0

In [32]:
submission = pd.read_csv("../data/sample_submission.csv")
submission['Income'] = pred_final
submission

Unnamed: 0,ID,Income
0,TEST_0000,0.000000
1,TEST_0001,0.000000
2,TEST_0002,420.344910
3,TEST_0003,619.469116
4,TEST_0004,0.000000
...,...,...
9995,TEST_9995,931.887573
9996,TEST_9996,791.552917
9997,TEST_9997,386.226105
9998,TEST_9998,0.000000


In [33]:
submission.to_csv("../submission/test8.csv", index = False)