In [1]:
import numpy as np
import random
import os

def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module, stores ``str(seed)`` in the
    ``PYTHONHASHSEED`` environment variable, and seeds NumPy's global
    random state.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)


seed_everything(42)  # Fix the seed

In [2]:
import pandas as pd

# Load the training and test tables; both paths are relative to the working directory.
train, test = pd.read_csv('../data/train.csv'), pd.read_csv('../data/test.csv')

# Preview the first three rows of each table.
display(train.head(3), test.head(3))

Unnamed: 0,ID,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,...,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status,Income
0,TRAIN_00000,63,M,Middle (7-8),Full-Time,4,Social Services,Services,White,All other,...,Native,US,US,US,Nonfiler,0,0,0,Unknown,425
1,TRAIN_00001,37,M,Associates degree (Vocational),Full-Time,52,Entertainment,Services,White,All other,...,Native,US,US,US,Single,0,0,0,Under Median,0
2,TRAIN_00002,58,F,High graduate,Full-Time,52,Manufacturing (Non-durable),Admin Support (include Clerical),Black,All other,...,Native,US,US,US,Married Filling Jointly both under 65 (MFJ),3411,0,0,Under Median,860


Unnamed: 0,ID,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,...,Household_Summary,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status
0,TEST_0000,79,M,High Junior,Children or Armed Forces,0,Not in universe or children,Unknown,White,All other,...,Householder,Native,US,Unknown,Unknown,Single,0,0,0,Under Median
1,TEST_0001,47,M,Elementary (5-6),Children or Armed Forces,0,Not in universe or children,Unknown,White,Other Spanish,...,Child 18 or older,Native,US,US,US,Nonfiler,0,0,0,Under Median
2,TEST_0002,18,F,High Junior,Children or Armed Forces,52,Retail,Services,White,All other,...,Child 18 or older,Native,US,US,US,Single,0,0,0,Under Median


In [3]:
# Training features are every column except ID and Income; Income is the target.
train_x, train_y = train.drop(columns=['ID', 'Income']), train['Income']

# The test features only need the ID column removed.
test_x = test.drop(columns=['ID'])

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the training rows for validation.
# The split is taken directly from `train` instead of from the current
# `train_x` / `train_y`, so re-running this cell always reproduces the same
# 90/10 partition rather than carving another 10% off an already-split
# training set.
train_x, X_test, train_y, y_test = train_test_split(
    train.drop(columns=['ID', 'Income']),
    train['Income'],
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

In [14]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every object-dtype feature column of the training split, and
# apply the same mapping to the hold-out split (X_test) and the test features.
encoding_target = list(train_x.dtypes[train_x.dtypes == "object"].index)

for i in encoding_target:
    le = LabelEncoder()

    # Cast the column to str in every split so all values are encoded as strings.
    train_x[i] = train_x[i].astype(str)
    X_test[i] = X_test[i].astype(str)
    test_x[i] = test_x[i].astype(str)

    # The mapping is learned from the training rows only.
    le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])

    # Categories that never occur in the training rows are appended to
    # le.classes_ so that transform() does not raise on them. test_x is
    # scanned first so its codes do not depend on the hold-out split.
    for frame in (test_x, X_test):
        for case in np.unique(frame[i]):
            if case not in le.classes_:
                le.classes_ = np.append(le.classes_, case)

    test_x[i] = le.transform(test_x[i])
    # The hold-out split has to be encoded as well; otherwise
    # model.predict(X_test) would be handed the raw string columns.
    X_test[i] = le.transform(X_test[i])

In [15]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree regressor on the encoded training split.
# random_state is pinned (same value as the train/validation split) so the
# fitted tree does not depend on how much of the global NumPy random stream
# earlier cell executions have consumed.
model = DecisionTreeRegressor(random_state=42)
model.fit(train_x, train_y)

In [22]:
from sklearn.metrics import mean_squared_error

# Score the fitted model on the hold-out split.
y_pred = model.predict(X_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# RMSE is computed as the square root of the plain MSE, which works on every
# scikit-learn version.
rmse_dt = mean_squared_error(y_test, y_pred) ** 0.5
print(f"dt RMSE: {rmse_dt}")

dt RMSE: 835.1271730605033


In [23]:
# Predict Income for every row of the encoded test features.
preds = model.predict(test_x)

In [24]:
# Show the prediction array as the cell output.
preds

array([   0.,    0.,  500., ...,  425.,    0., 1400.])

In [25]:
# Start from the sample submission file and set its Income column to the
# model predictions (assigned by row position).
submission = pd.read_csv("../data/sample_submission.csv").assign(Income=preds)
submission

Unnamed: 0,ID,Income
0,TEST_0000,0.0
1,TEST_0001,0.0
2,TEST_0002,500.0
3,TEST_0003,625.0
4,TEST_0004,0.0
...,...,...
9995,TEST_9995,1700.0
9996,TEST_9996,1200.0
9997,TEST_9997,425.0
9998,TEST_9998,0.0


In [26]:
# Create the output folder first: DataFrame.to_csv raises OSError when the
# target directory does not exist (e.g. on a fresh checkout).
os.makedirs("../submission", exist_ok=True)
# index=False keeps the row index out of the written file.
submission.to_csv("../submission/test4.csv", index = False)