In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# mpl.rc('font', family='NanumBarunGothic') # 혹은 다른 설치한 Nanum 폰트 사용 - 리눅스 사용할때
import scipy
import scipy.stats as stats

import os
import re
import time
import datetime as dt # 사용시 datetime.dt 로 사용 # 현재 시간 사용
from datetime import datetime # 사용시 함수로 바로 사용 #-> 시간 비교

In [2]:
# 경고창 무시
import warnings
warnings.filterwarnings('ignore')

In [4]:
df_sample = pd.read_csv("../datasets/sample_submission.csv")
df_train = pd.read_csv("../datasets/train.csv")
df_test = pd.read_csv("../datasets/test.csv")

In [5]:
df_train.head(5)

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        7497 non-null   object 
 1   제조사       7497 non-null   object 
 2   모델        7497 non-null   object 
 3   차량상태      7497 non-null   object 
 4   배터리용량     4786 non-null   float64
 5   구동방식      7497 non-null   object 
 6   주행거리(km)  7497 non-null   int64  
 7   보증기간(년)   7497 non-null   int64  
 8   사고이력      7497 non-null   object 
 9   연식(년)     7497 non-null   int64  
 10  가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 644.4+ KB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        846 non-null    object 
 1   제조사       846 non-null    object 
 2   모델        846 non-null    object 
 3   차량상태      846 non-null    object 
 4   배터리용량     542 non-null    float64
 5   구동방식      846 non-null    object 
 6   주행거리(km)  846 non-null    int64  
 7   보증기간(년)   846 non-null    int64  
 8   사고이력      846 non-null    object 
 9   연식(년)     846 non-null    int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 66.2+ KB


## 데이터 전처리

In [8]:
# 배터리 NaN값에 대해 평균값으로 대체
df_train['배터리용량'].fillna(df_train['배터리용량'].mean(), inplace=True)
df_test['배터리용량'].fillna(df_test['배터리용량'].mean(), inplace=True)

df_train = df_train.drop(columns=["ID"])
df_test = df_test.drop(columns=["ID"])

In [9]:
# 범주형 컬럼 지정
categorical_columns = ["제조사","모델","차량상태","구동방식","사고이력"]

In [10]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# df_train 레이블 인코딩
for column in categorical_columns:
    df_train[column] = label_encoder.fit_transform(df_train[column])

# df_test 레이블 인코딩
for column in categorical_columns:
    df_test[column] = label_encoder.fit_transform(df_test[column])

## 모델 훈련

In [11]:
from sklearn.model_selection import train_test_split
# 훈련 데이터셋 - df_train
labeled_x_train = df_train.drop(columns=["가격(백만원)"])
labeled_y_train = df_train["가격(백만원)"]
# 테스트 데이터셋 - df_test
labeled_x_test = df_test

In [12]:
# 하이퍼파라미터 튜닝
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

In [14]:
# 훈련용 데이터 학습
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
# XGBRegressor 모델 초기화
xgb_model = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')

# 그리드 서치 설정
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

# 모델 학습
grid_search.fit(labeled_x_train, labeled_y_train)

In [16]:
from sklearn.metrics import mean_squared_error

# 최적의 모델로 예측 및 성능 평가
best_model = grid_search.best_estimator_
y_pred = best_model.predict(labeled_x_train)
mse = mean_squared_error(labeled_y_train, y_pred)
rmse = mse ** 0.5

print("훈련 데이터셋에 대한 RMSE:", rmse)

훈련 데이터셋에 대한 RMSE: 1.137137813501262


## 모델 적용

In [18]:
predictions = grid_search.predict(labeled_x_test)
print(predictions)

[129.941     80.173256  64.757385  34.854393  47.489243 100.05583
  21.835234  40.03086   81.45757   25.119244  64.18162   62.70643
  40.35535   99.78688   91.32694   22.992388 125.82331   52.476078
  59.873306  34.81568   37.75619  159.44054   47.10892   55.976624
  23.106306  47.79192   78.80637   79.90024   27.900635 126.2678
  63.884182  40.956203 114.94112   87.98005   35.558006  79.95297
  22.123285  39.763927  24.572237 121.0515   126.98672   21.135044
 119.59533   60.72162   23.144615  99.83811   21.552511  72.67173
  62.513397  77.62859   35.52095   16.719791  38.65703  126.54276
  23.728601  27.869568 155.7903    41.027203 156.85625   40.245663
  55.956    125.51614   40.487885 121.254776  40.115845  34.20275
  71.99775   50.59164  100.14562   23.061571  47.946533  27.799019
  63.995174  38.631992 125.205826  23.19559   41.119576 120.69994
 160.40855   79.86527   27.339483  40.97389   17.044357  34.093746
  33.71669   40.129463  57.355     88.02707   64.603165  53.477356
  41

## CSV 내보내기

In [19]:
# 파일명 : submission_날짜
df_sample["가격(백만원)"] = predictions
submission_20250110 = df_sample
submission_20250110.to_csv("../submissions/submission_20250110.csv", index=False)