In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# mpl.rc('font', family='NanumBarunGothic') # 혹은 다른 설치한 Nanum 폰트 사용 - 리눅스 사용할때
import scipy
import scipy.stats as stats

import os
import re
import time
import datetime as dt # 사용시 datetime.dt 로 사용 # 현재 시간 사용
from datetime import datetime # 사용시 함수로 바로 사용 #-> 시간 비교

In [2]:
# 경고창 무시
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_sample = pd.read_csv("../datasets/sample_submission.csv")
df_train = pd.read_csv("../datasets/train.csv")
df_test = pd.read_csv("../datasets/test.csv")

In [4]:
df_train.head(5)

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        7497 non-null   object 
 1   제조사       7497 non-null   object 
 2   모델        7497 non-null   object 
 3   차량상태      7497 non-null   object 
 4   배터리용량     4786 non-null   float64
 5   구동방식      7497 non-null   object 
 6   주행거리(km)  7497 non-null   int64  
 7   보증기간(년)   7497 non-null   int64  
 8   사고이력      7497 non-null   object 
 9   연식(년)     7497 non-null   int64  
 10  가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 644.4+ KB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        846 non-null    object 
 1   제조사       846 non-null    object 
 2   모델        846 non-null    object 
 3   차량상태      846 non-null    object 
 4   배터리용량     542 non-null    float64
 5   구동방식      846 non-null    object 
 6   주행거리(km)  846 non-null    int64  
 7   보증기간(년)   846 non-null    int64  
 8   사고이력      846 non-null    object 
 9   연식(년)     846 non-null    int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 66.2+ KB


## 데이터 전처리

In [7]:
# 배터리 NaN값에 대해 평균값으로 대체
df_train['배터리용량'].fillna(df_train['배터리용량'].mean(), inplace=True)
df_test['배터리용량'].fillna(df_test['배터리용량'].mean(), inplace=True)

df_train = df_train.drop(columns=["ID"])
df_test = df_test.drop(columns=["ID"])

In [8]:
# 범주형 컬럼 지정
categorical_columns = ["제조사","모델","차량상태","구동방식","사고이력"]

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# df_train 레이블 인코딩
for column in categorical_columns:
    df_train[column] = label_encoder.fit_transform(df_train[column])

# df_test 레이블 인코딩
for column in categorical_columns:
    df_test[column] = label_encoder.fit_transform(df_test[column])

## 모델 훈련

In [10]:
from sklearn.model_selection import train_test_split
labeled_x = df_train.drop(columns=["가격(백만원)"])
labeled_y = df_train["가격(백만원)"]

In [11]:
labeled_x_train,labeled_x_test= train_test_split(labeled_x,test_size = 0.3, random_state = 42)
labeled_y_train,labeled_y_test= train_test_split(labeled_y,test_size = 0.3, random_state = 42)

In [12]:
# 훈련용 데이터 학습
from xgboost import XGBRegressor
# XGBRegressor 모델 초기화
xgb_model = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')
# 모델 학습
xgb_model.fit(labeled_x_train, labeled_y_train)

In [13]:
# 시험용 데이터 예측률 확인
labeled_y_test_predict= xgb_model.predict(labeled_x_test)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
# 성능 평가
mse = mean_squared_error(labeled_y_test, labeled_y_test_predict)
rmse = mean_squared_error(labeled_y_test, labeled_y_test_predict, squared=False)
mae = mean_absolute_error(labeled_y_test, labeled_y_test_predict)
r2 = r2_score(labeled_y_test, labeled_y_test_predict)

# 결과 출력
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R² 점수: {r2}")

MSE: 2.3876694098868807
RMSE: 1.54520853281584
MAE: 0.7430274618699815
R² 점수: 0.9982758373456443


## 모델 적용

In [14]:
predictions = xgb_model.predict(df_test)
print(predictions)

[130.54425   80.240746  64.853546  35.132507  47.894897 104.136765
  22.109385  39.7823    81.21609   24.530767  64.112076  62.87505
  39.956436  99.77154   91.047104  22.90922  126.91882   52.719475
  59.599052  34.73073   37.74694  161.37956   45.778652  55.744057
  23.11535   48.1167    79.130646  78.701035  27.93429  126.085335
  63.937294  40.662468 114.111694  88.31602   35.69029   79.5245
  21.461233  39.931274  24.742193 121.12307  127.71859   20.351593
 119.26764   60.653492  23.192116  99.87213   21.578592  73.65101
  62.58979   78.19418   35.472286  15.43287   38.744366 125.716
  23.613     27.954199 156.47722   41.179585 159.15967   40.37555
  55.812397 126.40466   40.273888 120.84404   39.91346   34.085396
  71.84619   49.802956  99.89654   22.843475  48.36014   27.802437
  64.34081   38.565544 125.830025  23.202646  41.495674 121.045586
 160.16179   79.65108   27.77565   41.15315   16.859701  33.84177
  33.81533   40.14336   57.687447  88.59428   64.5701    53.70768
  40.

## CSV 내보내기

In [17]:
df_sample["가격(백만원)"] = predictions
submission_20250104 = df_sample
submission_20250104.to_csv("../submissions/submission_20250104.csv", index=False)