In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# mpl.rc('font', family='NanumBarunGothic') # 혹은 다른 설치한 Nanum 폰트 사용 - 리눅스 사용할때
import scipy
import scipy.stats as stats

import os
import re
import time
import datetime as dt # 사용시 datetime.dt 로 사용 # 현재 시간 사용
from datetime import datetime # 사용시 함수로 바로 사용 #-> 시간 비교

In [2]:
# 경고창 무시
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_sample = pd.read_csv("../datasets/sample_submission.csv")
df_train = pd.read_csv("../datasets/train.csv")
df_test = pd.read_csv("../datasets/test.csv")

In [4]:
df_train.head(5)

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        7497 non-null   object 
 1   제조사       7497 non-null   object 
 2   모델        7497 non-null   object 
 3   차량상태      7497 non-null   object 
 4   배터리용량     4786 non-null   float64
 5   구동방식      7497 non-null   object 
 6   주행거리(km)  7497 non-null   int64  
 7   보증기간(년)   7497 non-null   int64  
 8   사고이력      7497 non-null   object 
 9   연식(년)     7497 non-null   int64  
 10  가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 644.4+ KB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        846 non-null    object 
 1   제조사       846 non-null    object 
 2   모델        846 non-null    object 
 3   차량상태      846 non-null    object 
 4   배터리용량     542 non-null    float64
 5   구동방식      846 non-null    object 
 6   주행거리(km)  846 non-null    int64  
 7   보증기간(년)   846 non-null    int64  
 8   사고이력      846 non-null    object 
 9   연식(년)     846 non-null    int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 66.2+ KB


## 데이터 전처리

In [7]:
# 배터리 NaN값에 대해 0으로 대체
df_train['배터리용량'].fillna(0, inplace=True)
df_test['배터리용량'].fillna(0, inplace=True)

df_train = df_train.drop(columns=["ID"])
df_test = df_test.drop(columns=["ID"])

In [8]:
# 범주형 컬럼 지정
categorical_columns = ["제조사","모델","차량상태","구동방식","사고이력"]

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# df_train 레이블 인코딩
for column in categorical_columns:
    df_train[column] = label_encoder.fit_transform(df_train[column])

# df_test 레이블 인코딩
for column in categorical_columns:
    df_test[column] = label_encoder.fit_transform(df_test[column])

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer

numeric_features = ["주행거리(km)","배터리용량"]

# 표준화
standard_scaler = StandardScaler()
df_standardized_train = df_train.copy()
df_standardized_train[numeric_features] = standard_scaler.fit_transform(df_train[numeric_features])

# 정규화 (Min-Max Scaling)
min_max_scaler = MinMaxScaler()
df_normalized_train = df_train.copy()
df_normalized_train[numeric_features] = min_max_scaler.fit_transform(df_train[numeric_features])

# Robust Scaler
robust_scaler = RobustScaler()
df_robust_scaled_train = df_train.copy()
df_robust_scaled_train[numeric_features] = robust_scaler.fit_transform(df_train[numeric_features])

In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer

numeric_features = ["주행거리(km)","배터리용량"]

# 표준화
standard_scaler = StandardScaler()
df_standardized_test = df_test.copy()
df_standardized_test[numeric_features] = standard_scaler.fit_transform(df_test[numeric_features])

# 정규화 (Min-Max Scaling)
min_max_scaler = MinMaxScaler()
df_normalized_test = df_test.copy()
df_normalized_test[numeric_features] = min_max_scaler.fit_transform(df_test[numeric_features])

# Robust Scaler
robust_scaler = RobustScaler()
df_robust_scaled_test = df_test.copy()
df_robust_scaled_test[numeric_features] = robust_scaler.fit_transform(df_test[numeric_features])

In [12]:
df_standardized_test

Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년)
0,4,15,1,0.883981,0,-0.551627,2,0,0
1,1,20,0,1.275113,0,-0.668955,8,0,0
2,1,19,0,-1.256126,2,-0.675263,7,1,0
3,2,2,1,0.669838,0,-0.618311,7,0,1
4,3,0,0,-1.256126,1,-0.668054,10,0,0
...,...,...,...,...,...,...,...,...,...
841,4,16,2,0.563076,0,1.309066,2,0,0
842,6,1,2,0.306127,0,0.498220,0,0,0
843,6,1,2,0.306127,0,1.439533,0,0,0
844,0,11,1,0.702660,0,-0.523295,4,0,0


In [13]:
df_normalized_test

Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년)
0,4,15,1,0.762455,0,0.070294,2,0,0
1,1,20,0,0.901804,0,0.037735,8,0,0
2,1,19,0,0.000000,2,0.035985,7,1,0
3,2,2,1,0.686162,0,0.051789,7,0,1
4,3,0,0,0.000000,1,0.037985,10,0,0
...,...,...,...,...,...,...,...,...,...
841,4,16,2,0.648126,0,0.586638,2,0,0
842,6,1,2,0.556583,0,0.361628,0,0,0
843,6,1,2,0.556583,0,0.622843,0,0,0
844,0,11,1,0.697856,0,0.078156,4,0,0


In [14]:
df_robust_scaled_test

Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년)
0,4,15,1,0.281683,0,-0.059220,2,0,0
1,1,20,0,0.476644,0,-0.179810,8,0,0
2,1,19,0,-0.785061,2,-0.186293,7,1,0
3,2,2,1,0.174943,0,-0.127758,7,0,1
4,3,0,0,-0.785061,1,-0.178884,10,0,0
...,...,...,...,...,...,...,...,...,...
841,4,16,2,0.121727,0,1.853190,2,0,0
842,6,1,2,-0.006351,0,1.019807,0,0,0
843,6,1,2,-0.006351,0,1.987283,0,0,0
844,0,11,1,0.191303,0,-0.030101,4,0,0


## 모델 훈련

In [15]:
from sklearn.model_selection import train_test_split
# 훈련 데이터셋 - df_train
labeled_x_train = df_normalized_train.drop(columns=["가격(백만원)"])
labeled_y_train = df_normalized_train["가격(백만원)"]
# 테스트 데이터셋 - df_test
labeled_x_test = df_normalized_test

In [16]:
# 훈련용 데이터 학습
from xgboost import XGBRegressor
# XGBRegressor 모델 초기화
xgb_model = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')
# 모델 학습
xgb_model.fit(labeled_x_train, labeled_y_train)

In [17]:
from sklearn.metrics import mean_squared_error

y_pred = xgb_model.predict(labeled_x_train)
mse = mean_squared_error(labeled_y_train, y_pred)
rmse = mse ** 0.5

print("훈련 데이터셋에 대한 RMSE:", rmse)

훈련 데이터셋에 대한 RMSE: 0.8232713504146087


## 모델 적용

In [18]:
predictions = xgb_model.predict(labeled_x_test)
print(predictions)

[130.6085    79.985634  65.4354    34.67609   47.920757 102.538055
  21.93267   40.37148   81.21628   24.943634  63.898197  62.870235
  40.510437 100.41974   90.62326   23.15054  124.84866   52.691624
  59.862152  34.716274  37.745632 160.37726   46.95983   55.77642
  23.18413   48.191177  79.000336  80.005295  27.89854  125.57763
  63.869446  40.677185 114.67506   88.254     35.70595   79.88709
  21.375746  39.887497  24.530487 121.06311  128.61415   18.301117
 119.529274  60.57205   23.123087 100.1186    21.845833  73.975685
  62.69927   77.849464  35.715984  16.937187  38.790802 125.94822
  23.702093  27.776833 155.95299   41.199635 159.52345   40.214584
  55.751064 125.163414  40.91155  121.025246  39.704697  34.137535
  71.61586   50.238537 100.22821   22.952236  48.2068    28.066185
  64.28339   38.718185 123.856125  23.298695  41.20778  121.185036
 160.15527   80.27956   27.486893  41.02292   17.623154  33.830452
  34.158947  39.892155  57.3438    88.21488   64.8984    53.715057

## CSV 내보내기

In [19]:
# 파일명 : submission_날짜
df_sample["가격(백만원)"] = predictions
df_sample.to_csv("../submissions/XGBRregressor_NaN_0_Normalizedscaler.csv", index=False)