In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# mpl.rc('font', family='NanumBarunGothic') # 혹은 다른 설치한 Nanum 폰트 사용 - 리눅스 사용할때
import scipy
import scipy.stats as stats

import os
import re
import time
import datetime as dt # 사용시 datetime.dt 로 사용 # 현재 시간 사용
from datetime import datetime # 사용시 함수로 바로 사용 #-> 시간 비교

In [2]:
# 경고창 무시
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_sample = pd.read_csv("../datasets/sample_submission.csv")
df_train = pd.read_csv("../datasets/train.csv")
df_test = pd.read_csv("../datasets/test.csv")

In [4]:
df_train.head(5)

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        7497 non-null   object 
 1   제조사       7497 non-null   object 
 2   모델        7497 non-null   object 
 3   차량상태      7497 non-null   object 
 4   배터리용량     4786 non-null   float64
 5   구동방식      7497 non-null   object 
 6   주행거리(km)  7497 non-null   int64  
 7   보증기간(년)   7497 non-null   int64  
 8   사고이력      7497 non-null   object 
 9   연식(년)     7497 non-null   int64  
 10  가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 644.4+ KB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        846 non-null    object 
 1   제조사       846 non-null    object 
 2   모델        846 non-null    object 
 3   차량상태      846 non-null    object 
 4   배터리용량     542 non-null    float64
 5   구동방식      846 non-null    object 
 6   주행거리(km)  846 non-null    int64  
 7   보증기간(년)   846 non-null    int64  
 8   사고이력      846 non-null    object 
 9   연식(년)     846 non-null    int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 66.2+ KB


## 데이터 전처리

In [7]:
# 배터리 NaN값에 대해 평균값으로 대체
df_train['배터리용량'].fillna(df_train['배터리용량'].mean(), inplace=True)
df_test['배터리용량'].fillna(df_test['배터리용량'].mean(), inplace=True)

df_train = df_train.drop(columns=["ID"])
df_test = df_test.drop(columns=["ID"])

In [8]:
# 범주형 컬럼 지정
categorical_columns = ["제조사","모델","차량상태","구동방식","사고이력"]

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# df_train 레이블 인코딩
for column in categorical_columns:
    df_train[column] = label_encoder.fit_transform(df_train[column])

# df_test 레이블 인코딩
for column in categorical_columns:
    df_test[column] = label_encoder.fit_transform(df_test[column])

## 모델 훈련

In [10]:
from sklearn.model_selection import train_test_split
# 훈련 데이터셋 - df_train
labeled_x_train = df_train.drop(columns=["가격(백만원)"])
labeled_y_train = df_train["가격(백만원)"]
# 테스트 데이터셋 - df_test
labeled_x_test = df_test

In [11]:
# 훈련용 데이터 학습
from xgboost import XGBRegressor
# XGBRegressor 모델 초기화
xgb_model = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')
# 모델 학습
xgb_model.fit(labeled_x_train, labeled_y_train)

## 모델 적용

In [12]:
predictions = xgb_model.predict(df_test)
print(predictions)

[130.4626    80.27461   65.18376   34.718346  47.60428  100.28873
  21.752691  40.423832  81.53175   24.562336  64.26888   62.344887
  40.28664   99.98927   90.36405   23.039127 125.077156  52.74526
  60.13844   34.69845   37.823    160.60591   47.041916  55.731117
  23.108208  47.866714  78.696526  79.852005  28.021896 125.95868
  64.0908    40.645386 114.504486  88.41099   35.451313  79.6649
  21.637901  39.74549   24.649706 120.95547  128.28171   22.469324
 119.20014   60.385834  22.906282 100.13307   21.774328  73.716736
  62.572834  77.8415    35.67283   18.335604  38.715202 126.23784
  23.654026  27.721035 155.96198   41.164753 159.41803   40.127316
  55.826466 125.38084   40.20499  120.89401   39.7886    34.132458
  71.983955  49.992413 100.30163   22.969631  48.11144   27.74189
  64.455894  38.76053  123.68456   23.363583  41.34187  121.06994
 160.2754    79.94763   27.897783  40.98211   17.233082  33.91818
  33.869827  40.19235   57.464077  88.339325  64.45321   53.720238
  41

## CSV 내보내기

In [13]:
# 파일명 : submission_날짜
df_sample["가격(백만원)"] = predictions
submission_20250105 = df_sample
submission_20250105.to_csv("../submissions/submission_20250105.csv", index=False)