In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# mpl.rc('font', family='NanumBarunGothic') # 혹은 다른 설치한 Nanum 폰트 사용 - 리눅스 사용할때
import scipy
import scipy.stats as stats

import os
import re
import time
import datetime as dt # 사용시 datetime.dt 로 사용 # 현재 시간 사용
from datetime import datetime # 사용시 함수로 바로 사용 #-> 시간 비교

In [2]:
# 경고창 무시
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the submission template, the training set and the test set, in that order.
df_sample, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../datasets/sample_submission.csv",
        "../datasets/train.csv",
        "../datasets/test.csv",
    )
)

In [4]:
# Preview the first five training rows (head() defaults to n=5).
df_train.head()

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02


In [5]:
# Summarise the training frame: column names, non-null counts, dtypes and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        7497 non-null   object 
 1   제조사       7497 non-null   object 
 2   모델        7497 non-null   object 
 3   차량상태      7497 non-null   object 
 4   배터리용량     4786 non-null   float64
 5   구동방식      7497 non-null   object 
 6   주행거리(km)  7497 non-null   int64  
 7   보증기간(년)   7497 non-null   int64  
 8   사고이력      7497 non-null   object 
 9   연식(년)     7497 non-null   int64  
 10  가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 644.4+ KB


In [6]:
# Summarise the test frame: column names, non-null counts, dtypes and memory usage.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        846 non-null    object 
 1   제조사       846 non-null    object 
 2   모델        846 non-null    object 
 3   차량상태      846 non-null    object 
 4   배터리용량     542 non-null    float64
 5   구동방식      846 non-null    object 
 6   주행거리(km)  846 non-null    int64  
 7   보증기간(년)   846 non-null    int64  
 8   사고이력      846 non-null    object 
 9   연식(년)     846 non-null    int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 66.2+ KB


## 데이터 전처리

In [7]:
# Replace missing battery-capacity values with 0.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection: with pandas Copy-on-Write that
# chained in-place call operates on a temporary object and leaves the
# DataFrame unchanged, so the NaNs would silently survive.
df_train['배터리용량'] = df_train['배터리용량'].fillna(0)
df_test['배터리용량'] = df_test['배터리용량'].fillna(0)

# Remove the ID column from both frames before modelling.
# errors="ignore" keeps this cell re-runnable after ID has already been dropped.
df_train = df_train.drop(columns=["ID"], errors="ignore")
df_test = df_test.drop(columns=["ID"], errors="ignore")

In [8]:
# Columns holding categorical (string) values that need encoding.
categorical_columns = [
    "제조사",
    "모델",
    "차량상태",
    "구동방식",
    "사고이력",
]

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Label-encode every categorical column with ONE mapping shared by train and test.
# Fitting the encoder separately on each frame assigns integer codes from that
# frame's own sorted set of values, so a category absent from one frame shifts
# the codes and the same integer ends up meaning different categories in
# df_train and df_test. Fitting on the union of both frames' values keeps the
# codes consistent and also covers categories that appear in only one frame.
for column in categorical_columns:
    label_encoder.fit(
        pd.concat([df_train[column], df_test[column]], ignore_index=True)
    )
    df_train[column] = label_encoder.transform(df_train[column])
    df_test[column] = label_encoder.transform(df_test[column])

## 모델 훈련

In [10]:
from sklearn.model_selection import train_test_split
# Training set from df_train: the price column is the target,
# every remaining column is a feature.
labeled_y_train = df_train["가격(백만원)"]
labeled_x_train = df_train.drop("가격(백만원)", axis=1)
# Test set: df_test is used as the feature matrix as-is.
labeled_x_test = df_test

In [11]:
# 훈련용 데이터 학습
import lightgbm as lgb
# LGBMRegressor 모델 초기화
lgb_model = lgb.LGBMRegressor(random_state=42)
# 모델 학습
lgb_model.fit(labeled_x_train, labeled_y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 407
[LightGBM] [Info] Number of data points in the train set: 7497, number of used features: 9
[LightGBM] [Info] Start training from score 62.331949


In [12]:
from sklearn.metrics import mean_squared_error

# In-sample error: predictions are made on the same rows the model was fitted on.
y_pred = lgb_model.predict(X=labeled_x_train)
mse = mean_squared_error(y_true=labeled_y_train, y_pred=y_pred)
rmse = pow(mse, 0.5)

print("훈련 데이터셋에 대한 RMSE:", rmse)

훈련 데이터셋에 대한 RMSE: 1.1465815698267428


## 모델 적용

In [13]:
# Predict prices for the test features and print the resulting array.
predictions = lgb_model.predict(X=labeled_x_test)
print(predictions)

[130.37540275  80.04801966  64.89309254  34.94246403  47.90285062
 101.46542025  21.91128317  40.04569472  81.26141579  24.67064304
  64.21536565  62.70408366  40.19198423 100.20973202  90.80061081
  22.76241609 125.9935386   52.65197155  60.07166138  34.69304758
  37.77834427 160.46441127  47.12281755  55.88841767  23.16167317
  47.8874571   78.88893476  79.97701689  27.92224035 126.06861405
  63.96976954  40.78883938 114.9456538   88.0555461   35.62458754
  80.01780659  21.55550112  40.15100423  24.6384997  121.21118715
 126.97606844  17.7174165  119.41136227  60.6017461   23.15367901
 100.25968518  21.60001915  74.11032401  62.44075324  77.80768724
  35.74799946  17.03694717  38.74500413 126.60444019  23.7456485
  27.9320475  155.84988283  41.14008267 159.64241179  40.19266233
  55.86312383 124.90782561  40.8783143  120.91191061  40.19817921
  34.24073849  72.23017356  50.19243596  99.96616537  22.99446514
  47.86229755  27.9228205   64.34969006  38.74521627 124.22337249
  23.238351

## CSV 내보내기

In [14]:
# Write the predictions into the submission template and save it as CSV.
# The file name records the model and the NaN handling (NaN -> 0).
df_sample["가격(백만원)"] = predictions
# Create the output directory first: to_csv raises OSError if it does not exist.
os.makedirs("../submissions", exist_ok=True)
df_sample.to_csv("../submissions/LGBMRegressor_NaN_0.csv", index=False)