In [3]:
import os
import pandas as pd
import numpy as np

cur_path = '.'
data_path = 'data'
train_file_name = 'train.csv'
test_file_name = 'test.csv'
sample_file_name = 'sample_submission.csv'

In [5]:
sample = pd.read_csv(os.path.join(cur_path, data_path, sample_file_name),encoding='utf-8')
train = pd.read_csv(os.path.join(cur_path, data_path, train_file_name), encoding='utf-8')

In [6]:
sample

Unnamed: 0,ID,가격(백만원)
0,TEST_000,0
1,TEST_001,0
2,TEST_002,0
3,TEST_003,0
4,TEST_004,0
...,...,...
841,TEST_841,0
842,TEST_842,0
843,TEST_843,0
844,TEST_844,0


In [7]:
train

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.000,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.200,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02
...,...,...,...,...,...,...,...,...,...,...,...
7492,TRAIN_7492,H사,ION5,Brand New,,AWD,3773,10,No,0,35.95
7493,TRAIN_7493,B사,i3,Pre-Owned,46.000,RWD,135411,2,No,0,23.40
7494,TRAIN_7494,P사,TayCT,Brand New,,AWD,1363,2,No,0,120.00
7495,TRAIN_7495,B사,i3,Nearly New,56.000,RWD,39445,6,No,2,24.00


In [90]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
배터리용량,7497.0,69.3839,13.100509,46.0,58.577,67.3984,78.3108,99.8
주행거리(km),7497.0,44287.979458,55204.064386,3.0,5465.0,17331.0,61252.0,199827.0
보증기간(년),7497.0,4.960918,3.155342,0.0,2.0,5.0,8.0,10.0
연식(년),7497.0,0.222489,0.569232,0.0,0.0,0.0,0.0,2.0
가격(백만원),7497.0,62.331949,36.646759,9.0,34.39,56.0,80.05,161.09


In [8]:
train.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        7497 non-null   object 
 1   제조사       7497 non-null   object 
 2   모델        7497 non-null   object 
 3   차량상태      7497 non-null   object 
 4   배터리용량     4786 non-null   float64
 5   구동방식      7497 non-null   object 
 6   주행거리(km)  7497 non-null   int64  
 7   보증기간(년)   7497 non-null   int64  
 8   사고이력      7497 non-null   object 
 9   연식(년)     7497 non-null   int64  
 10  가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 644.4+ KB


In [89]:
train.isna().sum() # 결측치 확인

ID          0
제조사         0
모델          0
차량상태        0
배터리용량       0
구동방식        0
주행거리(km)    0
보증기간(년)     0
사고이력        0
연식(년)       0
가격(백만원)     0
dtype: int64

In [10]:
train['연식(년)'].unique()

array([2, 0, 1])

# Data Preprocessing

### Column '배터리용량' - 결측치 존재O 
- 연속형 변수 (Standard Scaling)
- 결측치 처리
    - 제조사 별 평균으로 채우기

### Column '주행거리(km)' 
- 연속형 변수 (Standard Scaling)

### Column '모델'
- 명목형 변수 (One - Hot Encoding)
- 모델 개수 파악 후 상위 5개 이외는 나머지로 분류

### Column 차량상태
- 주행거리가 큰 의미를 가지므로 사용X

### Column 보증기간(년)

### Column 사고이력
- 명목형 변수 (One - Hot Encoding)

### 연식(년)
- 명목형 변수(One - Hot Encoding)
- 값이 3개

In [69]:
def battery_capacity_col_preprocessing(train_df: pd.DataFrame)-> pd.DataFrame:
    preprocessed = train_df[['ID']]
    manufacturer = train_df['제조사'].unique()
    concat_table = pd.DataFrame(columns=['ID', '제조사', '배터리용량'])
    for m in manufacturer:
        tmp_df = train_df[train_df['제조사']==m][['ID', '제조사', '배터리용량']]
        manufacturer_mean = np.round(np.mean(tmp_df['배터리용량']),4)
        tmp_df.fillna({'배터리용량': manufacturer_mean}, inplace = True)
        if concat_table.empty:
            concat_table = tmp_df.copy()
            continue
        concat_table = pd.concat([concat_table, tmp_df])

    preprocessed = pd.merge(preprocessed, concat_table, on = 'ID', how = 'inner')
    return preprocessed

In [70]:
battery_capacity_col_preprocessing(train)

Unnamed: 0,ID,제조사,배터리용량
0,TRAIN_0000,P사,86.0770
1,TRAIN_0001,K사,56.0000
2,TRAIN_0002,A사,91.2000
3,TRAIN_0003,A사,78.7756
4,TRAIN_0004,B사,61.0180
...,...,...,...
7492,TRAIN_7492,H사,67.3984
7493,TRAIN_7493,B사,46.0000
7494,TRAIN_7494,P사,75.2653
7495,TRAIN_7495,B사,56.0000


In [71]:
prep = battery_capacity_col_preprocessing(train)
train['배터리용량'] = prep['배터리용량']
train

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.0770,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.0000,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.2000,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,78.7756,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.0180,AWD,178205,1,No,0,62.02
...,...,...,...,...,...,...,...,...,...,...,...
7492,TRAIN_7492,H사,ION5,Brand New,67.3984,AWD,3773,10,No,0,35.95
7493,TRAIN_7493,B사,i3,Pre-Owned,46.0000,RWD,135411,2,No,0,23.40
7494,TRAIN_7494,P사,TayCT,Brand New,75.2653,AWD,1363,2,No,0,120.00
7495,TRAIN_7495,B사,i3,Nearly New,56.0000,RWD,39445,6,No,2,24.00


In [72]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        7497 non-null   object 
 1   제조사       7497 non-null   object 
 2   모델        7497 non-null   object 
 3   차량상태      7497 non-null   object 
 4   배터리용량     7497 non-null   float64
 5   구동방식      7497 non-null   object 
 6   주행거리(km)  7497 non-null   int64  
 7   보증기간(년)   7497 non-null   int64  
 8   사고이력      7497 non-null   object 
 9   연식(년)     7497 non-null   int64  
 10  가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 644.4+ KB


In [None]:
x_train = train[['배터리용량','주행거리(km)', '보증기간(년)', '사고이력', '연식(년)']]
y = train[[ '가격(백만원)']]

In [80]:
x_train

Unnamed: 0,ID,배터리용량,주행거리(km),보증기간(년),사고이력,연식(년)
0,TRAIN_0000,86.0770,13642,0,No,2
1,TRAIN_0001,56.0000,10199,6,No,0
2,TRAIN_0002,91.2000,2361,7,No,0
3,TRAIN_0003,78.7756,21683,3,No,0
4,TRAIN_0004,61.0180,178205,1,No,0
...,...,...,...,...,...,...
7492,TRAIN_7492,67.3984,3773,10,No,0
7493,TRAIN_7493,46.0000,135411,2,No,0
7494,TRAIN_7494,75.2653,1363,2,No,0
7495,TRAIN_7495,56.0000,39445,6,No,2


In [79]:
y

Unnamed: 0,ID,가격(백만원)
0,TRAIN_0000,159.66
1,TRAIN_0001,28.01
2,TRAIN_0002,66.27
3,TRAIN_0003,99.16
4,TRAIN_0004,62.02
...,...,...
7492,TRAIN_7492,35.95
7493,TRAIN_7493,23.40
7494,TRAIN_7494,120.00
7495,TRAIN_7495,24.00


In [88]:
from sklearn.preprocessing import OneHotEncoder

accident = x_train['사고이력'].unique()
model_year = x_train['연식(년)'].unique()
encoder = OneHotEncoder(categories=[accident, model_year])

encoder.fit(x_train)


ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# linear regression

from sklearn.linear_model import LinearRegression

In [None]:
# xgboost
from xgboost import XGBClassifier