In [1]:
# 1. Problem definition
# Metric: RMSLE
# Target: Price
# Submission file: result.csv (a single column named `pred` holding the predicted Price)

# 2. Load libraries and data
import pandas as pd
train = pd.read_csv("car_train.csv")
test = pd.read_csv("car_test.csv")

# 3. Exploratory data analysis (EDA)
print("===== 데이터 크기 =====")
print(train.shape, test.shape)

print("\n===== train 데이터 샘플 =====")
print(train.head(1))

print("\n===== test 데이터 샘플 =====")
print(test.head(1))

print("\n===== 데이터 정보(자료형) =====")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
train.info()

print("\n===== train 결측치 수 =====")
print(train.isnull().sum().sum())

print("\n===== test 결측치 수 =====")
print(test.isnull().sum().sum())

# For every object-typed column, check whether train and test contain exactly
# the same set of distinct values.
print("\n===== 카테고리 비교 =====")
cols = train.select_dtypes(include='object').columns
for col in cols:
    set_train = set(train[col])
    set_test = set(test[col])
    same = (set_train == set_test)
    if same:
        print(col, "\t카테고리 동일함")
    else:
        print(col, "\t카테고리 동일하지 않음")

print("\n===== target 기술 통계 =====")
print(train['Price'].describe())

===== 데이터 크기 =====
(6732, 17) (5772, 16)

===== train 데이터 샘플 =====
   Price Levy Manufacturer   Model  Prod. year Category Leather interior  \
0  13956  603        LEXUS  RX 450        2015     Jeep              Yes   

  Fuel type Engine volume    Mileage  Cylinders Gear box type Drive wheels  \
0    Hybrid           3.5  143619 km        6.0     Automatic          4x4   

    Doors       Wheel  Color  Airbags  
0  04-May  Left wheel  Black       12  

===== test 데이터 샘플 =====
  Levy Manufacturer   Model  Prod. year Category Leather interior Fuel type  \
0  730    SSANGYONG  Actyon        2016     Jeep              Yes    Petrol   

  Engine volume   Mileage  Cylinders Gear box type Drive wheels   Doors  \
0           1.6  70940 km        4.0     Automatic        Front  04-May   

        Wheel  Color  Airbags  
0  Left wheel  Black        4  

===== 데이터 정보(자료형) =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6732 entries, 0 to 6731
Data columns (total 17 columns):
 #   Column 

In [7]:
# Baseline pipeline: label-encode -> hold-out split -> RandomForest -> result.csv
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 4. Preprocessing
# pop() returns the label column and removes 'Price' from `train` in place.
target = train.pop('Price')

# Label encoding: each object column is encoded on train and test stacked
# together, so one encoder per column is fitted on the values of both frames.
n_train = len(train)
cols = train.select_dtypes(include='object').columns
combined = pd.concat([train, test])

for col in cols:
    le = LabelEncoder()
    encoded = le.fit_transform(combined[col])
    combined[col] = encoded

# Undo the stacking by position: the first n_train rows came from train.
train, test = combined.iloc[:n_train], combined.iloc[n_train:]

# 5. Hold out 20% of the training rows for validation
X_tr, X_val, y_tr, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0,
)

# 6. Fit the model and score it on the validation split (RMSLE)
rf = RandomForestRegressor(random_state=0).fit(X_tr, y_tr)
pred = rf.predict(X_val)
result = root_mean_squared_log_error(y_val, pred)
print('rmsle:', result)

# 7. Predict on the test set and write the submission file
pred = rf.predict(test)
submit = pd.DataFrame({'pred': pred})
submit.to_csv("result.csv", index=False)

rmsle: 1.1008952910276844


In [9]:
# Performance improvement: parse numeric columns, add a Turbo flag, use more trees

# 2. Load libraries and data
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv("car_train.csv")
test = pd.read_csv("car_test.csv")


def _clean_features(df):
    """Return a copy of ``df`` with 'Engine volume' and 'Mileage' made numeric.

    - 'Turbo' (new, int): 1 where the 'Engine volume' text contains 'Turbo', else 0.
    - 'Engine volume' (float): the same text with 'Turbo' removed.
    - 'Mileage' (int): the first whitespace-separated token of the text
      (drops the trailing unit, e.g. '143619 km' -> 143619).

    The input frame is not modified. The same function is applied to train and
    test so both frames always receive identical cleaning.
    """
    engine = df['Engine volume']
    return df.assign(**{
        # regex=False: 'Turbo' is matched as a literal substring.
        'Turbo': engine.str.contains('Turbo', regex=False).astype(int),
        'Engine volume': engine.str.replace('Turbo', '', regex=False).astype(float),
        'Mileage': df['Mileage'].str.split().str[0].astype(int),
    })


# 4. Preprocessing
# pop() returns the label column and removes 'Price' from `train` in place.
target = train.pop('Price')

# Change the Engine volume / Mileage dtypes and create the Turbo column
train = _clean_features(train)
test = _clean_features(test)

# Label encoding: each remaining object column is encoded on train and test
# stacked together, so one encoder per column is fitted on both frames' values.
combined = pd.concat([train, test])
cols = train.select_dtypes(include='object').columns

for col in cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])

# Undo the stacking by position: the first n_train rows came from train.
n_train = len(train)
train = combined[:n_train]
test = combined[n_train:]

# 5. Hold out 20% of the training rows for validation
X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

# 6. Fit the model and score it on the validation split
rf = RandomForestRegressor(n_estimators=200, random_state=0)
rf.fit(X_tr, y_tr)
pred = rf.predict(X_val)

# RMSLE
result = root_mean_squared_log_error(y_val, pred)
print('rmsle:', result)

# 7. Predict on the test set and write the submission file
pred = rf.predict(test)
submit = pd.DataFrame({'pred': pred})
submit.to_csv("result.csv", index=False)

rmsle: 1.082016203127291
