# 環境準備

## 取得資料集

In [None]:
# Notebook-only alternative that downloads unconditionally, without checking
# whether the file is already present:
# ! wget https://raw.githubusercontent.com/cnchi/datasets/master/CarEvaluation.csv

# Download the dataset only when it is not already in the working directory.
# The standard library is used instead of shelling out to `wget`, so the cell
# does not depend on an external binary and a failed download raises here
# rather than being silently ignored.
import os
import shutil
import urllib.request

Dataset_File = "CarEvaluation.csv"
Dataset_URL = "https://raw.githubusercontent.com/cnchi/datasets/master/" + Dataset_File

if not os.path.isfile(Dataset_File):
    # Write to a temporary name first so an interrupted download never leaves
    # a truncated file that the existence check above would accept next time.
    partial_path = Dataset_File + ".part"
    with urllib.request.urlopen(Dataset_URL) as response, open(partial_path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)
    os.replace(partial_path, Dataset_File)

# 資料集前處理

## 讀入 CSV 檔

In [None]:
import numpy as np
import pandas as pd

# Load the CSV file from the working directory into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="CarEvaluation.csv")

## 切分自變數、應變數

In [None]:
# Independent variables X: every column except the last one.
# Dependent variable Y: the column at position 4.
# NOTE(review): presumably position 4 is also the last column (i.e. the CSV
# has exactly five columns) -- confirm against the file.
X, Y = (
    dataset.iloc[:, :-1].values,
    dataset.iloc[:, 4].values,
)

## 處理缺失資料

In [None]:
from sklearn.impute import SimpleImputer

# Fill NaN entries in columns 1-3 of X with the mean of their column.
# NOTE(review): the imputer is fitted on all rows, before the train/test
# split further down, so rows that later land in the test set contribute to
# the fill-in means.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
X[:, 1:4] = imputer.fit_transform(X[:, 1:4])

## 類別資料數位化

In [None]:
# Encode the dependent variable Y as numeric labels with a label encoder,
# then store the labels as float64.
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
Y = labelEncoder.fit_transform(Y)
Y = Y.astype("float64")

In [None]:
# One-hot encode column 0 of X with pandas.get_dummies, then place the
# resulting indicator columns in front of columns 1-3 and cast the combined
# matrix to float64.
ary_dummies = pd.get_dummies(X[:, 0]).values
X = np.hstack((ary_dummies, X[:, 1:4]))
X = X.astype("float64")

## 切分訓練集、測試集

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.2,
)

## 特徵縮放

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only, then apply that same fitted
# transformation to both the training and the test features.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)