# Standard Version

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot

In [54]:
import os
from urllib.request import urlretrieve

# Fetch the course dataset once; later runs reuse the local copy.
dataset_file = 'CarEvaluation.csv'
if not os.path.isfile(dataset_file):
  # urlretrieve needs no external `wget` binary and raises on a failed
  # download instead of silently leaving the file missing.
  urlretrieve('https://raw.githubusercontent.com/cnchi/datasets/master/' + dataset_file, dataset_file)

# Read the same path that was just checked/downloaded.
dataset = pd.read_csv(dataset_file)

In [55]:
# Display the raw frame; the NaN cells are the missing values.
dataset

Unnamed: 0,City,Children,Age,Salary,ToBuy
0,Taipei,,44.0,72000.0,No
1,Taichung,0.0,27.0,48000.0,Yes
2,Kaohsiung,0.0,30.0,54000.0,No
3,Taichung,1.0,38.0,61000.0,No
4,Kaohsiung,2.0,40.0,,Yes
5,Taipei,2.0,35.0,58000.0,Yes
6,Taichung,1.0,,52000.0,No
7,Taipei,2.0,48.0,79000.0,Yes
8,Kaohsiung,1.0,50.0,83000.0,No
9,Taipei,2.0,37.0,67000.0,Yes


切分自變數、應變數

In [56]:
# Features are every column except the last; the target is the last column.
# Both slices are anchored to the end of the frame, so they stay consistent
# with each other instead of relying on a hard-coded column position.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

In [57]:
# Display the feature matrix and the target vector together as a tuple.
X, Y

(array([['Taipei', nan, 44.0, 72000.0],
        ['Taichung', 0.0, 27.0, 48000.0],
        ['Kaohsiung', 0.0, 30.0, 54000.0],
        ['Taichung', 1.0, 38.0, 61000.0],
        ['Kaohsiung', 2.0, 40.0, nan],
        ['Taipei', 2.0, 35.0, 58000.0],
        ['Taichung', 1.0, nan, 52000.0],
        ['Taipei', 2.0, 48.0, 79000.0],
        ['Kaohsiung', 1.0, 50.0, 83000.0],
        ['Taipei', 2.0, 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

檢查是否有缺失資料：先算每欄有幾個，再加總看整個 DataFrame 總共有幾個缺失資料

In [58]:
# Count the NaNs per column, add those counts up, and report when the
# frame contains at least one missing value.
missing_total = dataset.isnull().sum().sum()
if missing_total > 0:
  print('要補缺失資料')

要補缺失資料


處理缺失資料

In [59]:
from sklearn.impute import SimpleImputer

# Columns 1..3 of X hold the numeric features; replace every NaN in them
# with the mean of its column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 1:4] = imputer.fit_transform(X[:, 1:4])

In [60]:
# Print the three numeric columns to confirm the NaNs were replaced.
print(X[:, 1:4])

[[1.2222222222222223 44.0 72000.0]
 [0.0 27.0 48000.0]
 [0.0 30.0 54000.0]
 [1.0 38.0 61000.0]
 [2.0 40.0 63777.77777777778]
 [2.0 35.0 58000.0]
 [1.0 38.77777777777778 52000.0]
 [2.0 48.0 79000.0]
 [1.0 50.0 83000.0]
 [2.0 37.0 67000.0]]


類別資料數位化

In [23]:
# Label-encode the target Y: 'No' becomes 0 and 'Yes' becomes 1.
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
Y_codes = labelEncoder.fit_transform(Y)
Y = Y_codes.astype("float64")  # store the integer codes as floats

In [24]:
# Print the encoded target vector.
print(Y)

[0. 1. 0. 0. 1. 1. 0. 1. 0. 1.]


In [25]:
# One-hot encode the city column (X[:, 0]) and place the indicator columns
# in front of the three numeric columns.
# NOTE: X is overwritten here, so re-running this cell on its own would
# re-encode the already encoded matrix.
ary_dummies = pd.get_dummies(X[:, 0]).values
numeric_part = X[:, 1:4]
X = np.hstack((ary_dummies, numeric_part)).astype("float64")

In [26]:
# Print the fully numeric feature matrix (indicator columns first, then the numeric ones).
print(X)

[[0.00000000e+00 0.00000000e+00 1.00000000e+00 1.22222222e+00
  4.40000000e+01 7.20000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  2.70000000e+01 4.80000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  3.00000000e+01 5.40000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 1.00000000e+00
  3.80000000e+01 6.10000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 2.00000000e+00
  4.00000000e+01 6.37777778e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.00000000e+00
  3.50000000e+01 5.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 1.00000000e+00
  3.87777778e+01 5.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.00000000e+00
  4.80000000e+01 7.90000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  5.00000000e+01 8.30000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.00000000e+00
  3.70000000e+01 6.70000000e+04]]


切割訓練集和測試集

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.2,
)

In [29]:
# Number of rows that ended up in the training split.
print(len(X_train))

8


特徵縮放

In [30]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to the test rows.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [32]:
# Print the standardized training features.
print(X_train)

[[ 2.64575131 -0.77459667 -1.          0.87211946  0.26306757  0.12381479]
 [-0.37796447 -0.77459667  1.          0.87211946 -0.25350148  0.46175632]
 [-0.37796447  1.29099445 -1.         -2.04846663 -1.97539832 -1.53093341]
 [-0.37796447  1.29099445 -1.         -0.58817359  0.05261351 -1.11141978]
 [-0.37796447 -0.77459667  1.          0.87211946  1.64058505  1.7202972 ]
 [-0.37796447  1.29099445 -1.         -0.58817359 -0.0813118  -0.16751412]
 [-0.37796447 -0.77459667  1.         -0.26366402  0.95182631  0.98614835]
 [-0.37796447 -0.77459667  1.          0.87211946 -0.59788085 -0.48214934]]


# Simple Version

取得資料集

In [33]:
# Brute-force alternative: download without checking whether the file exists.
# ! wget https://raw.githubusercontent.com/cnchi/datasets/master/CarEvaluation.csv

# Check whether the file already exists before deciding to download it.
import os
from urllib.request import urlretrieve

Dataset_File = "CarEvaluation.csv"
if not os.path.isfile(Dataset_File):
  # urlretrieve needs no external `wget` binary and raises on a failed
  # download instead of silently leaving the file missing.
  urlretrieve("https://raw.githubusercontent.com/cnchi/datasets/master/" + Dataset_File, Dataset_File)

安裝快樂版函式庫

In [34]:
# Clone the HappyML helper library only when its folder is not already
# present in the working directory.
happyml_dir = "HappyML"
if not os.path.isdir(happyml_dir):
  os.system("git clone https://github.com/cnchi/HappyML.git")

讀入 CSV 檔

In [35]:
import HappyML.preprocessor as pp

# Load the CSV through the HappyML helper; the saved output of the next
# cell shows the result rendered as a table with the same columns as the file.
dataset = pp.dataset(file="CarEvaluation.csv")

In [39]:
# Display the loaded frame; the NaN cells are the missing values.
dataset

Unnamed: 0,City,Children,Age,Salary,ToBuy
0,Taipei,,44.0,72000.0,No
1,Taichung,0.0,27.0,48000.0,Yes
2,Kaohsiung,0.0,30.0,54000.0,No
3,Taichung,1.0,38.0,61000.0,No
4,Kaohsiung,2.0,40.0,,Yes
5,Taipei,2.0,35.0,58000.0,Yes
6,Taichung,1.0,,52000.0,No
7,Taipei,2.0,48.0,79000.0,Yes
8,Kaohsiung,1.0,50.0,83000.0,No
9,Taipei,2.0,37.0,67000.0,Yes


切分自變數、應變數

In [36]:
# Columns 0-3 become the features X; column 4 becomes the target Y.
feature_columns = list(range(4))
X, Y = pp.decomposition(dataset, x_columns=feature_columns, y_columns=[4])

In [41]:
# Display the feature frame and the target frame together.
# NOTE(review): the saved output already shows imputed values (e.g.
# Children = 1.222222 in row 0), so this cell was last run after the
# missing-data cell; re-run the notebook top to bottom to see the raw NaNs here.
X, Y

(        City  Children        Age        Salary
 0     Taipei  1.222222       44.0       72000.0
 1   Taichung       0.0       27.0       48000.0
 2  Kaohsiung       0.0       30.0       54000.0
 3   Taichung       1.0       38.0       61000.0
 4  Kaohsiung       2.0       40.0  63777.777778
 5     Taipei       2.0       35.0       58000.0
 6   Taichung       1.0  38.777778       52000.0
 7     Taipei       2.0       48.0       79000.0
 8  Kaohsiung       1.0       50.0       83000.0
 9     Taipei       2.0       37.0       67000.0,
   ToBuy
 0    No
 1   Yes
 2    No
 3    No
 4   Yes
 5   Yes
 6    No
 7   Yes
 8    No
 9   Yes)

處理缺失資料

In [37]:
# Fill the missing feature values using the "mean" strategy; the saved
# output below shows each NaN replaced by a numeric value.
# NOTE(review): presumably the per-column mean — confirm in HappyML.
X = pp.missing_data(X, strategy="mean")

In [42]:
# Display the features after filling the missing values.
X

Unnamed: 0,City,Children,Age,Salary
0,Taipei,1.222222,44.0,72000.0
1,Taichung,0.0,27.0,48000.0
2,Kaohsiung,0.0,30.0,54000.0
3,Taichung,1.0,38.0,61000.0
4,Kaohsiung,2.0,40.0,63777.777778
5,Taipei,2.0,35.0,58000.0
6,Taichung,1.0,38.777778,52000.0
7,Taipei,2.0,48.0,79000.0
8,Kaohsiung,1.0,50.0,83000.0
9,Taipei,2.0,37.0,67000.0


類別資料數位化

In [43]:
# Use the label encoder to turn the target Y into numbers. With
# mapping=True the helper also returns the code-to-label dictionary
# (shown in the saved output as {0: 'No', 1: 'Yes'}).
Y, Y_mapping = pp.label_encoder(Y, mapping=True)

In [45]:
# Display the encoded target and the code-to-label mapping.
Y, Y_mapping

(   ToBuy
 0      0
 1      1
 2      0
 3      0
 4      1
 5      1
 6      0
 7      1
 8      0
 9      1,
 {0: 'No', 1: 'Yes'})

In [44]:
# Use the one-hot encoder on feature column 0 (City); the saved output
# shows it replaced by City_Kaohsiung / City_Taichung / City_Taipei.
X = pp.onehot_encoder(X, columns=[0])

In [46]:
# Display the features after one-hot encoding.
X

Unnamed: 0,City_Kaohsiung,City_Taichung,City_Taipei,Children,Age,Salary
0,0,0,1,1.222222,44.0,72000.0
1,0,1,0,0.0,27.0,48000.0
2,1,0,0,0.0,30.0,54000.0
3,0,1,0,1.0,38.0,61000.0
4,1,0,0,2.0,40.0,63777.777778
5,0,0,1,2.0,35.0,58000.0
6,0,1,0,1.0,38.777778,52000.0
7,0,0,1,2.0,48.0,79000.0
8,1,0,0,1.0,50.0,83000.0
9,0,0,1,2.0,37.0,67000.0


切分訓練集、測試集

In [47]:
# Split features and target into training and test parts, with 80% of the
# rows for training (8 train / 2 test rows in the saved output).
# NOTE(review): random_state=0 presumably fixes the shuffle seed — confirm in HappyML.
X_train, X_test, Y_train, Y_test = pp.split_train_test(X, Y, train_size=0.8, random_state=0)

In [50]:
# Display the four parts of the split.
# NOTE(review): the saved output shows already standardized values and is
# cut off partway through Y_train, so it was captured after the
# feature-scaling cell had run; re-run top to bottom to refresh it.
X_train, X_test, Y_train, Y_test

(   City_Kaohsiung  City_Taichung  City_Taipei  Children       Age    Salary
 4        2.645751      -0.774597         -1.0  0.872119  0.263068  0.123815
 9       -0.377964      -0.774597          1.0  0.872119 -0.253501  0.461756
 1       -0.377964       1.290994         -1.0 -2.048467 -1.975398 -1.530933
 6       -0.377964       1.290994         -1.0 -0.588174  0.052614 -1.111420
 7       -0.377964      -0.774597          1.0  0.872119  1.640585  1.720297
 3       -0.377964       1.290994         -1.0 -0.588174 -0.081312 -0.167514
 0       -0.377964      -0.774597          1.0 -0.263664  0.951826  0.986148
 5       -0.377964      -0.774597          1.0  0.872119 -0.597881 -0.482149,
    City_Kaohsiung  City_Taichung  City_Taipei  Children       Age    Salary
 2        2.645751      -0.774597         -1.0 -2.048467 -1.458829 -0.901663
 8        2.645751      -0.774597         -1.0 -0.588174  1.984964  2.139811,
    ToBuy
 4      1
 9      1
 1      1
 6      0
 7      1
 3      0
 0  

特徵縮放

In [48]:
# Scale the features: X_train is passed as the first argument and both
# splits are passed in transform_arys.
# NOTE(review): presumably the scaler is fitted on the first argument and
# then applied to every array in transform_arys — confirm in HappyML.
X_train, X_test = pp.feature_scaling(X_train, transform_arys=(X_train, X_test))

In [49]:
# Display the scaled training and test features.
X_train, X_test

(   City_Kaohsiung  City_Taichung  City_Taipei  Children       Age    Salary
 4        2.645751      -0.774597         -1.0  0.872119  0.263068  0.123815
 9       -0.377964      -0.774597          1.0  0.872119 -0.253501  0.461756
 1       -0.377964       1.290994         -1.0 -2.048467 -1.975398 -1.530933
 6       -0.377964       1.290994         -1.0 -0.588174  0.052614 -1.111420
 7       -0.377964      -0.774597          1.0  0.872119  1.640585  1.720297
 3       -0.377964       1.290994         -1.0 -0.588174 -0.081312 -0.167514
 0       -0.377964      -0.774597          1.0 -0.263664  0.951826  0.986148
 5       -0.377964      -0.774597          1.0  0.872119 -0.597881 -0.482149,
    City_Kaohsiung  City_Taichung  City_Taipei  Children       Age    Salary
 2        2.645751      -0.774597         -1.0 -2.048467 -1.458829 -0.901663
 8        2.645751      -0.774597         -1.0 -0.588174  1.984964  2.139811)