In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
# Load the packages used throughout this notebook.
# suppress=True makes NumPy print floats in fixed-point notation
# instead of scientific notation.
np.set_printoptions(suppress=True)

# 1) 載入資料集

In [None]:
# Load the wine dataset object; its 'data' and 'target' entries are used below.
wine = load_wine()

# Names for the 13 feature columns, in the same order as the columns of wine['data'].
wine_columns = [
    "alcohol",
    "malic_acid",
    "ash",
    "alcalinity_of_ash",
    "magnesium",
    "total_phenols",
    "flavanoids",
    "nonflavanoid_phenols",
    "proanthocyanins",
    "color_intensity",
    "hue",
    "od280/od315_of_diluted_wines",
    "proline",
]

# Append the target vector as a 14th column. It is stored under the name
# "age", which the later cells use to look the label up.
wine_table = np.c_[wine['data'], wine['target']]
df_data = pd.DataFrame(data=wine_table, columns=wine_columns + ["age"])
df_data

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,age
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2.0


# 2) 檢查缺失值
使用 numpy 所提供的函式來檢查是否有 NA 缺失值；若有缺失值，可使用 pandas 的 dropna() 來移除。此做法適用於只有少量缺失值的情況；若遇到大量缺失值，或資料量本身就很少，建議改以機器學習的方法預測並補上缺失值。

# 移除缺失值
df_data=df_data.dropna()

In [None]:
# Feature matrix: every column except the label column 'age', as a NumPy array.
X = df_data.drop(columns=['age']).values
# Label vector: the 'age' column, kept as a pandas Series.
y = df_data['age']

# Locate NaN entries in the feature matrix; the number of hits is the
# number of missing values (0 means nothing is missing).
nan_rows, nan_cols = np.where(np.isnan(X))
print("checked missing data(NAN mount):", len(nan_rows))

checked missing data(NAN mount): 0


In [None]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  age  

# 3) 切割訓練集與測試集

In [None]:
from sklearn.model_selection import train_test_split

# Split the features and labels into a training set and a test set.
#   test_size=0.3   -> 30% of the rows go to the test set (train:test = 7:3)
#   random_state=42 -> fixed seed so the random split is reproducible
#   stratify=y      -> stratified sampling based on the labels
split_kwargs = {"test_size": 0.3, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Report the resulting array shapes as a quick sanity check.
print('train shape:', X_train.shape)
print('test shape:', X_test.shape)

train shape: (124, 13)
test shape: (54, 13)


# 資料前處理
*   Standardization平均&變異數標準化

*   MinMaxScaler最小最大值標準化
*   MaxAbsScaler絕對值最大標準化
*   RobustScaler


# Standardization平均&變異數標準化
將所有特徵標準化（z-score），使得每個特徵的平均值為0，方差為1。注意標準化只會平移與縮放資料，並不會把資料轉換成高斯分佈。適合的使用時機是當特徵之間的尺度或方差差異過大時，標準化能夠有效地幫助模型更快收斂。

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only,
# then standardize the training split with them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# After scaling, each training feature should have zero mean and unit variance.
# Per-feature statistics of the unscaled training split:
raw_mean = np.mean(X_train, axis=0)
raw_std = np.std(X_train, axis=0)
print('資料集 X 的平均值 : ', raw_mean)
print('資料集 X 的標準差 : ', raw_std)

# Per-feature statistics of the standardized training split:
scaled_mean = np.mean(X_train_scaled, axis=0)
scaled_std = np.std(X_train_scaled, axis=0)
print('\nStandardScaler 縮放過後訓練集的平均值 : ', scaled_mean)
print('StandardScaler 縮放過後訓練集的標準差 : ', scaled_std)

資料集 X 的平均值 :  [ 12.9558871    2.28701613   2.36346774  19.66854839 100.04032258
   2.29306452   2.00709677   0.36         1.60556452   4.99427419
   0.95359677   2.60104839 748.33064516]
資料集 X 的標準差 :  [  0.80810413   1.0556736    0.26804563   3.37324308  15.17234283
   0.62454393   0.95128417   0.11730853   0.57654603   2.37149712
   0.2290519    0.68281063 306.82459846]

StandardScaler 縮放過後訓練集的平均值 :  [ 0.  0.  0.  0. -0.  0.  0. -0.  0. -0.  0.  0. -0.]
StandardScaler 縮放過後訓練集的標準差 :  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [None]:
# Transform the test split with the already-fitted scaler; it is not refit here.
X_test_scaled = scaler.transform(X_test)

# Per-feature mean and standard deviation of the transformed test split.
test_mean = np.mean(X_test_scaled, axis=0)
test_std = np.std(X_test_scaled, axis=0)
print('\nStandardScaler 縮放過後測試集的平均值 : ', test_mean)
print('StandardScaler 縮放過後測試集的標準差 : ', test_std)


StandardScaler 縮放過後測試集的平均值 :  [ 0.18245945  0.15403767  0.03749651 -0.16964449 -0.0649054   0.01080836
  0.07683131  0.05209435 -0.08384811  0.08870154  0.05544386  0.05135058
 -0.01544222]
StandardScaler 縮放過後測試集的標準差 :  [0.99426175 1.1652469  1.06599739 0.94673515 0.77803461 0.99756436
 1.14604365 1.17941677 0.96392269 0.91134966 0.98265309 1.11614834
 1.07530799]


In [None]:
# inverse_transform() maps the scaled data back to the original feature scale.
X_test_inverse = scaler.inverse_transform(X_test_scaled)

In [None]:
# Display the first three rows recovered by inverse_transform().
X_test_inverse[:3]

array([[  13.16,    2.36,    2.67,   18.6 ,  101.  ,    2.8 ,    3.24,
           0.3 ,    2.81,    5.68,    1.03,    3.17, 1185.  ],
       [  12.17,    1.45,    2.53,   19.  ,  104.  ,    1.89,    1.75,
           0.45,    1.03,    2.95,    1.45,    2.23,  355.  ],
       [  14.19,    1.59,    2.48,   16.5 ,  108.  ,    3.3 ,    3.93,
           0.32,    1.86,    8.7 ,    1.23,    2.82, 1680.  ]])

In [None]:
# Display the first three rows of the unscaled test split.
X_test[:3]

array([[  13.16,    2.36,    2.67,   18.6 ,  101.  ,    2.8 ,    3.24,
           0.3 ,    2.81,    5.68,    1.03,    3.17, 1185.  ],
       [  12.17,    1.45,    2.53,   19.  ,  104.  ,    1.89,    1.75,
           0.45,    1.03,    2.95,    1.45,    2.23,  355.  ],
       [  14.19,    1.59,    2.48,   16.5 ,  108.  ,    3.3 ,    3.93,
           0.32,    1.86,    8.7 ,    1.23,    2.82, 1680.  ]])

# MinMaxScaler最小最大值標準化
MinMaxScaler 會依據每個特徵的最小值與最大值進行縮放：每個特徵中的最小值變成 0，最大值變成 1，數據會縮放到 [0,1] 之間。

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's minimum and maximum from the training split only,
# then rescale the training split with them.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Per-feature minimum and maximum of the training split before scaling.
print('資料集 X 的最小值 : ', X_train.min(axis=0))
print('資料集 X 的最大值 : ', X_train.max(axis=0))

# Per-feature minimum and maximum after scaling. The labels name MinMaxScaler,
# the scaler used in this section (they previously said "StandardScaler").
print('\nMinMaxScaler 縮放過後訓練集的最小值 : ', X_train_scaled.min(axis=0))
print('MinMaxScaler 縮放過後訓練集的最大值 : ', X_train_scaled.max(axis=0))

資料集 X 的最小值 :  [ 11.03   0.74   1.36  10.6   70.     0.98   0.34   0.13   0.42   1.28
   0.48   1.27 278.  ]
資料集 X 的最大值 :  [  14.83    5.8     3.22   30.    162.      3.88    3.74    0.63    3.58
   13.      1.71    3.92 1515.  ]

StandardScaler 縮放過後訓練集的最小值 :  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
StandardScaler 縮放過後訓練集的最大值 :  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [None]:
# Transform the test split with the already-fitted scaler; it is not refit here,
# so test values are not guaranteed to fall inside [0, 1].
X_test_scaled = scaler.transform(X_test)

# The labels name MinMaxScaler, the scaler used in this section
# (they previously said "StandardScaler").
print('\nMinMaxScaler 縮放過後測試集的最小值 : ', X_test_scaled.min(axis=0))
print('MinMaxScaler 縮放過後測試集的最大值 : ', X_test_scaled.max(axis=0))


StandardScaler 縮放過後測試集的最小值 :  [ 0.13947368  0.02964427  0.1827957   0.07216495  0.08695652  0.05862069
  0.03823529  0.08       -0.00316456  0.07167235  0.07317073  0.00754717
  0.00970089]
StandardScaler 縮放過後測試集的最大值 :  [0.88157895 0.97035573 1.00537634 0.92268041 0.63043478 0.87586207
 1.39411765 1.06       0.80379747 0.80204778 0.78861789 1.03018868
 1.13338723]


# MaxAbsScaler絕對值最大標準化
MaxAbsScaler 與 MinMaxScaler 類似，每個特徵的所有數據都會除以該特徵絕對值的最大值，數據會縮放到 [-1,1] 之間。

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Fit on the training split only. Fitting on the full X would let test rows
# influence the scaling statistics (data leakage) and would differ from how
# the StandardScaler and MinMaxScaler sections fit their scalers.
scaler = MaxAbsScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Scaled copy of the full dataset, kept under its original name; it now uses
# the statistics learned from the training split.
X_scaled = scaler.transform(X)

In [None]:
# Apply the fitted scaler to the test split.
X_test_scaled = scaler.transform(X_test)

# RobustScaler
RobustScaler 以中位數與四分位距（IQR）進行縮放，因此較不受 outlier 影響，可以有效地縮放帶有異常值的數據；異常值本身不會被移除，仍會保留在縮放後的資料中。

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit on the training split only. Fitting on the full X would let test rows
# influence the scaling statistics (data leakage) and would differ from how
# the StandardScaler and MinMaxScaler sections fit their scalers.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Scaled copy of the full dataset, kept under its original name; it now uses
# the statistics learned from the training split.
X_scaled = scaler.transform(X)

In [None]:
# Apply the fitted scaler to the test split.
X_test_scaled = scaler.transform(X_test)