In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

# 讀入紅酒資料/了解欄位資訊
* 固定酸度(Fixed Acidity): 與葡萄酒有關的多數酸，為固定或非揮發性（不易蒸發）
* 揮發性酸度(Volatile Acidity): 葡萄酒中乙酸的含量，含量過高會導致令人不快的醋味
* 檸檬酸(Citric Acid): 少量檸檬酸可以增加葡萄酒的“清爽度”和風味
* 殘糖量(Residual Sugar): 
發酵停止後剩餘的糖量，很少發現低於1克/升的葡萄酒，而高於45克/升的葡萄酒被認為是甜的
* 氯化物(Chlorides): 酒中鹽的含量
* 游離二氧化硫(Free Sulfur Dioxide): 存在於分子SO2和亞硫酸氫根離子間平衡的游離形式，可以防止微生物生長和葡萄酒的氧化
* 總二氧化硫(Total Sulfur Dioxide): S02的自由和結合形式的數量
* 密度(Density): 水的密度(根據酒精和糖含量的百分比計算)
* 酸鹼度(pH): 描述葡萄酒的酸性或鹼性程度，從0（非常酸性）到14（非常鹼性）；大多數葡萄酒的pH值在3-4之間
* 硫酸鹽(Sulphates): 一種葡萄酒添加劑，可提高二氧化硫氣體(SO2)水平，起到抗菌和抗氧化劑的作用
* 酒精含量(Alchole): 葡萄酒的酒精含量百分比
* 品質(Quality): 輸出變量（基於感官數據，得分在0到10之間）
* (參考資料: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009)

In [3]:
data = pd.read_csv('/content/drive/MyDrive/winequality-red.csv')
rows = data.shape[0]
cols = data.shape[1] - 1
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 問題轉換為二元分類(品質是否高於平均)

In [None]:
idxs = (data['quality'] < np.mean(data['quality']))
data.loc[:,'label'] = 1
data.loc[idxs, 'label'] = 0
data.loc[:,['quality', 'label']]

Unnamed: 0,quality,label
0,5,0
1,5,0
2,5,0
3,6,1
4,5,0
...,...,...
1594,5,0
1595,6,1
1596,6,1
1597,5,0


# 提取出特徵和預測目標

In [None]:
X = data.iloc[:,:-2]
Y = data.iloc[:,-1]

# 設定損失函數(吉尼係數)

In [None]:
def gini(groups, classes):
  ## 計算父節點樣本數

	n = float(sum([len(group) for group in groups]))
	g = 0.0
  
  ## 計算分支吉尼係數

	for group in groups:
		s = float(len(group))
		if s == 0: continue
		for class_val in classes:
			p = [row[-1] for row in group].count(class_val) / s
			g = g + p * (1 - p) * (s / n)
	return(g)

# 模擬分支

In [None]:
def sim_split(idx, value, data):
	left, right = list(), list()
	for row in data:
		if row[idx] < value:
			left.append(row)
		else:
			right.append(row)
	return left, right

# 評估所有分支點表現並回傳最佳選擇

In [None]:
def get_split(data):
  
  ## 最佳分支特徵、分支點、吉尼值、左右子群

  opt_col = None
  opt_val = None
  opt_gvalue = np.Infinity
  opt_groups = None

  ## 評估所有特徵-值組合之吉尼值，選擇最佳回傳

  class_values = list(set(row[-1] for row in data))
  for idx in range(len(data[0]) - 1):
    for row in data:
      groups = sim_split(idx, row[idx], data)
      gvalue = gini(groups, class_values)
      if gvalue < opt_gvalue:
        opt_col, opt_val, opt_gvalue, opt_groups = idx, row[idx], gvalue, groups
  return({'column': opt_col, 'value': opt_val, 'groups': opt_groups})

# 為當前節點建立葉節點

In [None]:
def leaf(group):

  ## 紀錄葉節點的多數類別(做為預測)

  outcomes = [row[-1] for row in group]
  majority = max(set(outcomes), key = outcomes.count)
  return(majority)

# 正式進行分支

In [None]:
def split(node, max_depth, min_size, depth):
  
  ## 取得最佳分支的左右子群樣本

  left, right = node['groups']
  del(node['groups'])

  ## 判斷左右子群是否存在，若否，合併子群並建立葉節點

  if not left or not right:
    node['left'] = node['right'] = leaf(left + right)
    return

  ## 判斷是否達大最大深度，若是，則子群各自建立葉節點

  if depth >= max_depth:
    node['left'], node['right'] = leaf(left), leaf(right)
    return

  ## 若左子群滿足分支最少樣本數，則繼續"遞迴"進行分支

  if len(left) <= min_size:
    node['left'] = leaf(left)
  else:
    node['left'] = get_split(left)
    split(node['left'], max_depth, min_size, depth+1)
  
  ## 若右子群滿足分支最少樣本數，則繼續"遞迴"進行分支

  if len(right) <= min_size:
    node['right'] = leaf(right)
  else:
    node['right'] = get_split(right)
    split(node['right'], max_depth, min_size, depth+1)

# 建立決策樹模型

In [None]:
def build_tree(data, max_depth, min_size):
  root = get_split(data)
  split(root, max_depth, min_size, 1)
  return(root)

# 決策樹模型預測

In [None]:
def predict(node, row):
  if row[node['column']] < node['value']:
    if isinstance(node['left'], dict):
      return predict(node['left'], row)
    else:
      return node['left']
  else:
    if isinstance(node['right'], dict):
      return predict(node['right'], row)
    else:
      return node['right']

# 樹狀結構繪圖

In [None]:
def print_tree(node, depth = 0):
  if isinstance(node, dict):
    print('%s[%s < %.3f]' % ((depth * '－', cols[node['column']], node['value'])))
    print_tree(node['left'], depth+1)
    print_tree(node['right'], depth+1)
  else:
    print('%s[%s]' % ((depth * '－', node)))

# 決策樹模型訓練

In [None]:
train = X
train['label'] = Y
train = np.array(train)
model = build_tree(train, 5, 5)

# 繪製樹狀結構

In [None]:
print_tree(model)

[alcohol < 0.292]
－[sulphates < 0.150]
－－[chlorides < 0.114]
－－－[volatile acidity < 0.240]
－－－－[fixed acidity < 0.168]
－－－－－[0.0]
－－－－－[1.0]
－－－－[fixed acidity < 0.274]
－－－－－[0.0]
－－－－－[0.0]
－－－[alcohol < 0.179]
－－－－[pH < 0.441]
－－－－－[0.0]
－－－－－[0.0]
－－－－[sulphates < 0.132]
－－－－－[0.0]
－－－－－[1.0]
－－[total sulfur dioxide < 0.159]
－－－[fixed acidity < 0.549]
－－－－[volatile acidity < 0.301]
－－－－－[1.0]
－－－－－[0.0]
－－－－[volatile acidity < 0.205]
－－－－－[1.0]
－－－－－[1.0]
－－－[total sulfur dioxide < 0.318]
－－－－[volatile acidity < 0.171]
－－－－－[1.0]
－－－－－[0.0]
－－－－[pH < 0.157]
－－－－－[1.0]
－－－－－[0.0]
－[alcohol < 0.477]
－－[sulphates < 0.156]
－－－[volatile acidity < 0.151]
－－－－[fixed acidity < 0.239]
－－－－－[1.0]
－－－－－[1.0]
－－－－[free sulfur dioxide < 0.099]
－－－－－[0.0]
－－－－－[0.0]
－－－[total sulfur dioxide < 0.283]
－－－－[sulphates < 0.251]
－－－－－[1.0]
－－－－－[1.0]
－－－－[pH < 0.465]
－－－－－[1.0]
－－－－－[0.0]
－－[volatile acidity < 0.260]
－－－[fixed acidity < 1.000]
－－－－[residual sugar < 0.822]
－－－－－[1.0]
－－－－－[0.0]
－－－－[0.0

# 計算混淆矩陣

In [None]:
Y_hat = [predict(model, row) for row in train]
Y_hat = np.array(Y_hat).astype(int)

TP = sum((Y_hat == 1) & (Y == 1))
TN = sum((Y_hat == 0) & (Y == 0))
FP = sum((Y_hat == 1) & (Y == 0))
FN = sum((Y_hat == 0) & (Y == 1))

accuracy = (TP + TN) / rows
precision = TP / (TP + FP)
recall = TP / (TP + FN)

print('TP =', TP)
print('TN =', TN)
print('FP =', FP)
print('FN =', FN)
print('Accuracy =', np.round(accuracy, 3))
print('Precision =', np.round(precision, 3))
print('Recall =', np.round(recall, 3))

TP = 677
TN = 581
FP = 163
FN = 178
Accuracy = 0.787
Precision = 0.806
Recall = 0.792
