# Julia 機器學習：DecisionTree 決策樹

## 作業 030：乳癌預測資料集

請使用隨機森林模型建立一個分類模型，預測乳癌資料集中的腫瘤為良性或惡性。

In [9]:
using DecisionTree, RDatasets, DataFrames, MLDataUtils, Statistics

## 讀取資料

In [10]:
# Load the "biopsy" table from the R "MASS" package via RDatasets.
biopsy = RDatasets.dataset("MASS", "biopsy")
# Preview the first ten rows.
first(biopsy, 10)

Unnamed: 0_level_0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,Class
Unnamed: 0_level_1,String,Int32,Int32,Int32,Int32,Int32,Int32?,Int32,Int32,Int32,Cat…
1,1000025,5,1,1,1,2,1,3,1,1,benign
2,1002945,5,4,4,5,7,10,3,2,1,benign
3,1015425,3,1,1,1,2,2,3,1,1,benign
4,1016277,6,8,8,1,3,4,3,7,1,benign
5,1017023,4,1,1,3,2,1,3,1,1,benign
6,1017122,8,10,10,8,7,10,9,7,1,malignant
7,1018099,1,1,1,1,2,10,3,1,1,benign
8,1018561,2,1,2,1,2,1,3,1,1,benign
9,1033078,2,1,1,1,2,1,1,1,5,benign
10,1033078,4,2,1,1,2,1,2,1,1,benign


In [11]:
# Discard every row that contains a missing value and rebind `biopsy` to the result.
biopsy = DataFrames.dropmissing(biopsy)
# Preview the first ten rows of the cleaned table.
first(biopsy, 10)

Unnamed: 0_level_0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,Class
Unnamed: 0_level_1,String,Int32,Int32,Int32,Int32,Int32,Int32,Int32,Int32,Int32,Cat…
1,1000025,5,1,1,1,2,1,3,1,1,benign
2,1002945,5,4,4,5,7,10,3,2,1,benign
3,1015425,3,1,1,1,2,2,3,1,1,benign
4,1016277,6,8,8,1,3,4,3,7,1,benign
5,1017023,4,1,1,3,2,1,3,1,1,benign
6,1017122,8,10,10,8,7,10,9,7,1,malignant
7,1018099,1,1,1,1,2,10,3,1,1,benign
8,1018561,2,1,2,1,2,1,3,1,1,benign
9,1033078,2,1,1,1,2,1,1,1,5,benign
10,1033078,4,2,1,1,2,1,2,1,1,benign


## 切分訓練與測試資料集

In [12]:
# Seed Julia's default RNG so the shuffle below yields the same train/test split on
# every run (assumes `shuffleobs` draws from the default RNG, which is its default).
import Random
Random.seed!(1234)

# Shuffle the row numbers of `biopsy`, then split them 80 % / 20 % into the
# training and testing index sets used by the later cells.
indices = MLDataUtils.shuffleobs(collect(1:nrow(biopsy)))
train_ind, test_ind = MLDataUtils.splitobs(indices, at = 0.8);

## 轉成矩陣型態

In [13]:
# Columns 2 through 10 (V1–V9 in the table preview) become a Float64 feature matrix.
features = Float64.(Matrix(biopsy[!, 2:10]))
# The categorical `Class` column is turned into a plain vector of strings.
labels = String[string(class) for class in biopsy.Class];

## 隨機森林模型

In [14]:
# Untrained random forest: 500 trees, each limited to a depth of 9.
model = DecisionTree.RandomForestClassifier(;
    n_trees = 500,
    max_depth = 9,
)

RandomForestClassifier
n_trees:             500
n_subfeatures:       -1
partial_sampling:    0.7
max_depth:           9
min_samples_leaf:    1
min_samples_split:   2
min_purity_increase: 0.0
classes:             nothing
ensemble:            nothing

## 訓練

In [15]:
# Fit the forest in place on the training rows; the `let` block keeps the
# temporary slices out of global scope and evaluates to the value of `fit!`.
let X_train = features[train_ind, :], y_train = labels[train_ind]
    DecisionTree.fit!(model, X_train, y_train)
end

RandomForestClassifier
n_trees:             500
n_subfeatures:       -1
partial_sampling:    0.7
max_depth:           9
min_samples_leaf:    1
min_samples_split:   2
min_purity_increase: 0.0
classes:             ["benign", "malignant"]
ensemble:            Ensemble of Decision Trees
Trees:      500
Avg Leaves: 18.522
Avg Depth:  7.884

## 預測

In [16]:
# Predict a class label for every held-out (test) row.
y = let X_test = features[test_ind, :]
    DecisionTree.predict(model, X_test)
end

137-element Array{String,1}:
 "benign"
 "malignant"
 "benign"
 "benign"
 "malignant"
 "malignant"
 "benign"
 "benign"
 "benign"
 "benign"
 "malignant"
 "malignant"
 "benign"
 ⋮
 "benign"
 "benign"
 "benign"
 "benign"
 "benign"
 "benign"
 "malignant"
 "benign"
 "benign"
 "benign"
 "malignant"
 "benign"

## 評估模型

In [17]:
"""
    accuracy(xs, ys)

Return the fraction of positions at which `xs` and `ys` hold equal values.

# Arguments
- `xs`: predicted labels.
- `ys`: reference labels; must have the same length as `xs`.

# Throws
- `DimensionMismatch`: if `xs` and `ys` differ in length.
"""
function accuracy(xs, ys)
    # Without this guard, `.==` would silently broadcast a scalar or one-element
    # argument against the other one and report a meaningless score.
    if length(xs) != length(ys)
        throw(DimensionMismatch(
            "xs has length $(length(xs)) but ys has length $(length(ys))"
        ))
    end
    return mean(xs .== ys)
end

accuracy (generic function with 1 method)

In [18]:
# Score the predictions `y` against the true labels of the test rows.
let truth = labels[test_ind]
    accuracy(y, truth)
end

0.9781021897810219