In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [65]:
def min_max_scaler(data):
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    # noise term prevents the zero division
    return numerator / (denominator + 1e-7)

In [66]:
abalone_data = pd.read_csv('data/abalone.csv',  names = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings'])
abalone_data.head(10)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [67]:
# min-max 정규화
abalone_data['Rings'] = min_max_scaler(abalone_data['Rings'])
print(abalone_data)

     Sex  Length  Diameter  Height  Whole weight  Shucked weight  \
0      M   0.455     0.365   0.095        0.5140          0.2245   
1      M   0.350     0.265   0.090        0.2255          0.0995   
2      F   0.530     0.420   0.135        0.6770          0.2565   
3      M   0.440     0.365   0.125        0.5160          0.2155   
4      I   0.330     0.255   0.080        0.2050          0.0895   
5      I   0.425     0.300   0.095        0.3515          0.1410   
6      F   0.530     0.415   0.150        0.7775          0.2370   
7      F   0.545     0.425   0.125        0.7680          0.2940   
8      M   0.475     0.370   0.125        0.5095          0.2165   
9      F   0.550     0.440   0.150        0.8945          0.3145   
10     F   0.525     0.380   0.140        0.6065          0.1940   
11     M   0.430     0.350   0.110        0.4060          0.1675   
12     M   0.490     0.380   0.135        0.5415          0.2175   
13     F   0.535     0.405   0.145        0.6845

In [68]:
data = np.array(abalone_data)
print(data[0:5,:])
print(data.shape)

[['M' 0.455 0.365 0.095 0.514 0.2245 0.10099999999999999 0.15
  0.4999999982142857]
 ['M' 0.35 0.265 0.09 0.2255 0.0995 0.0485 0.07 0.21428571352040815]
 ['F' 0.53 0.42 0.135 0.677 0.2565 0.1415 0.21 0.28571428469387755]
 ['M' 0.44 0.365 0.125 0.516 0.2155 0.114 0.155 0.3214285702806122]
 ['I' 0.33 0.255 0.08 0.205 0.0895 0.0395 0.055 0.21428571352040815]]
(4177, 9)


In [69]:
nvar = data.shape[1]
datax = data[:,1:nvar]
datay = data[:,0]
print(datax[0:5,:])
print(datay[0:5])

[[0.455 0.365 0.095 0.514 0.2245 0.10099999999999999 0.15
  0.4999999982142857]
 [0.35 0.265 0.09 0.2255 0.0995 0.0485 0.07 0.21428571352040815]
 [0.53 0.42 0.135 0.677 0.2565 0.1415 0.21 0.28571428469387755]
 [0.44 0.365 0.125 0.516 0.2155 0.114 0.155 0.3214285702806122]
 [0.33 0.255 0.08 0.205 0.0895 0.0395 0.055 0.21428571352040815]]
['M' 'M' 'F' 'M' 'I']


In [70]:
from sklearn.model_selection import train_test_split
trnx, tstx, trny, tsty = train_test_split(datax, datay, test_size=0.3, random_state=510)
print(trnx.shape)
print(tstx.shape)
print(trny.shape)
print(tsty.shape)

(2923, 8)
(1254, 8)
(2923,)
(1254,)


In [71]:
from sklearn import tree
tree_model = tree.DecisionTreeClassifier(max_depth=4, min_samples_split=3)
tree_model.fit(X=trnx, y=trny)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [72]:
tree_pred = tree_model.predict(X=tstx)

In [73]:
tree_model.feature_importances_

array([0.04676463, 0.        , 0.01213066, 0.68118879, 0.03633871,
       0.03525328, 0.01010567, 0.17821826])

In [74]:
from sklearn.tree import export_graphviz
export_graphviz(tree_model, out_file='TreeClf.dot')

In [75]:
from sklearn.metrics import confusion_matrix
confusion_matrix(tsty, tree_model.predict(tstx))

array([[271,  50,  76],
       [ 68, 281,  38],
       [280,  71, 119]])

In [76]:
from sklearn.metrics import accuracy_score
accuracy_score(tsty, tree_pred)

0.5350877192982456