In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor, export_graphviz, ExtraTreeRegressor

<h1>Model Training and Improvement - Regression</h1>

In [3]:
mpg_data = pd.read_fwf("data/autompg/auto-mpg.data", header = None, na_vals = ["?"])

In [4]:
mpg_data.columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model year", "origin", "car name"]

In [5]:
mpg_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [6]:
mpg_data.horsepower.unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [7]:
mpg_data = mpg_data.drop(mpg_data[mpg_data.horsepower == "?"].index)

In [8]:
mpg_data.horsepower = mpg_data.horsepower.astype(float)

In [9]:
mpg_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight          float64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [10]:
mpg_data.shape

(392, 9)

In [11]:
mpg_data = mpg_data.drop("car name", axis = 1)

In [12]:
mpg_data_attributes = mpg_data.drop("mpg", axis = 1)
mpg_data_target = mpg_data["mpg"]

In [13]:
mpg_data_attributes_scaled = MinMaxScaler().fit_transform(mpg_data_attributes)

In [14]:
mpg_data_attributes_scaled.min(axis = 0), mpg_data_attributes_scaled.max(axis = 0)

(array([0., 0., 0., 0., 0., 0., 0.]), array([1., 1., 1., 1., 1., 1., 1.]))

In [15]:
attributes_train, attributes_test, targets_train, targets_test = train_test_split(
    mpg_data_attributes_scaled,
    mpg_data_target,
    test_size = 0.2,
    random_state = 42
)

In [16]:
tree = DecisionTreeRegressor()

In [17]:
tree.fit(attributes_train, targets_train)

DecisionTreeRegressor()

In [18]:
tree.score(attributes_train, targets_train)

1.0

In [19]:
tree.score(attributes_test, targets_test)

0.7901148939167892

In [20]:
export_graphviz(tree)

'digraph Tree {\nnode [shape=box] ;\n0 [label="X[1] <= 0.337\\nmse = 63.1\\nsamples = 313\\nvalue = 23.599"] ;\n1 [label="X[2] <= 0.209\\nmse = 35.353\\nsamples = 176\\nvalue = 28.994"] ;\n0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;\n2 [label="X[5] <= 0.542\\nmse = 24.234\\nsamples = 101\\nvalue = 32.242"] ;\n1 -> 2 ;\n3 [label="X[3] <= 0.138\\nmse = 11.145\\nsamples = 30\\nvalue = 28.283"] ;\n2 -> 3 ;\n4 [label="X[6] <= 0.75\\nmse = 4.718\\nsamples = 16\\nvalue = 30.781"] ;\n3 -> 4 ;\n5 [label="X[2] <= 0.095\\nmse = 2.745\\nsamples = 7\\nvalue = 28.929"] ;\n4 -> 5 ;\n6 [label="X[3] <= 0.091\\nmse = 1.556\\nsamples = 3\\nvalue = 27.333"] ;\n5 -> 6 ;\n7 [label="X[3] <= 0.063\\nmse = 0.25\\nsamples = 2\\nvalue = 26.5"] ;\n6 -> 7 ;\n8 [label="mse = 0.0\\nsamples = 1\\nvalue = 27.0"] ;\n7 -> 8 ;\n9 [label="mse = 0.0\\nsamples = 1\\nvalue = 26.0"] ;\n7 -> 9 ;\n10 [label="mse = 0.0\\nsamples = 1\\nvalue = 29.0"] ;\n6 -> 10 ;\n11 [label="X[2] <= 0.122\\nmse = 0.297\\nsamples

In [21]:
tree.decision_path(attributes_train[:10])

<10x523 sparse matrix of type '<class 'numpy.int64'>'
	with 110 stored elements in Compressed Sparse Row format>

In [22]:
# Max depth is not configured in this case
tree.max_depth

In [23]:
# Most probably 16 questions are too much
tree.get_depth()

16

In [24]:
tree.feature_importances_

array([0.00401329, 0.62892859, 0.17428133, 0.06264555, 0.02064186,
       0.10350182, 0.00598755])

In [25]:
list(zip(tree.feature_importances_, mpg_data_attributes.columns))

[(0.004013293198243126, 'cylinders'),
 (0.6289285905008706, 'displacement'),
 (0.17428133474668797, 'horsepower'),
 (0.06264555173409418, 'weight'),
 (0.02064185702886951, 'acceleration'),
 (0.1035018247214819, 'model year'),
 (0.005987548069752713, 'origin')]