In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeRegressor

# Model Training: Regression

### Read data, naming of columns

In [3]:
# Load the Auto MPG fixed-width file. It has no header row, so the columns are named afterwards.
# '?' placeholders are deliberately NOT converted to NaN here (no `na_values`): the cleaning
# step further down drops those rows by matching the literal '?' in `horsepower`.
mpg_data = pd.read_fwf(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
    header=None,
)

In [4]:
# The file was read without a header row, so assign the nine column names explicitly,
# in the order the fields appear in the file.
mpg_data.columns = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
    'origin',
    'car_name',
]

### Observing and cleaning

In [5]:
# Preview the first five rows to confirm the names line up with the values.
mpg_data.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""


In [6]:
# (rows, columns) of the freshly loaded frame, before any cleaning.
mpg_data.shape

(398, 9)

In [7]:
# Inspect the inferred dtypes; a numeric field reported as `object` signals non-numeric entries.
mpg_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
car_name         object
dtype: object

In [8]:
# Index labels of the rows whose horsepower is the literal '?' placeholder.
missing_horsepower_rows = mpg_data.index[mpg_data['horsepower'] == '?']
# Remove those rows; `drop` returns a new frame, which is rebound to the same name.
mpg_data = mpg_data.drop(index=missing_horsepower_rows)

In [9]:
# With the '?' rows gone, the remaining horsepower strings can be converted to floats.
mpg_data['horsepower'] = mpg_data['horsepower'].astype(np.float64)

In [10]:
# `car_name` is a text column and is not used as a model input, so remove it.
mpg_data = mpg_data.drop(columns=['car_name'])

### Defining attributes and target, scaling of attributes

In [11]:
# Target is the `mpg` column; the attributes are every remaining column.
mpg_data_target = mpg_data['mpg']
mpg_data_atributes = mpg_data.drop(columns=['mpg'])

In [12]:
# Preview the attribute frame to confirm `mpg` is no longer among the columns.
mpg_data_atributes.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504.0,12.0,70,1
1,8,350.0,165.0,3693.0,11.5,70,1
2,8,318.0,150.0,3436.0,11.0,70,1
3,8,304.0,150.0,3433.0,12.0,70,1
4,8,302.0,140.0,3449.0,10.5,70,1


In [13]:
# Keep the fitted scaler in a variable so the same min/max mapping can be reused later
# (e.g. to transform new rows or to invert the scaling), instead of discarding it.
attributes_scaler = MinMaxScaler()
mpg_data_atributes_scaled = attributes_scaler.fit_transform(mpg_data_atributes)

In [14]:
# Sanity check: after min-max scaling, every column should span exactly [0, 1].
np.min(mpg_data_atributes_scaled, axis=0), np.max(mpg_data_atributes_scaled, axis=0)

(array([0., 0., 0., 0., 0., 0., 0.]), array([1., 1., 1., 1., 1., 1., 1.]))

In [18]:
# Split the raw (unscaled) attributes first, then fit the scaler on the training rows only,
# so the test rows' min/max never influence preprocessing (avoids train/test leakage).
attributes_train_raw, attributes_test_raw, targets_train, targets_test = train_test_split(
    mpg_data_atributes,
    mpg_data_target,
    test_size=0.2,
    random_state=42
)

split_scaler = MinMaxScaler().fit(attributes_train_raw)
attributes_train = split_scaler.transform(attributes_train_raw)
# Test values may fall slightly outside [0, 1] because the range comes from the training rows.
attributes_test = split_scaler.transform(attributes_test_raw)

### Create a model

In [19]:
# Seed the estimator: features are permuted at each split, so without a fixed random_state
# ties between equally good splits can resolve differently from run to run.
tree = DecisionTreeRegressor(random_state=42)

In [20]:
# Fit the regressor on the training split only.
tree.fit(X=attributes_train, y=targets_train)

DecisionTreeRegressor()

In [21]:
# R^2 on the rows the tree was fitted on; a tree with no depth limit can memorise them.
tree.score(X=attributes_train, y=targets_train)

1.0

In [22]:
# R^2 on the held-out rows; a large gap to the training score points to overfitting.
tree.score(X=attributes_test, y=targets_test)

0.7753363833750209

In [24]:
# Depth of the fitted tree (no depth limit was passed when the estimator was created).
tree.get_depth()

16

In [25]:
# Raw importance values, one per input column, in the column order of the training matrix.
tree.feature_importances_

array([4.55831775e-04, 6.30644723e-01, 1.71895165e-01, 6.21740822e-02,
       2.51365295e-02, 1.04226824e-01, 5.46684501e-03])

In [26]:
# Pair each attribute column name with its importance, keeping column order.
list(zip(mpg_data_atributes.columns, tree.feature_importances_))

[('cylinders', 0.0004558317748374961),
 ('displacement', 0.6306447227238449),
 ('horsepower', 0.1718951650996981),
 ('weight', 0.062174082198848804),
 ('acceleration', 0.025136529514326315),
 ('model_year', 0.10422682368258358),
 ('origin', 0.005466845005860895)]

In [27]:
# Same pairing as a mapping: column name -> importance.
{
    column: importance
    for column, importance in zip(mpg_data_atributes.columns, tree.feature_importances_)
}

{'cylinders': 0.0004558317748374961,
 'displacement': 0.6306447227238449,
 'horsepower': 0.1718951650996981,
 'weight': 0.062174082198848804,
 'acceleration': 0.025136529514326315,
 'model_year': 0.10422682368258358,
 'origin': 0.005466845005860895}