# Demo 1 task 2

In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error as sk_mse
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

print('All packages imported!')

All packages imported!


## The data

In [17]:
df = pd.read_csv(r"http://users.jyu.fi/~olkhriye/ties4911/demos/demo1/Automobile_price_data_Raw_set.csv")
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


In [18]:
features = ['make', 'body-style', 'wheel-base', 'engine-size', 'horsepower',
            'peak-rpm', 'highway-mpg']
target = ['price']
data = df[features+target]
data

Unnamed: 0,make,body-style,wheel-base,engine-size,horsepower,peak-rpm,highway-mpg,price
0,alfa-romero,convertible,88.6,130,111.0,5000.0,27,13495.0
1,alfa-romero,convertible,88.6,130,111.0,5000.0,27,16500.0
2,alfa-romero,hatchback,94.5,152,154.0,5000.0,26,16500.0
3,audi,sedan,99.8,109,102.0,5500.0,30,13950.0
4,audi,sedan,99.4,136,115.0,5500.0,22,17450.0
...,...,...,...,...,...,...,...,...
200,volvo,sedan,109.1,141,114.0,5400.0,28,16845.0
201,volvo,sedan,109.1,141,160.0,5300.0,25,19045.0
202,volvo,sedan,109.1,173,134.0,5500.0,23,21485.0
203,volvo,sedan,109.1,145,106.0,4800.0,27,22470.0


In [19]:
# How many NA values per column?
sum(data.isna().values)

array([4, 1, 0, 0, 2, 2, 0, 4])

In [20]:
# Get rid of NA's.
data = data.dropna()
# Convert to NumPy array.
data = data.values
X = data[:, :-1]
Y = data[:, -1].astype('float32')  # Type conversion for TensorFlow.

print(X)

[['alfa-romero' 'convertible' 88.6 ... 111.0 5000.0 27]
 ['alfa-romero' 'convertible' 88.6 ... 111.0 5000.0 27]
 ['alfa-romero' 'hatchback' 94.5 ... 154.0 5000.0 26]
 ...
 ['volvo' 'sedan' 109.1 ... 134.0 5500.0 23]
 ['volvo' 'sedan' 109.1 ... 106.0 4800.0 27]
 ['volvo' 'sedan' 109.1 ... 114.0 5400.0 25]]


### One hot encoding

In [21]:
# An ordinary NumPy array would be easier to handle,
# so I want the output to be that, not the sparse matrix,
# which is default.
enc = OneHotEncoder(sparse_output=False)
enc.fit(X[:, :2])  # The two first columns contain categorical data.
encoded = enc.transform(X[:, :2])
encoded

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

Above are the one-hot encodings. The names of the categories can be found as follows:

In [22]:
enc.categories_

[array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
        'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury',
        'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche', 'saab',
        'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object),
 array(['convertible', 'hardtop', 'hatchback', 'sedan', 'wagon'],
       dtype=object)]

In [23]:
encoded.shape

(194, 26)

Now we have to create an array that contains the same information as ```X```but in the place of the first two columns should be ```encoded```. Therefore, its' shape will be $(194, 7-2+26) = (194, 31)$. 

In [24]:
# Allocate the array...
X_enc = np.zeros(shape=(194, 31))
# ...and fill in.
X_enc[:, :26] = encoded
X_enc[:, 26:] = X[:, 2:]
X_enc.astype('float32')  # Type conversion for TensorFlow compatibility.

array([[1.00e+00, 0.00e+00, 0.00e+00, ..., 1.11e+02, 5.00e+03, 2.70e+01],
       [1.00e+00, 0.00e+00, 0.00e+00, ..., 1.11e+02, 5.00e+03, 2.70e+01],
       [1.00e+00, 0.00e+00, 0.00e+00, ..., 1.54e+02, 5.00e+03, 2.60e+01],
       ...,
       [0.00e+00, 0.00e+00, 0.00e+00, ..., 1.34e+02, 5.50e+03, 2.30e+01],
       [0.00e+00, 0.00e+00, 0.00e+00, ..., 1.06e+02, 4.80e+03, 2.70e+01],
       [0.00e+00, 0.00e+00, 0.00e+00, ..., 1.14e+02, 5.40e+03, 2.50e+01]],
      dtype=float32)

## Split

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X_enc, Y, 
                                                    test_size=.2, 
                                                    random_state=5)


## Scale

In [26]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
X_train

array([[-0.14048787, -0.14048787, -0.23328474, ..., -0.99280208,
         0.81413916,  1.1374623 ],
       [ 7.11805217, -0.14048787, -0.23328474, ...,  0.12343035,
        -0.20682823, -0.48138012],
       [-0.14048787, -0.14048787, -0.23328474, ...,  0.20130703,
         0.60994568, -0.33421263],
       ...,
       [-0.14048787, -0.14048787, -0.23328474, ...,  0.95411495,
         0.81413916, -0.48138012],
       [-0.14048787, -0.14048787, -0.23328474, ...,  0.20130703,
         0.60994568, -0.33421263],
       [-0.14048787, -0.14048787, -0.23328474, ...,  1.39541614,
         0.20155872, -0.7757151 ]])

## Scikit-learn

In [28]:
regressor = LinearRegression()
regressor = regressor.fit(X_train, Y_train)

### MSE for the Scikit-learn model

In [29]:
pred_test_sk = regressor.predict(X_test)
pred_test_sk

array([15997.9640989, 19210.9640989,  6756.4640989,  6034.4640989,
       17266.9640989, 10543.4640989,  6136.4640989, 11754.4640989,
        7882.4640989, 18003.4640989, 14507.4640989, 16224.9640989,
        6210.4640989,  8220.9640989,  7962.4640989, 18590.9640989,
       32386.9640989,  8679.4640989,  6257.4640989,  8418.4640989,
        9957.4640989,  9590.4640989,  7826.4640989,  9466.4640989,
       13347.4640989, 13054.4640989, 12535.4640989, 12249.4640989,
        9597.4640989,  9817.4640989,  5983.4640989,  5674.4640989,
        6372.9640989,  5972.4640989,  8662.9640989, 11023.9640989,
       11772.4640989, 10299.9640989,  9377.4640989])

In [30]:
mse_scikit = sk_mse(pred_test_sk, Y_test)
print('MSE: {}'.format(mse_scikit))

MSE: 3897813.408543849


## Keras

In [31]:
# Build and compile.
input_layer = Input((31,))
hidden = Dense(64, activation='relu')(input_layer)
output_layer = Dense(1)(input_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=.01), loss='mean_squared_error')
model.summary()
model.fit(X_train, Y_train,
          batch_size=10, epochs=200)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 31)]              0         
                                                                 
 dense_1 (Dense)             (None, 1)                 32        
                                                                 
Total params: 32
Trainable params: 32
Non-trainable params: 0
_________________________________________________________________
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Ep

<keras.callbacks.History at 0x1fd252daa90>

In [17]:
pred_test_keras = model.predict(X_test)



### MSE for the Keras model
The shapes of ```pred_test_keras``` and ```Y_test``` are different...

In [18]:
print(pred_test_keras.shape)
print(Y_test.shape)

(39, 1)
(39,)


...so one of them is reshaped just to be sure.

In [19]:
mse_keras = sk_mse(Y_test.reshape(39,), pred_test_keras)
print('MSE: {}'.format(mse_keras)) 

MSE: 148508624.0


## Predictions

In [20]:
X_task = np.array(['audi', 'hatchback', 99.5, 131, 160, 5500, 22]).reshape(1, 7)
X_task_categs = enc.transform(X_task[:, :2]).astype('float32')
X_task_categs

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]], dtype=float32)

In [21]:
X_task_encoded = np.concatenate((X_task_categs, X_task[:, 2:].astype('float32')),
                                axis=1)

In [22]:
X_task_encoded

array([[0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00,
        0.00e+00, 0.00e+00, 9.95e+01, 1.31e+02, 1.60e+02, 5.50e+03,
        2.20e+01]], dtype=float32)

In [33]:
X_task_scaled = scaler.transform(X_task_encoded)
X_task_scaled

NameError: name 'X_task_encoded' is not defined

In [32]:
task_predict_scikit = regressor.predict(X_task_scaled)[0]
task_predict_keras = model.predict(X_task_scaled)[0][0]
print('Predictions:\n'
      '------------')
print('Scikit-learn: {}\n'
      'Keras:        {}'.format(task_predict_scikit,
                                task_predict_keras))

NameError: name 'X_task_scaled' is not defined