In [1]:
# Use seaborn for pairplot
!pip install -q seaborn

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Make numpy printouts easier to read: use 3 digits of precision for floats
# and print them in fixed-point notation instead of scientific notation.
np.set_printoptions(precision=3, suppress=True)

In [3]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Show the TensorFlow version this notebook was run with.
print(tf.__version__)

2.4.1


In [13]:
# Auto MPG dataset: it describes automobiles from the late 1970s and early
# 1980s. For each car it records the fuel consumption (MPG) together with
# attributes such as cylinders, displacement, horsepower, and weight.

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]

# Column names are supplied explicitly through names=.
# '?' is read as a missing value, everything from a tab character onward is
# treated as a comment and dropped, and fields are space-separated with any
# extra spaces after a delimiter skipped.
raw_dataset = pd.read_csv(
    url,
    sep=' ',
    skipinitialspace=True,
    comment='\t',
    na_values='?',
    names=column_names,
)

In [14]:
# Work on a copy (DataFrame.copy) so that raw_dataset keeps the data exactly
# as it was loaded, then preview the last 5 rows of the copy (DataFrame.tail).

dataset = raw_dataset.copy(deep=True)
dataset.tail(n=5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


In [15]:
# Clean the data
# .isna() flags every missing value in the dataframe.
# .sum(axis=0) adds the flags up down each column, which gives the number of
# missing values per data field.
# Only the Horsepower field has missing values: 6 of them.

dataset.isna().sum(axis=0)

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [16]:
# Clean the data
# .dropna() removes every row (axis=0) that has a missing value in any
# column (how='any').

dataset = dataset.dropna(axis=0, how='any')

In [17]:
# Clean the data
# Count the missing values per column again: the rows with a missing
# Horsepower value were dropped, so every count is now 0.
dataset.isna().sum(axis=0)

MPG             0
Cylinders       0
Displacement    0
Horsepower      0
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [18]:
# Data Transformation
# The Origin column is categorical, not numeric: it stores numeric codes.
# As a first step towards one-hot encoding, replace each code with a readable label.
# Series.map() substitutes each value in a series according to an input
# correspondence, which can be a function, a dictionary or a series.
# NOTE: values that are not keys of the dictionary become NaN, so running this
# cell a second time would turn the whole Origin column into NaN.


dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})


In [19]:
# Data Transformation
# .map() replaced the numeric codes in the Origin column with labels:
# 1 became USA, 2 became Europe and 3 became Japan. Origin is still a single
# categorical column at this point; it has not been one-hot encoded yet.
dataset.tail(n=5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,USA
394,44.0,4,97.0,52.0,2130.0,24.6,82,Europe
395,32.0,4,135.0,84.0,2295.0,11.6,82,USA
396,28.0,4,120.0,79.0,2625.0,18.6,82,USA
397,31.0,4,119.0,82.0,2720.0,19.4,82,USA


In [21]:
# Transform data
# pd.get_dummies() converts categorical variables into dummy/indicator columns.
# Origin is the only non-numeric column, so it is replaced by one column per
# label; prefix='' and prefix_sep='' name those columns Europe, Japan and USA.
# dtype=np.uint8 keeps the indicators numeric (0/1) on every pandas version:
# pandas 2.0 changed the default indicator dtype from uint8 to bool, which
# would make the mixed float/bool frame convert to an object array later on.

dataset = pd.get_dummies(dataset, prefix='', prefix_sep='', dtype=np.uint8)


In [22]:
# Transform data
# The Origin column is no longer in the dataset: pd.get_dummies() replaced it
# with three new columns (Europe, Japan, USA). This is the one-hot encoding:
# a record has a 1 in the column of the category it belongs to and a 0 in the
# other two.

dataset.tail(n=5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan,USA
393,27.0,4,140.0,86.0,2790.0,15.6,82,0,0,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,1,0,0
395,32.0,4,135.0,84.0,2295.0,11.6,82,0,0,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,0,0,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,0,0,1


In [24]:
# Split data into train set and test set:
# a random 80% sample of the rows (fixed random_state, so the split is
# repeatable) becomes the train set, and the rows that were not sampled
# become the test set.
train_dataset = dataset.sample(random_state=0, frac=0.8)
test_dataset = dataset.drop(index=train_dataset.index)

In [25]:
# Train set: preview the last 5 rows
train_dataset.tail(n=5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan,USA
281,19.8,6,200.0,85.0,2990.0,18.2,79,0,0,1
229,16.0,8,400.0,180.0,4220.0,11.1,77,0,0,1
150,26.0,4,108.0,93.0,2391.0,15.5,74,0,1,0
145,32.0,4,83.0,61.0,2003.0,19.0,74,0,1,0
182,28.0,4,107.0,86.0,2464.0,15.5,76,1,0,0


In [26]:
# Test set: preview the last 5 rows

test_dataset.tail(n=5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan,USA
369,34.0,4,112.0,88.0,2395.0,18.0,82,0,0,1
375,36.0,4,105.0,74.0,1980.0,15.3,82,1,0,0
382,34.0,4,108.0,70.0,2245.0,16.9,82,0,1,0
384,32.0,4,91.0,67.0,1965.0,15.7,82,0,1,0
396,28.0,4,120.0,79.0,2625.0,18.6,82,0,0,1
