In [41]:
import pandas as pd
import numpy as np

In [42]:
# Read the raw automobile dataset from the CSV file into a DataFrame.
automobile_data = pd.read_csv(filepath_or_buffer='Dataset/Automobile_data.csv')

In [43]:
'''
The dataset lists different makes and models of cars together with a
variety of features for each car, such as the fuel system, engine size,
compression ratio, horsepower, and so on. Like any dataset from the real
world, it is likely to have some missing fields.

In this dataset, missing values are represented by question marks.
The dataset is useful for price prediction: the very last column is the
price of that particular automobile given all its features.
'''
automobile_data.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [44]:
'''
A common practice when dealing with missing data is simply to drop the
records that have information missing.

That technique is good enough for the simple model we are building here.
As a first step, replace every question mark with np.nan so that pandas
treats those fields as missing.
'''

automobile_data = automobile_data.replace(to_replace='?', value=np.nan)
automobile_data.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [45]:
'''
The next step is to drop the records that have missing fields, which is
easy to do in pandas with the dropna function.

After this cell the dataset contains only those records that have values
for all columns.
'''

# axis=0 / how='any' are the dropna defaults, spelled out: a row is removed
# as soon as any one of its fields is missing.
automobile_data = automobile_data.dropna(axis=0, how='any')
automobile_data.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
6,1,158,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
8,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
10,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430


In [46]:
'''
We are going to make use of just four features: the make, fuel type,
body style, and horsepower. Let's say we expect these to affect the price
of the car the most.

In the real world you would probably perform some exploratory analysis
first to see which features are most significant.
'''
col = ['make', 'fuel-type', 'body-style', 'horsepower']
# Take an explicit copy so automobile_features is an independent frame.
# A later cell assigns to its 'horsepower' column; assigning into a bare
# column selection of automobile_data is the pattern that pandas flags with
# SettingWithCopyWarning.
automobile_features = automobile_data[col].copy()
automobile_features.head()

Unnamed: 0,make,fuel-type,body-style,horsepower
3,audi,gas,sedan,102
4,audi,gas,sedan,115
6,audi,gas,sedan,110
8,audi,gas,sedan,140
10,bmw,gas,sedan,101


In [47]:
'''
These are the features, or X values. The target, or Y label, is the
automobile price, which we extract into the automobile_target Series.
'''

automobile_target = automobile_data.loc[:, 'price']
automobile_target.head(n=5)

3     13950
4     17450
6     17710
8     23875
10    16430
Name: price, dtype: object

In [48]:
'''
Several of the fields we have read into this pandas DataFrame, horsepower
among them, are stored as strings (dtype object).

So if you call the describe function on the horsepower column you get the
summary for non-numeric data: 159 values, of which 48 are unique.

Horsepower, however, is a numeric value and has numeric meaning,
so we need to convert it to numeric form.
'''
automobile_features.horsepower.describe()

count     159
unique     48
top        68
freq       18
Name: horsepower, dtype: object

In [49]:
'''
Turn off a warning that pandas raises, the SettingWithCopyWarning, which
warns of unpredictable results when chained assignments are performed on
a DataFrame.

NOTE(review): this silences the warning for the whole session instead of
removing its cause. Taking an explicit copy when the feature columns are
selected would presumably make the suppression unnecessary - confirm.
'''
pd.set_option('mode.chained_assignment', None)

In [50]:
'''
Use the pd.to_numeric function to convert the horsepower field to a
numeric form.

If you now run the describe function on the horsepower column, you get
statistics for numeric data: mean, median, standard deviation, and so on.
'''
# Build a new frame with the converted column instead of assigning into the
# existing one: automobile_features originates from a column selection of
# automobile_data, and in-place column assignment on such a frame is what
# SettingWithCopyWarning is about. assign() keeps the column order and does
# not depend on that warning being switched off.
automobile_features = automobile_features.assign(
    horsepower=pd.to_numeric(automobile_features['horsepower'])
)
automobile_features['horsepower'].describe()

count    159.000000
mean      95.836478
std       30.718583
min       48.000000
25%       69.000000
50%       88.000000
75%      114.000000
max      200.000000
Name: horsepower, dtype: float64

In [51]:
'''
Similarly, the price values held in automobile_target are still in a
string format (dtype object), so describe reports count/unique/top/freq
rather than numeric statistics.
'''
# Describe automobile_target itself: it is the object the narrative talks
# about and the one the next cell converts. It was taken from
# automobile_data['price'], so the summary is the same.
automobile_target.describe()

count      159
unique     145
top       7898
freq         2
Name: price, dtype: object

In [52]:
'''
Convert the target to a numeric form as well by calling astype(float),
which simply casts the price strings to type float. Run the describe
function on the automobile prices afterwards and you will see statistics
for numeric values.
'''

automobile_target = automobile_target.astype(dtype=float)
automobile_target.describe()

count      159.000000
mean     11445.729560
std       5877.856195
min       5118.000000
25%       7372.000000
50%       9233.000000
75%      14719.500000
max      35056.000000
Name: price, dtype: float64

In [53]:
'''
Our input X features also contain the columns make, fuel-type and
body-style. All three of these columns hold values that are categorical
in nature: each value names one of a fixed set of categories.

The inputs that you pass into neural networks, and the outputs from
neural networks, can only be numeric.

That means you need to convert these categorical values to some numeric form.

For this, we'll use one-hot encoding. One-hot encoding is a
standard technique used to encode categorical data in numeric form.
'''

# dtype=int pins the indicator columns to integer 0/1. Without it the dtype
# depends on the pandas version (bool from pandas 2.0 onwards), which would
# not match the numeric encoding described above.
automobile_features = pd.get_dummies(
    automobile_features,
    columns=['make', 'fuel-type', 'body-style'],
    dtype=int,
)
automobile_features.head()

Unnamed: 0,horsepower,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_jaguar,make_mazda,make_mercedes-benz,make_mitsubishi,...,make_toyota,make_volkswagen,make_volvo,fuel-type_diesel,fuel-type_gas,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon
3,102,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,115,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
6,110,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
8,140,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
10,101,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [59]:
'''
Before we feed these input values to our neural network, we perform some
preprocessing.

When you're working with neural networks, or any learning model,
it's best practice to standardize the numeric values that you
feed into your ML model.

This allows the ML model to be more stable during the training
phase, and it generally performs better.

The preprocessing.scale function standardizes the horsepower
numeric values by subtracting the mean and dividing by the standard deviation.

The cell output shows our standardized values for horsepower.

Remember, it's not just neural networks: ML algorithms in general tend to
work better when the numeric values are standardized so that
they're roughly in the same range.

We've now set up the input features that we are going to
feed into a neural network and specified the output targets as well.
'''
from sklearn import preprocessing

# NOTE(review): the scaling statistics are computed over every row, and the
# train/test split only happens in a later cell, so test rows influence the
# mean/std used here. Consider splitting first and fitting the scaler on the
# training rows only.
horsepower_scaled = preprocessing.scale(automobile_features[['horsepower']])
automobile_features[['horsepower']] = horsepower_scaled
automobile_features[['horsepower']].head()

Unnamed: 0,horsepower
3,0.201279
4,0.625812
6,0.46253
8,1.442223
10,0.168622


In [61]:
'''
Use the train_test_split library function from sklearn to split our
dataset into training data and test data.

We'll use 80% of the dataset for training purposes and 20% to
test how our model performs.
'''
from sklearn.model_selection import train_test_split

# random_state=0 fixes the shuffle so the split is reproducible.
# NOTE(review): the result names mix upper and lower case (X_train/x_test,
# Y_train/y_test); they are kept as-is because the following cells use them.
X_train, x_test, Y_train, y_test = train_test_split(
    automobile_features,
    automobile_target,
    test_size=0.2,
    random_state=0,
)

In [64]:
# Display the training feature rows returned by train_test_split.
X_train

Unnamed: 0,horsepower,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_jaguar,make_mazda,make_mercedes-benz,make_mitsubishi,...,make_toyota,make_volkswagen,make_volvo,fuel-type_diesel,fuel-type_gas,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon
185,-0.353881,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
171,0.658469,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
26,-0.909040,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
137,2.095352,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
148,-0.451850,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19,-0.843727,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
144,-0.451850,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
97,-0.876384,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
158,-1.300918,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0


In [65]:
# Display the held-out test feature rows returned by train_test_split.
x_test

Unnamed: 0,horsepower,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_jaguar,make_mazda,make_mercedes-benz,make_mitsubishi,...,make_toyota,make_volkswagen,make_volvo,fuel-type_diesel,fuel-type_gas,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon
13,0.821751,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
61,-0.386537,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
136,2.095352,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
68,0.887064,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
151,-1.104979,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
161,-0.843727,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
89,-0.876384,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
172,0.658469,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0
50,-0.90904,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
117,1.507536,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [66]:
# Display the training target prices returned by train_test_split.
Y_train

185     8195.0
171    11549.0
26      7609.0
137    18620.0
148     8013.0
        ...   
19      6295.0
144     9233.0
97      7999.0
158     7898.0
70     31600.0
Name: price, Length: 127, dtype: float64

In [67]:
# Display the held-out test target prices returned by train_test_split.
y_test

13     21105.0
61     10595.0
136    18150.0
68     28248.0
151     6338.0
161     8358.0
89      5499.0
172    17669.0
50      5195.0
117    18150.0
169     9989.0
175     9988.0
90      7099.0
167     8449.0
92      6849.0
54      7395.0
67     25552.0
184     7995.0
119     7957.0
37      7895.0
91      6649.0
81      8499.0
120     6229.0
186     8495.0
93      7349.0
18      5151.0
35      7295.0
86      8189.0
160     7738.0
194    12940.0
30      6479.0
125    22018.0
Name: price, dtype: float64