# Task: Car Price Prediction

## Import libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data Set Information:

This data set consists of three types of entities: (a) the specification of an auto in terms of various characteristics, (b) its assigned insurance risk rating, (c) its normalized losses in use as compared to other cars. The second rating corresponds to the degree to which the auto is more risky than its price indicates. Cars are initially assigned a risk factor symbol associated with their price. Then, if a car is more risky (or less), this symbol is adjusted by moving it up (or down) the scale. Actuaries call this process "symboling". A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.

The third factor is the relative average loss payment per insured vehicle year. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/speciality, etc...), and represents the average loss per car per year.

Note: Several of the attributes in the database could be used as a "class" attribute.

## Data Dictionary

1. symboling: -3, -2, -1, 0, 1, 2, 3.
2. normalized-losses: continuous from 65 to 256.
3. make:
alfa-romero, audi, bmw, chevrolet, dodge, honda,
isuzu, jaguar, mazda, mercedes-benz, mercury,
mitsubishi, nissan, peugot, plymouth, porsche,
renault, saab, subaru, toyota, volkswagen, volvo

4. fuel-type: diesel, gas.
5. aspiration: std, turbo.
6. num-of-doors: four, two.
7. body-style: hardtop, wagon, sedan, hatchback, convertible.
8. drive-wheels: 4wd, fwd, rwd.
9. engine-location: front, rear.
10. wheel-base: continuous from 86.6 to 120.9.
11. length: continuous from 141.1 to 208.1.
12. width: continuous from 60.3 to 72.3.
13. height: continuous from 47.8 to 59.8.
14. curb-weight: continuous from 1488 to 4066.
15. engine-type: dohc, dohcv, l, ohc, ohcf, ohcv, rotor.
16. num-of-cylinders: eight, five, four, six, three, twelve, two.
17. engine-size: continuous from 61 to 326.
18. fuel-system: 1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.
19. bore: continuous from 2.54 to 3.94.
20. stroke: continuous from 2.07 to 4.17.
21. compression-ratio: continuous from 7 to 23.
22. horsepower: continuous from 48 to 288.
23. peak-rpm: continuous from 4150 to 6600.
24. city-mpg: continuous from 13 to 49.
25. highway-mpg: continuous from 16 to 54.
26. price: continuous from 5118 to 45400.		

**Source:** https://archive.ics.uci.edu/ml/datasets/Automobile


## Read data

In [None]:
# Load the car price table; the path is relative to the current working directory.
df = pd.read_csv('CarPrice.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

## Data Preprocessing

In [None]:
# Number of distinct values in the 'CarName' column.
df['CarName'].nunique()

## Label encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

### encode the categorical features

In [None]:
# Names of all columns stored with the generic 'object' dtype; these are the
# columns that get label-encoded.
cat_features = [name for name, dtype in df.dtypes.items() if dtype == 'object']
cat_features

In [None]:
# A single encoder instance is re-fitted on every categorical column in turn,
# so after the loop it only remembers the classes of the last column.
encoder = LabelEncoder()

for column in cat_features:
    encoded_values = encoder.fit_transform(df[column])
    df[column] = encoded_values  # overwrite the column with its integer codes

In [None]:
# Number of distinct values in 'CarName' now that the column holds integer codes.
df['CarName'].nunique()

## Get the x and y data

In [None]:
# Features: every column except the first and the last, with 'CarName' removed.
# NOTE(review): presumably the first column is a row id and the last is 'price' - confirm against the CSV.
x = df.iloc[:, 1:-1].drop('CarName', axis=1)
# Target: the 'price' column, kept as a one-column DataFrame.
y = pd.DataFrame(df['price'])

In [None]:
# Preview the first rows of the feature table.
x.head()

## Scaling

### Standard scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance used below to standardise the feature columns.
sc = StandardScaler()

In [None]:
# Fit the scaler on the full feature table and transform it in one step.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so statistics of the test rows leak into the training features.
x_scaled = sc.fit_transform(x)

In [None]:
# Wrap the scaled values in a DataFrame for display; the result is not stored.
pd.DataFrame(x_scaled)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    y,
    test_size=0.2,
    random_state=0,
)

## size/shape of dataframe

In [None]:
# Rows and columns of the training feature matrix; n_features sizes the
# input layer of the networks built below.
n_samples, n_features = x_train.shape[0], x_train.shape[1]
print(f'n_samples: {n_samples}, n_features: {n_features}')

## Make the NN using Sequential API

In [None]:
# Importing the Keras libraries and packages
from keras.layers import Dense
from keras.models import Sequential

In [None]:
# Define a small fully connected regression network with the Sequential API.
model = Sequential()

# Input layer AND first hidden layer in one call: input_dim tells Keras how
# many feature columns each sample has (n_features, computed from x_train).
model.add(Dense(units=16, activation='relu', input_dim=n_features))

# Second hidden layer
model.add(Dense(units=8, activation='relu'))

# Output layer: a single unit; ReLU keeps the predicted value non-negative.
model.add(Dense(units=1, activation='relu'))

## Compiling the ANN

### Custom loss function

In [None]:
import tensorflow.keras.backend as K
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared difference of the inputs.

    Written with Keras backend ops (``K``) so that it can be passed as the
    ``loss`` argument of ``model.compile``.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of model predictions.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

In [None]:
# `tf` is referenced below, so import it in this cell to make sure it is
# defined when the cell runs.
import tensorflow as tf

# Train with Adam on the custom RMSE loss and additionally track Keras'
# built-in RMSE metric (read back later under 'root_mean_squared_error').
model.compile(optimizer="adam", loss=root_mean_squared_error,
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

## Fitting the ANN to the Training set

In [None]:
from math import sqrt

# Train for 100 epochs in mini-batches of 10 samples; 20% of the training
# data is set aside for validation. The returned object records the per-epoch
# loss and metric values.
history = model.fit(
    x_train,
    y_train,
    batch_size=10,
    epochs=100,
    validation_split=0.2,
)

### list all the data in history

In [None]:
# List the names of the series recorded in the training history
# (these are the keys used by the plots below).
print(history.history.keys())

### Plot the metrics for both train and validation set

In [None]:
# Plot the RMSE metric per epoch for both the train and the validation set
plt.subplots()  # open a new plot
for series_name in ('root_mean_squared_error', 'val_root_mean_squared_error'):
    plt.plot(history.history[series_name])
plt.title('Model RMSE')
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.legend(['Train', 'Validation'])
plt.show()

### Plot the loss for both train and validation set

In [None]:
# Plot the training loss per epoch for both the train and the validation set
plt.subplots()  # open a new plot
for series_name in ('loss', 'val_loss'):
    plt.plot(history.history[series_name])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'])
plt.show()

## Let's try more complex model

In [None]:
# `tf` is referenced in compile() below, so import it in this cell to make
# sure it is defined when the cell runs.
import tensorflow as tf

# Deeper network: six ReLU hidden layers narrowing from 128 to 8 units,
# followed by a single linear output unit.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(n_features,)))  # input + first hidden layer

model.add(Dense(64, activation='relu'))

model.add(Dense(32, activation='relu'))

model.add(Dense(32, activation='relu'))

model.add(Dense(8, activation='relu'))

model.add(Dense(8, activation='relu'))

model.add(Dense(1))  # no activation: linear output
# Same loss / metric pair as the first model so the curves are comparable.
model.compile(optimizer='adam',
              loss=root_mean_squared_error,
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Train for 100 epochs in mini-batches of 64 samples; 10% of the training
# data is set aside for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.1,
)

In [None]:
# Names of the series recorded in the training history.
history.history.keys()

### Plot the metrics for both train and validation set

In [None]:
# RMSE metric per epoch, train vs. validation
plt.subplots()
for series_name in ('root_mean_squared_error', 'val_root_mean_squared_error'):
    plt.plot(history.history[series_name])
plt.title('Model RMSE')
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.legend(['Train', 'Validation'])
plt.show()


### Plot the loss for both train and validation set

In [None]:
# Training loss per epoch, train vs. validation
plt.subplots()  # open a new plot
for series_name in ('loss', 'val_loss'):
    plt.plot(history.history[series_name])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'])
plt.show()

## Reduce overfitting

### Add L2 regularization

In [None]:
# `tf` is referenced in compile() below, so import it in this cell to make
# sure it is defined when the cell runs.
import tensorflow as tf

# Same architecture as the deeper model, with an L2 weight penalty on the
# kernels of the four widest layers; the two 8-unit layers and the output
# layer are left unregularised.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(n_features,), kernel_regularizer='l2'))

model.add(Dense(64, activation='relu', kernel_regularizer='l2'))

model.add(Dense(32, activation='relu', kernel_regularizer='l2'))

model.add(Dense(32, activation='relu', kernel_regularizer='l2'))

model.add(Dense(8, activation='relu'))

model.add(Dense(8, activation='relu'))

model.add(Dense(1))  # no activation: linear output
# The reported 'loss' includes the regularisation penalty, while the RMSE
# metric does not, so the two curves are expected to differ.
model.compile(optimizer='adam',
              loss=root_mean_squared_error,
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [None]:
# Train the L2-regularised model: 100 epochs, mini-batches of 64 samples,
# 10% of the training data set aside for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.1,
)

In [None]:
# Plot the RMSE metric per epoch for both the train and the validation set
plt.subplots()  # open a new plot
plt.plot(history.history['root_mean_squared_error'])
plt.plot(history.history['val_root_mean_squared_error'])
plt.title('model metrics')
plt.ylabel('RMSE')  # the plotted series is RMSE, not an accuracy
plt.xlabel('epoch')
plt.legend(['train', 'validation'])
plt.show()

In [None]:
# Training loss per epoch, train vs. validation
plt.subplots()  # open a new plot
for series_name in ('loss', 'val_loss'):
    plt.plot(history.history[series_name])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'])
plt.show()

### Add dropout layers

In [None]:
from keras.layers import Dropout

In [None]:
# Same architecture as the deeper model, with a Dropout(0.4) layer after each
# of the four widest hidden layers; the two 8-unit layers have no dropout.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.4))  # Dropout Layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))  # Dropout Layer
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.4))  # Dropout Layer
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.4))  # Dropout Layer
model.add(Dense(8, activation='relu'))

model.add(Dense(8, activation='relu'))

model.add(Dense(1))  # no activation: linear output
# This variant tracks mean absolute error ('mae') instead of the RMSE metric.
model.compile(optimizer='adam',
              loss=root_mean_squared_error,
              metrics=['mae'])

In [None]:
# Train the dropout model: 100 epochs, mini-batches of 128 samples,
# 10% of the training data set aside for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=100,
    validation_split=0.1,
)

In [None]:
# Plot the MAE metric per epoch for both the train and the validation set
plt.subplots()  # open a new plot
plt.plot(history.history['mae'])
plt.plot(history.history['val_mae'])
plt.title('model metrics')
plt.ylabel('MAE')  # the plotted series is mean absolute error, not an accuracy
plt.xlabel('epoch')
plt.legend(['train', 'validation'])
plt.show()

In [None]:
# Training loss per epoch, train vs. validation
plt.subplots()  # open a new plot
for series_name in ('loss', 'val_loss'):
    plt.plot(history.history[series_name])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'])
plt.show()

## Make the NN using Functional API

In [None]:
# Multilayer Perceptron
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
import tensorflow as tf

# Functional API: each layer object is called on the tensor produced by the
# previous one, and Model ties the input tensor to the output tensor.
visible = Input(shape=(n_features,))
hidden1 = Dense(128, activation='relu')(visible)
hidden2 = Dense(64, activation='relu')(hidden1)
hidden3 = Dense(32, activation='relu')(hidden2)
hidden4 = Dense(32, activation='relu')(hidden3)
# Linear single-unit output, as in the Sequential models above: a sigmoid is
# bounded to (0, 1) and could never reach the price values used as the target
# in this notebook.
output = Dense(1)(hidden4)
model = Model(inputs=visible, outputs=output)
# summarize layers (summary() prints the table itself and returns None, so
# wrapping it in print() would only add a stray "None" line)
model.summary()
# plot graph