In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.random import default_rng

### **Load dataset**

Link dataset: <a href='https://www.kaggle.com/datasets/hellbuoy/car-price-prediction'>Here</a>

In [33]:
data = pd.read_csv("CarPrice_Assignment.csv")
data.head(10)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0
5,6,2,audi fox,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250.0
6,7,1,audi 100ls,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710.0
7,8,1,audi 5000,gas,std,four,wagon,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,18920.0
8,9,1,audi 4000,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875.0
9,10,0,audi 5000s (diesel),gas,turbo,two,hatchback,4wd,front,99.5,...,131,mpfi,3.13,3.4,7.0,160,5500,16,22,17859.167


### **Preprocessing**

In this dataset, we see that the 'carID' and 'CarName' features are useless

There are 3 type of feature:
+ First is some features which need using replace()-method
+ Second is some features which need using oneHotEncoding
+ Third is some features which need normalizing

In [34]:
# All Columns we will convert it into one-hot-vector or just use replace() method
print(f"--Aspiration-column:        {len(data['aspiration'].unique())}")
print(f"--Doornumber-column:        {len(data['doornumber'].unique())}")
print(f"--CarBody-column:           {len(data['carbody'].unique())}")
print(f"--Drivewheel-column:        {len(data['drivewheel'].unique())}")
print(f"--Enginelocation-column:    {len(data['enginelocation'].unique())}")
print(f"--Enginetype-column:        {len(data['enginetype'].unique())}")
print(f"--Cylindernumber-column:    {len(data['cylindernumber'].unique())}")
print(f"--Fuelsystem-column:        {len(data['fuelsystem'].unique())}")

--Aspiration-column:        2
--Doornumber-column:        2
--CarBody-column:           5
--Drivewheel-column:        3
--Enginelocation-column:    2
--Enginetype-column:        7
--Cylindernumber-column:    7
--Fuelsystem-column:        8


In [35]:
listReplace = ['fueltype', 'aspiration', 'doornumber', 'enginelocation', 'cylindernumber']
listOneHot = ['carbody', 'drivewheel', 'enginetype', 'fuelsystem']
listNormalization = ['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'price']

In [36]:
# Drop useless columns
data = data.drop(columns=['car_ID', 'CarName'])
# Replace value of each item in listReplace
data['fueltype'] = data['fueltype'].replace('gas', 0).replace('diesel', 1)
data['aspiration'] = data['aspiration'].replace('std', 0).replace('turbo', 1)
data['doornumber'] = data['doornumber'].replace('four', 0).replace('two', 1)
data['enginelocation'] = data['enginelocation'].replace('front', 0).replace('rear', 1)
data['cylindernumber'] = data['cylindernumber'].replace('four', 4).replace('six', 6).replace('five', 5).replace('eight', 8).replace('two', 2).replace('three', 3).replace('twelve', 12)
# Use one-hot-encoding for all item in listOneHot
for item in listOneHot:
    data = pd.concat([
                     data,
                     pd.get_dummies(data[item], prefix=item)
    ], axis=1).drop(columns=[item])
# Normalize all item in listNormalization
for item in listNormalization:
    data[item] = ( data[item] - np.average(data[item]) ) / (np.max(data[item]) - np.min(data[item]))

In [39]:
# After dropping: 26-2=24
# After replacing: 24
# After oneHotEncoding: (24-4) [drop] + 5 + 3 + 7 + 8 = 43
# After normalizing: 43
# Note: There is just one column having no preprocess on, that is 'symboling' column
print(f"Shape of dataset after preprocessing: {data.shape}")

Shape of dataset after preprocessing: (205, 43)


### **Train-test split**

In [40]:
X = data.drop(columns=['price'])
y = data['price']
# Get random rows (80% for training, 20% for testing)
rng = default_rng()
train_index = rng.choice(data.shape[0], size=int((data.shape[0])*0.8), replace=False)
test_index = np.array([i for i in range(data.shape[0]) if i not in train_index])
# Get train-test based on train_index, test_index
X_train, X_test = X.copy().loc[train_index], X.copy().loc[test_index]
y_train, y_test = y.copy().loc[train_index], y.copy().loc[test_index]
X_train, X_test, y_train, y_test = X_train.values, X_test.values, y_train.values, y_test.values
y_train, y_test = y_train.reshape((y_train.shape[0], 1)), y_test.reshape((y_test.shape[0], 1))

### **Generate parameters and hyperparameters**

In [57]:
# Hyperparameters
alpha = 7e-3                    # Learning rate
epsilon = 1e-3                  # Threshold for stopping training
iter, printIter = 0, 10000      # Print information after printIter iter
# Parameters
theta = np.array([ [1] for i in range(X_train.shape[1]) ])
bias = 1
der_bias, der_theta = 1, [1 for i in range(X_train.shape[1])]

### **Training**

In [58]:
while True:
    # Calculate loss using Mean Square Error loss function
    Loss = np.mean((np.dot(X_train, theta) + bias) ** 2)
    # Derivative
    der_bias = np.mean(np.subtract(np.dot(X_train, theta) + bias, y_train))
    der_theta = (1/X_train.shape[0]) * np.dot( X_train.T, np.subtract(np.dot(X_train, theta) + bias, y_train) )
    # Update parameters
    bias = np.subtract(bias, alpha * der_bias)
    theta = np.subtract(theta, alpha * der_theta)
    # Print loss and sum([der_bias, der_theta])
    if iter%printIter==0:
        print(f"----Iter {iter}:")
        print(f"    Loss: {Loss}")
        print(f"    Sum der: {abs(sum([der_bias, sum(der_theta)]))}")
        # print(f"    der_bias: {der_bias}")
        # print(f"    der_theta: {der_theta}")
    iter += 1
    if abs(sum([der_bias, sum(der_theta)])) < epsilon:
        break

----Iter 0:
    Loss: 123.71464627969236
    Sum der: [123.52948646]
----Iter 10000:
    Loss: 0.06777487725564646
    Sum der: [0.06840639]
----Iter 20000:
    Loss: 0.04659134473814343
    Sum der: [0.03575608]
----Iter 30000:
    Loss: 0.039634507668454125
    Sum der: [0.02197661]
----Iter 40000:
    Loss: 0.036447469491219756
    Sum der: [0.01466811]
----Iter 50000:
    Loss: 0.03471073285104753
    Sum der: [0.01040294]
----Iter 60000:
    Loss: 0.03368244732409933
    Sum der: [0.00772875]
----Iter 70000:
    Loss: 0.03304704819214307
    Sum der: [0.00594984]
----Iter 80000:
    Loss: 0.03264443798423035
    Sum der: [0.00470874]
----Iter 90000:
    Loss: 0.03238527902476233
    Sum der: [0.00380959]
----Iter 100000:
    Loss: 0.03221688926573722
    Sum der: [0.00313839]
----Iter 110000:
    Loss: 0.032107031697736925
    Sum der: [0.00262505]
----Iter 120000:
    Loss: 0.032035432270571026
    Sum der: [0.00222439]
----Iter 130000:
    Loss: 0.03198906873139564
    Sum der: 

In [59]:
y_true = y_test
y_pre = np.dot(X_test, theta) + bias
u = np.sum(np.subtract(y_true, y_pre)**2)
v = np.sum(np.subtract(y_true, np.mean(y_true)**2 ))
print(f"Loss (MSE) on test-set: {u}")
print(f"R2 score on test-set: {1 - (u/v)}")

Loss (MSE) on test-set: 0.3123957660447368
R2 score on test-set: 0.7473069814355771
