In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the automobile dataset into a DataFrame; index_col=False tells pandas
# not to use the first CSV column as the row index.
# NOTE(review): the absolute '/content/...' path looks Colab-specific — confirm
# before running this notebook elsewhere.
automobile_df = pd.read_csv('/content/automobile.csv', index_col = False)

# Exploratory Data Analysis

## Correlation matrix of all numeric columns of the dataset

In [None]:
# Pairwise correlations are only defined for numeric columns, so select them
# explicitly instead of relying on corr() to discard the string columns.
corr = automobile_df.select_dtypes(include='number').corr()

# Diverging palette so that negative and positive correlations get different hues.
cmap = sns.diverging_palette(5, 250, as_cmap=True)

# Colour each row by value and show two decimals. Styler.format is used because
# Styler.set_precision is no longer available in recent pandas releases.
# The parenthesised chain is the cell's last expression, so it is displayed.
(
    corr.style
    .background_gradient(cmap=cmap, axis=1)
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})
    .format('{:.2f}')
)

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
car_ID,1.0,-0.15,0.13,0.17,0.05,0.26,0.07,-0.03,0.26,-0.16,0.15,-0.02,-0.2,0.02,0.01,-0.11
symboling,-0.15,1.0,-0.53,-0.36,-0.23,-0.54,-0.23,-0.11,-0.13,-0.01,-0.18,0.07,0.27,-0.04,0.03,-0.08
wheelbase,0.13,-0.53,1.0,0.87,0.8,0.59,0.78,0.57,0.49,0.16,0.25,0.35,-0.36,-0.47,-0.54,0.58
carlength,0.17,-0.36,0.87,1.0,0.84,0.49,0.88,0.68,0.61,0.13,0.16,0.55,-0.29,-0.67,-0.7,0.68
carwidth,0.05,-0.23,0.8,0.84,1.0,0.28,0.87,0.74,0.56,0.18,0.18,0.64,-0.22,-0.64,-0.68,0.76
carheight,0.26,-0.54,0.59,0.49,0.28,1.0,0.3,0.07,0.17,-0.06,0.26,-0.11,-0.32,-0.05,-0.11,0.12
curbweight,0.07,-0.23,0.78,0.88,0.87,0.3,1.0,0.85,0.65,0.17,0.15,0.75,-0.27,-0.76,-0.8,0.84
enginesize,-0.03,-0.11,0.57,0.68,0.74,0.07,0.85,1.0,0.58,0.2,0.03,0.81,-0.24,-0.65,-0.68,0.87
boreratio,0.26,-0.13,0.49,0.61,0.56,0.17,0.65,0.58,1.0,-0.06,0.01,0.57,-0.25,-0.58,-0.59,0.55
stroke,-0.16,-0.01,0.16,0.13,0.18,-0.06,0.17,0.2,-0.06,1.0,0.19,0.08,-0.07,-0.04,-0.04,0.08


## Datatypes of each column

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
automobile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CompanyName       205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

# Data Preprocessing

## Checking for missing values

We first look at the total number of entries in the dataset, so that we can judge what fraction of the data is missing.

In [None]:
# Number of rows in the dataset. It is taken from the frame itself rather than
# from the 'car_ID' column, so this cell does not depend on a column that is
# removed further down in the notebook.
DATASET_SIZE = len(automobile_df)

print("Total number of entries in the data:", DATASET_SIZE)

Total number of entries in the data: 205


We then count the number of cells in each column that are missing.

In [None]:
# Count the missing (NaN) cells in every column.
automobile_df.isna().sum()

car_ID              0
symboling           0
CompanyName         0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

Hence, there are no missing values in any column of this dataset.

## Make spellings of categories uniform

In the CompanyName column, some companies appear under several different spellings. We normalise them with the following corrections (misspelling → canonical name):
- maxda → mazda
- Nissan → nissan
- porcshce → porsche
- toyouta → toyota
- vokswagen → volkswagen
- vw → volkswagen

In [None]:
# Count how many rows carry each manufacturer name.
automobile_df['CompanyName'].value_counts()

toyota         32
nissan         18
mazda          17
mitsubishi     13
honda          13
volkswagen     12
subaru         12
peugeot        11
volvo          11
dodge           9
bmw             8
buick           8
plymouth        7
audi            7
saab            6
porsche         5
isuzu           4
chevrolet       3
alfa-romero     3
jaguar          3
renault         2
mercury         1
Name: CompanyName, dtype: int64

In [None]:
# Misspelt manufacturer name -> canonical spelling.
correction = {
    'maxda': 'mazda',
    'Nissan': 'nissan',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
}

# Rewrite every cell that exactly matches one of the misspellings.
automobile_df['CompanyName'] = automobile_df['CompanyName'].replace(correction)

## Encoding categorical columns

As observed above, the following columns have non-numerical data:

- CompanyName
- fueltype
- aspiration
- doornumber
- carbody            
- drivewheel         
- enginelocation 
- enginetype          
- cylindernumber 
- fuelsystem  



### Label Encoding

Out of the above, the columns fueltype, aspiration, enginelocation have only two types of values that they can take. Hence, we could use label encoding for these columns.

In [None]:
def fit_labels(column):
  """Label-encode a pandas Series.

  Every distinct value is assigned an integer code in order of first
  appearance: 0 for the first distinct value, 1 for the second, and so on.

  Parameters
  ----------
  column : pandas.Series
      Column whose values should be encoded.

  Returns
  -------
  pandas.Series
      Series with the same index as ``column`` holding the integer codes.
  """
  # Build the complete value -> code table first and apply it in one pass.
  # Replacing one value at a time would re-map cells that were already
  # encoded whenever an original value coincides with an assigned code
  # (e.g. a column containing the numbers 1 and 0).
  codes = {value: label for label, value in enumerate(column.unique())}
  return column.map(codes)

In [None]:
# Columns that are label-encoded with fit_labels.
cols_to_label_encode = ['fueltype', 'aspiration', 'enginelocation']

# apply() hands each selected column to fit_labels in turn; the encoded
# columns then replace the originals.
automobile_df[cols_to_label_encode] = automobile_df[cols_to_label_encode].apply(fit_labels)

Further, we have columns where the data given are numbers in words. For example: doornumber and cylindernumber.

In [None]:
# Spelled-out number -> integer value.
words2num = {'two': 2, 'three': 3, 'four':4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}

cols_to_convert = ['doornumber', 'cylindernumber']

# Translate all words in a single pass and cast to int explicitly: relying on
# replace() to downcast an object column to integers is deprecated in recent
# pandas, and the explicit cast fails loudly if a word is missing from words2num.
automobile_df[cols_to_convert] = (
  automobile_df[cols_to_convert].replace(words2num).astype(int)
)


### One Hot Encoding

Now, the remaining columns have more than 2 categories and no categories where numbers are given in words. Hence, we use one-hot encoding for encoding categories given in these columns.

We therefore collect the categories that occur in each of the remaining columns and create one indicator column per category.

In [None]:
cols_to_one_hot = ['CompanyName', 'carbody', 'drivewheel', 'enginetype', 'fuelsystem']

for col_to_one_hot in cols_to_one_hot:
  # Categories of this column, in order of first appearance.
  col_types = automobile_df[col_to_one_hot].unique()

  # Add one 0/1 indicator column per category, named after the category.
  # The whole column is compared at once, which avoids the chained-indexing
  # assignment (df[col][i] = 1) that pandas warns may write to a copy.
  for col_type in col_types:
    automobile_df[col_type] = (automobile_df[col_to_one_hot] == col_type).astype(int)

  # The categorical column is fully represented by its indicators now.
  # `columns=` is used because the positional axis argument of drop() is not
  # accepted by recent pandas releases.
  automobile_df = automobile_df.drop(columns=col_to_one_hot)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [None]:
# Show the first rows of the frame after encoding.
automobile_df.head()

Unnamed: 0,car_ID,symboling,fueltype,aspiration,doornumber,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,cylindernumber,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,alfa-romero,audi,bmw,chevrolet,dodge,honda,isuzu,jaguar,mazda,buick,mercury,mitsubishi,nissan,peugeot,plymouth,porsche,renault,saab,subaru,toyota,volkswagen,volvo,convertible,hatchback,sedan,wagon,hardtop,rwd,fwd,4wd,dohc,ohcv,ohc,l,rotor,ohcf,dohcv,mpfi,2bbl,mfi,1bbl,spfi,4bbl,idi,spdi
0,1,3,0,0,2,0,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111,5000,21,27,13495.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,2,3,0,0,2,0,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111,5000,21,27,16500.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,3,1,0,0,2,0,94.5,171.2,65.5,52.4,2823,6,152,2.68,3.47,9.0,154,5000,19,26,16500.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,4,2,0,0,4,0,99.8,176.6,66.2,54.3,2337,4,109,3.19,3.4,10.0,102,5500,24,30,13950.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0
4,5,2,0,0,4,0,99.4,176.6,66.4,54.3,2824,5,136,3.19,3.4,8.0,115,5500,18,22,17450.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0


## Removing duplicates 

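Since the car ID column contains only unique values and its length equals the length of the dataset, no row is repeated in its entirety. We drop this identifier column and then remove any rows that are duplicates in the remaining columns.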

In [None]:
# Remove the identifier column, then discard rows that are exact duplicates
# of an earlier row in the remaining columns.
automobile_df = (
  automobile_df
  .drop(columns='car_ID')
  .drop_duplicates()
)

# Splitting into train and test sets

We shuffle the data set and split it in two unequal parts: train set and test set.

The smaller part of the dataset is the test set, while the larger is the train set.

In [None]:
import random

def train_test_split(train_percent, df, id_col, random_state=None):
  """Shuffle ``df`` and split it into a train part and a test part.

  Parameters
  ----------
  train_percent : float
      Fraction of the rows (between 0 and 1) that goes into the train set.
  df : pandas.DataFrame
      Data to split. It is not modified.
  id_col : str
      Name of a column of ``df``; its length is used as the row count.
  random_state : int, optional
      Seed forwarded to ``DataFrame.sample`` so the shuffle can be
      reproduced. The default ``None`` gives a different shuffle every call.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train_set, test_set)``; the train set holds the first
      ``int(train_percent * n_rows)`` shuffled rows, the test set the rest.

  Raises
  ------
  ValueError
      If ``train_percent`` is outside ``[0, 1]``.
  """
  # A fraction outside [0, 1] would make the two parts overlap instead of
  # partitioning the data, so reject it up front.
  if not 0 <= train_percent <= 1:
    raise ValueError("train_percent must be between 0 and 1")

  shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
  total_rows = len(shuffled[id_col])
  train_set_len = int(train_percent * total_rows)

  return shuffled.iloc[:train_set_len], shuffled.iloc[train_set_len:]

In [None]:
# 70 % of the shuffled rows form the train set, the remainder the test set.
train_set, test_set = train_test_split(0.7, automobile_df, 'price')

train_set_size, test_set_size = len(train_set), len(test_set)

print(f"Size of the train set {train_set_size}")
print(f"Size of the test set {test_set_size}")

Size of the train set 143
Size of the test set 62


In [None]:
# Separate the target ('price') from the features and convert both to NumPy.
train_y = train_set['price'].to_numpy()
train_x = train_set.drop(columns = 'price').to_numpy()
# Prepend a column of ones for the intercept term. np.insert broadcasts the
# scalar down the new column, so the number of rows is not hard-coded.
train_x = np.insert(train_x, 0, 1, axis=1)

In [None]:
# Separate the target ('price') from the features and convert both to NumPy.
test_y = test_set['price'].to_numpy()
test_x = test_set.drop(columns = 'price').to_numpy()
# Prepend a column of ones for the intercept term. np.insert broadcasts the
# scalar down the new column, so the number of rows is not hard-coded.
test_x = np.insert(test_x, 0, 1, axis=1)

## Learning Algorithm

We use the linear regression algorithm, wherein the weights are calculated by:

$$w = (X^T X)^{-1} X^T y$$

### Closed form solution

In [None]:
def transpose_product(x):
  """Return the product of the transpose of ``x`` with ``x`` itself."""
  x_transposed = x.T
  return x_transposed.dot(x)

In [None]:
def inverse(x):
  """Return the matrix inverse of ``x`` as computed by ``np.linalg.inv``."""
  inverted = np.linalg.inv(x)
  return inverted

In [None]:
def fit(x, y, lambda_coefficient):
  """Closed-form least-squares weights with a diagonal correction.

  Solves ``(x.T x + lambda_coefficient * I) w = x.T y`` for ``w``.

  Parameters
  ----------
  x : numpy.ndarray
      2-D design matrix with one row per sample and one column per feature.
  y : numpy.ndarray
      Target values, one per row of ``x``.
  lambda_coefficient : float
      Value added to the diagonal of ``x.T x`` before solving.

  Returns
  -------
  numpy.ndarray
      Weight vector with one entry per column of ``x``.
  """
  # Size the identity from the data instead of assuming a fixed column count.
  no_of_features = x.shape[1]
  gram = x.T.dot(x)
  # x.T x is positive semi-definite, so the correction is added (not
  # subtracted) to move its eigenvalues away from zero.
  regularised_gram = gram + lambda_coefficient * np.identity(no_of_features)
  # Solve the linear system directly rather than forming an explicit inverse.
  return np.linalg.solve(regularised_gram, x.T.dot(y))

Since the matrix $X^T X$ is not invertible, we add a very small lambda correction to it so that it becomes invertible.

In [None]:
# Closed-form weights fitted on the train set; the last argument is the small
# lambda correction passed to fit().
weights = fit(train_x, train_y, 0.000001)

# Inference Algorithm

In [None]:
def predict(x, weights):
  """Return the linear-model output ``x . weights``.

  ``x`` may be a single feature vector or a matrix with one sample per row;
  the result is whatever ``x.dot(weights)`` yields for that input.
  """
  prediction = x.dot(weights)
  return prediction

# Evaluation Metrics

We use mean squared error in order to evaluate the model.

In [None]:
def mean_squared_error(x, w, y, n=None):
  """Mean squared error of the linear model ``x . w`` against ``y``.

  Parameters
  ----------
  x : array-like
      2-D feature matrix with one row per sample.
  w : array-like
      Weight vector with one entry per column of ``x``.
  y : array-like
      True target values, one per row of ``x``.
  n : int, optional
      Divisor of the summed squared errors. Defaults to the number of
      samples in ``y``, which gives the mean over the evaluated set.

  Returns
  -------
  float
      Sum of squared prediction errors divided by ``n``.
  """
  x = np.asarray(x)
  y = np.asarray(y)
  if n is None:
    n = len(y)
  # Same linear model as predict(), evaluated for all rows in one product.
  residuals = x.dot(w) - y
  return np.sum(residuals ** 2) / n

In [None]:
# Average over the number of test samples rather than the size of the whole
# dataset, so the value is a true mean over the evaluated set.
test_error_closed_form = mean_squared_error(test_x, weights, test_y, len(test_y))

In [None]:
# Average over the number of train samples rather than the size of the whole
# dataset, so the value is a true mean over the evaluated set.
train_error_closed_form = mean_squared_error(train_x, weights, train_y, len(train_y))

In [None]:
# Report the closed-form errors on both sets.
print(f"Error in the test set: {test_error_closed_form}")
print(f"Error in the train set: {train_error_closed_form}")

Error in the test set: 4403242.3075199
Error in the train set: 1535627.6164094873


### Gradient Descent Solution

In [None]:
def gd_predict(x, w, ind, no_of_features):
  """Return the linear prediction for one sample.

  Adds up ``w[i] * x[i]`` for the first ``no_of_features`` positions.
  ``ind`` is accepted but not used by the computation.
  """
  prediction = 0
  for feature_no in range(no_of_features):
    prediction += w[feature_no] * x[feature_no]
  return prediction

In [None]:
def derivative_wrt_weight(x, weight, ind, y, set_size, no_of_features):
  """Partial derivative of the summed squared error w.r.t. weight ``ind``.

  For each of the first ``set_size`` samples the residual
  ``y[i] - gd_predict(x[i], ...)`` is multiplied by feature ``ind`` of that
  sample; the sum of these products is scaled by -2.
  """
  weighted_residuals = 0
  for sample_no in range(set_size):
    residual = y[sample_no] - gd_predict(x[sample_no], weight, sample_no, no_of_features)
    weighted_residuals += x[sample_no][ind] * residual
  return (-2) * weighted_residuals

In [None]:
def derivative_wrt_wt_vector(x, w_old, y, set_size, no_of_features):
  """Return the gradient as a NumPy array with one partial derivative per weight."""
  partials = []
  for weight_no in range(no_of_features):
    partials.append(
        derivative_wrt_weight(x, w_old, weight_no, y, set_size, no_of_features))
  return np.asarray(partials)

In [None]:
def gd_fit(epochs, learning_rate, x, y, set_size, no_of_features):
  """Fit linear-regression weights with batch gradient descent.

  Parameters
  ----------
  epochs : int
      Number of full-batch update steps.
  learning_rate : float
      Step size applied to the gradient in every update.
  x : array-like
      Feature matrix, one row per sample.
  y : array-like
      Target values, one per row of ``x``.
  set_size : int
      Number of samples used to compute the gradient.
  no_of_features : int
      Number of weights to fit.

  Returns
  -------
  numpy.ndarray
      Weight vector after ``epochs`` updates, starting from all zeros.
  """
  # One weight per feature, matching the length of the gradient vector.
  w_old = np.zeros(no_of_features)

  for epoch_no in range(epochs):
    derivative_vector = derivative_wrt_wt_vector(x, w_old, y, set_size, no_of_features)
    # Step against the gradient: moving along it would increase the error.
    w_old = w_old - learning_rate * derivative_vector

  return w_old
    

In [None]:
# Hyper-parameters for batch gradient descent.
# NOTE(review): with a step size this small and only two epochs the weights
# presumably stay very close to their zero initialisation — confirm intended.
LEARNING_RATE = 0.0000000000000000000000000001
EPOCHS = 2

# Number of rows and number of columns of the training design matrix.
set_size, no_of_features = train_x.shape

# Weights fitted on the train set by gradient descent.
gradient_desc_wts = gd_fit(EPOCHS, LEARNING_RATE, train_x, train_y, set_size, no_of_features)

In [None]:
# Each error is averaged over the size of the set it is computed on, not over
# the size of the whole dataset.
test_error_gd = mean_squared_error(test_x, gradient_desc_wts, test_y, len(test_y))
train_error_gd = mean_squared_error(train_x, gradient_desc_wts, train_y, len(train_y))

In [None]:
# Report the gradient-descent errors on both sets.
print(f"Error in the test set: {test_error_gd}")
print(f"Error in the train set: {train_error_gd}")

Error in the test set: 79840066.60243903
Error in the train set: 159941412.8887019


### Newton's Method Solution

In [None]:
def double_derivative(x, w, k, j, set_size):
  """Second partial derivative of the summed squared error w.r.t. weights k and j.

  Adds up ``x[i][j] * x[i][k] * 2`` over the first ``set_size`` samples.
  ``w`` is accepted but not used by the computation.
  """
  total = 0
  for sample_no in range(set_size):
    row = x[sample_no]
    total += row[j] * row[k] * 2
  return total

In [None]:
def compute_hessian(x, w):
  """Hessian of the summed squared error of a linear model.

  Entry ``[i][j]`` is ``2 * sum_n x[n][i] * x[n][j]`` taken over all rows of
  ``x``, for the first ``len(w)`` columns.

  Parameters
  ----------
  x : array-like
      2-D feature matrix, one row per sample.
  w : sequence
      Weight vector; only its length is used.

  Returns
  -------
  list of list
      Square matrix with ``len(w)`` rows and columns.
  """
  # The row count comes from `x` itself, so the function does not depend on
  # a module-level `set_size` having been defined by an earlier cell.
  design = np.asarray(x)[:, :len(w)]
  # 2 * X^T X, computed in one matrix product instead of element by element.
  return (2 * design.T.dot(design)).tolist()

In [None]:
def fit_newtons_method(x, y, epochs, lambda_correction, set_size, no_of_features):
  """Fit linear-regression weights with Newton's method.

  Each iteration updates ``w <- w - H^{-1} g``, where ``g`` is the gradient
  from ``derivative_wrt_wt_vector`` and ``H`` is the Hessian from
  ``compute_hessian`` plus ``lambda_correction`` on its diagonal.

  Parameters
  ----------
  x : array-like
      Feature matrix, one row per sample.
  y : array-like
      Target values, one per row of ``x``.
  epochs : int
      Number of Newton iterations.
  lambda_correction : float
      Value added to the diagonal of the Hessian before solving.
  set_size : int
      Number of samples used to compute the gradient.
  no_of_features : int
      Number of weights to fit.

  Returns
  -------
  numpy.ndarray
      Weight vector after ``epochs`` iterations, starting from all zeros.
  """
  w = np.zeros(no_of_features)

  # compute_hessian uses only the length of `w`, not its values, so the
  # matrix is the same in every iteration and is built once. The correction
  # is added (not subtracted) to move the eigenvalues away from zero.
  regularised_hessian = (np.asarray(compute_hessian(x, w))
                         + lambda_correction * np.identity(no_of_features))

  for epoch in range(epochs):
    gradient = derivative_wrt_wt_vector(x, w, y, set_size, no_of_features)
    # Solve H * step = gradient instead of forming an explicit inverse.
    w = w - np.linalg.solve(regularised_hessian, gradient)

  return w

In [None]:
# Weights fitted on the train set with three Newton iterations and a small
# lambda correction.
newton_mtd_wts = fit_newtons_method(train_x, train_y, 3, 0.00001, set_size, no_of_features)

In [None]:
# Each error is averaged over the size of the set it is computed on, not over
# the size of the whole dataset.
test_error_newtons_mtd = mean_squared_error(test_x, newton_mtd_wts, test_y, len(test_y))
train_error_newtons_mtd = mean_squared_error(train_x, newton_mtd_wts, train_y, len(train_y))

In [None]:
# Report the Newton's-method errors on both sets.
print(f"Error in the test set: {test_error_newtons_mtd}")
print(f"Error in the train set: {train_error_newtons_mtd}")

Error in the test set: 4403476.468809987
Error in the train set: 1535627.6161937353
