In [1]:
"""
Importing necessary libraries 
"""
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

In [2]:
"""
Checking the installed NumPy version (shown as the cell output)
"""
np.__version__

'1.20.3'

In [3]:
"""
Reading the car price data from a CSV file into a pandas DataFrame.
The path is relative, so the file is looked up in the current working directory.
"""
data = pd.read_csv('car price.csv')

In [4]:
"""
Checking the shape of the data as a (rows, columns) tuple
"""
data.shape

(11914, 16)

The data has 11914 rows and 16 columns

In [5]:
"""
Checking the first 5 rows of the data (head() returns 5 rows by default)
"""
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [6]:
"""
Normalizing column names: spaces become underscores and all letters are lowercased
"""
data.columns = data.columns.str.replace(' ', '_').str.lower()
data.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [7]:
"""
Counting the rows for every (make, popularity) pair and showing the 3 most frequent pairs
"""
data.value_counts(subset=['make', 'popularity']).head(3)

make        popularity
Chevrolet   1385          1123
Ford        5657           881
Volkswagen  873            809
dtype: int64

The 3 most frequent car manufacturers in the dataset are Chevrolet, Ford and Volkswagen

In [8]:
"""
Counting the distinct model names among the rows whose make is Audi
"""
data.loc[data['make'] == 'Audi', 'model'].nunique()

34

The number of unique Audi models in the data set is 34

In [9]:
def sum_missing_values(data):
    """
    Find the columns of a dataframe that contain missing values.

    :param data: pandas dataframe to inspect
    :return: a tuple ``(missing_columns, num_of_missing_columns)`` where
        ``missing_columns`` is a list of ``[column, number of missing values]``
        pairs, in column order, for every column with at least one missing
        value, and ``num_of_missing_columns`` is the length of that list
    :rtype: tuple
    """
    # Count the missing values of every column once, instead of computing
    # isna().sum() for the filter and then again for the stored value.
    missing_counts = data.isna().sum()
    missing_columns = [
        [column, count]
        for column, count in zip(missing_counts.index, missing_counts.to_numpy())
        if count
    ]
    return missing_columns, len(missing_columns)

In [10]:
# Unpack the [column, missing count] pairs and the number of columns that have
# missing values, then display only that number.
missing_columns, total_missing_columns =  sum_missing_values(data)
total_missing_columns

5

The number of columns with missing values = 5

In [11]:
"""
Median of the engine_cylinders column before and after filling missing values with 0
"""

# Median of the column as loaded (pandas skips missing values by default)
median_engine_cylinders = data['engine_cylinders'].median()
# Most frequent value(s); mode() returns a Series. Not used again in this cell.
most_freq_engine_cylinders = data['engine_cylinders'].mode()
# Printed explicitly because only the last expression of a cell is displayed
print(median_engine_cylinders)
# fillna returns a new Series; the column inside `data` is left unchanged
filled_engine_cylinders = data['engine_cylinders'].fillna(0)
# Median after filling missing values with 0 (the cell output)
filled_engine_cylinders.median()

6.0


6.0

The median of the Engine Cylinders column does not change after filling missing values with zero

In [12]:
"""
Performing Linear Regression (normal equation, no intercept column) on a subset of the data
"""

#Creating a subset of the data: only the rows whose make is 'Lotus'
Lotus_cars = data[data.make=='Lotus']
#Keeping the two feature columns and dropping repeated (engine_hp, engine_cylinders) pairs
Lotus_clean = Lotus_cars[['engine_hp', 'engine_cylinders']].drop_duplicates()

#Extracting an array as our feature matrix (one row per observation, one column per feature)
X = Lotus_clean.values

#Calculating the gram matrix X^T X
XTX = X.T.dot(X)
#Inverting the gram matrix
#NOTE(review): np.linalg.inv raises LinAlgError when XTX is singular; np.linalg.solve would avoid the explicit inverse
XTX_inv = np.linalg.inv(XTX)
#The given targets for each observation; the 9 values must line up with the rows of X
y = [1100, 800, 750, 850, 1300, 1000, 1000, 1300, 800]
#Calculating the weights for each feature: w = (X^T X)^-1 X^T y
w = XTX_inv.dot(X.T).dot(y)
#Displaying the weight of the first feature (engine_hp, the first selected column)
w[0]

4.594944810094579