In [2]:
"""A simple snippet to demo Linear Regression on car price. 
"""

'A simple snippet to demo Linear Regression on car price. \n'

# Import libs

In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression

# Step 1: Load data into data frames

In [4]:
# Location of the training CSV. Defaults to the author's local copy; set the
# CAR_PRICE_CSV environment variable to point at the file on another machine
# instead of editing this cell.
CAR_PRICE_CSV = os.environ.get(
    "CAR_PRICE_CSV",
    r"F:\Data Science Grand Track\Machine Learning\datasets\car_price\train.csv",
)

# Load data from csv:
car_price_df = pd.read_csv(CAR_PRICE_CSV)

In [5]:
# Rich-render the loaded frame to eyeball the columns and raw values.
# The rendered output is a truncated preview (head and tail rows), not the
# full table.
display(car_price_df)

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,45798355,8467,-,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0 Turbo,300000 km,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5
19233,45778856,15681,831,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,161600 km,4.0,Tiptronic,Front,04-May,Left wheel,Red,8
19234,45804997,26108,836,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2,116365 km,4.0,Automatic,Front,04-May,Left wheel,Grey,4
19235,45793526,5331,1288,CHEVROLET,Captiva,2007,Jeep,Yes,Diesel,2,51258 km,4.0,Automatic,Front,04-May,Left wheel,Black,4


# Step 2. Basic EDA

## Min, max on numeric columns

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# Only ID, Price, Prod. year, Cylinders and Airbags appear in the output:
# Levy, Engine volume and Mileage are missing, presumably because values such
# as "-", "2.0 Turbo" and "186005 km" make pandas read them as text.
# NOTE(review): Price ranges from 1 to 26,307,500 -- likely outliers worth
# inspecting before modelling.
car_price_df.describe()

Unnamed: 0,ID,Price,Prod. year,Cylinders,Airbags
count,19237.0,19237.0,19237.0,19237.0,19237.0
mean,45576540.0,18555.93,2010.912824,4.582991,6.582627
std,936591.4,190581.3,5.668673,1.199933,4.320168
min,20746880.0,1.0,1939.0,1.0,0.0
25%,45698370.0,5331.0,2009.0,4.0,4.0
50%,45772310.0,13172.0,2012.0,4.0,6.0
75%,45802040.0,22075.0,2015.0,4.0,12.0
max,45816650.0,26307500.0,2020.0,16.0,16.0


## Create an age column from the production year

### Get current year

In [7]:
from datetime import datetime

# Wall-clock timestamp of this run. It no longer feeds the Age feature (see
# below); the name is kept in case other cells reference it.
current_datetime = datetime.now()

# Reference year used to turn 'Prod. year' into an age.
# Pinned instead of read from the clock: Age = current_year - 'Prod. year',
# so a clock-derived year shifts every age (and with it the fitted intercept)
# each calendar year the notebook is re-run, silently invalidating the
# printed results. 2025 reproduces the ages displayed in this notebook
# (a car produced in 2010 shows Age 15).
REFERENCE_YEAR = 2025
current_year = REFERENCE_YEAR

### Create an age column

In [8]:
# Age in years: `current_year` minus each car's production year.
production_year = car_price_df['Prod. year']
car_price_df['Age'] = current_year - production_year

In [9]:
# Sanity-check the derived column: one integer age per row of the frame.
display(car_price_df['Age'])

0        15
1        14
2        19
3        14
4        11
         ..
19232    26
19233    14
19234    15
19235    18
19236    13
Name: Age, Length: 19237, dtype: int64

## Create ML-only data frame

In [10]:
# Keep only the target (Price) plus the numeric columns used for modelling.
ml_columns = ['Price', 'Age', 'Cylinders', 'Airbags']
car_price_numeric_df = car_price_df[ml_columns]

In [11]:
# Preview the reduced frame (Price, Age, Cylinders, Airbags).
display(car_price_numeric_df)

Unnamed: 0,Price,Age,Cylinders,Airbags
0,13328,15,6.0,12
1,16621,14,6.0,8
2,8467,19,4.0,2
3,3607,14,4.0,0
4,11726,11,4.0,4
...,...,...,...,...
19232,8467,26,4.0,5
19233,15681,14,4.0,8
19234,26108,15,4.0,4
19235,5331,18,4.0,4


# Step 3: Linear Regression Demo

## 3.1: Train a model

### Set X and y

In [12]:
# Columns fed to the model as inputs, and the column it should predict.
feature_columns = ['Age', 'Cylinders', 'Airbags']
target_column = 'Price'

# Feature matrix X (model input):
X = car_price_df[feature_columns]

# Target y (model output). Selected with a list so it stays a one-column
# DataFrame rather than a Series.
y = car_price_df[[target_column]]


### Actually train the model

In [13]:
# Build an ordinary least-squares regressor with default settings.
price_predictor = LinearRegression()

# Estimate the intercept and coefficients from the features X and target y.
price_predictor.fit(X=X, y=y)

### Print intercept and coefficients:

In [14]:
# Intercept (b0) of the fitted model:
print(f"Intercept b0: {price_predictor.intercept_!s}")

# Feature weights (coefficients), one per column of X:
print(f"Feature weights b1->bn: {price_predictor.coef_!s}")

Intercept b0: [23846.53058974]
Feature weights b1->bn: [[-633.93547925 2033.30886509 -862.70710913]]


### Print rough formula:
$$ \widehat{\text{Price}} = 23846.53 - 633.94 \cdot \text{Age} + 2033.31 \cdot \text{Cylinders} - 862.71 \cdot \text{Airbags} $$

## Evaluate the fit (R² on the training data):

In [15]:
from sklearn.metrics import r2_score

# Predict on the same rows the model was fitted on, so the score below is an
# in-sample (training) R², not a held-out test score.
y_pred = price_predictor.predict(X)

training_r2 = r2_score(y, y_pred)
print("R² Score:", training_r2)


R² Score: 0.0005918225584964354


### Scatter plot

In [28]:
# Scatter of Price (x-axis) against Age (y-axis) with a LOWESS trendline.
age_vs_price = px.scatter(
    data_frame=car_price_df,
    x='Price',
    y='Age',
    trendline="lowess",
)
age_vs_price.show()