# Import required libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import absolute
from numpy import mean
from numpy import std
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Read the dataset

In [2]:
# Load the raw car age/price table from the CSV next to the notebook.
df = pd.read_csv('car_age_price.csv')

In [3]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Year,Price
0,2018,465000
1,2019,755000
2,2019,700000
3,2018,465000
4,2018,465000


# Checking for null values

In [4]:
# Print column dtypes and non-null counts; used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Year    112 non-null    int64
 1   Price   112 non-null    int64
dtypes: int64(2)
memory usage: 1.9 KB


No null values are present: both columns report 112 non-null entries out of 112 rows.

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for Year and Price.
df.describe()

Unnamed: 0,Year,Price
count,112.0,112.0
mean,2016.669643,483866.044643
std,1.629616,91217.450533
min,2013.0,300000.0
25%,2015.0,423750.0
50%,2017.0,500000.0
75%,2017.0,550000.0
max,2020.0,755000.0


In [6]:
# (rows, columns) of the raw dataset.
df.shape

(112, 2)

# Checking duplicates

In [7]:
# Rows that exactly repeat an earlier row; the first occurrence itself is not flagged.
df[df.duplicated()]

Unnamed: 0,Year,Price
3,2018,465000
4,2018,465000
9,2017,575000
15,2017,500000
16,2017,500000
18,2019,500000
19,2017,500000
20,2018,500000
21,2019,500000
23,2017,550000


# Replace the rows for each year by their average price (this collapses every row that shares a Year, not only exact duplicates, into one row per year)

In [8]:
# Collapse the data to one row per Year holding that year's mean Price.
# NOTE: this averages all rows of a year (not just exact duplicates) and rebinds `df`,
# so every later cell works on the aggregated frame rather than the raw rows.
df = df.groupby('Year', as_index=False).mean()

In [9]:
# Preview the first five rows of the per-year averaged frame.
df.head()

Unnamed: 0,Year,Price
0,2013,333750.0
1,2014,380166.666667
2,2015,390454.454545
3,2016,419333.333333
4,2017,523106.361702


In [10]:
# Display the whole aggregated frame (one row per year).
df

Unnamed: 0,Year,Price
0,2013,333750.0
1,2014,380166.666667
2,2015,390454.454545
3,2016,419333.333333
4,2017,523106.361702
5,2018,497272.727273
6,2019,577500.0
7,2020,621250.0


# Independent variable x is Year and dependent variable y is Price

In [11]:
# Feature matrix: every column except the target, kept as a DataFrame (here only Year).
x = df.drop(columns=['Price'])
x

Unnamed: 0,Year
0,2013
1,2014
2,2015
3,2016
4,2017
5,2018
6,2019
7,2020


In [12]:
# Target vector: the Price column as a Series.
y=df['Price']
y

0    333750.000000
1    380166.666667
2    390454.454545
3    419333.333333
4    523106.361702
5    497272.727273
6    577500.000000
7    621250.000000
Name: Price, dtype: float64

# Building model and predicting

In [13]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
# train_test_split is imported with the other libraries in the notebook's first cell.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [14]:
# Fit an ordinary least-squares line Price ~ Year on the training split and
# predict prices for the held-out rows.
# LinearRegression is imported with the other libraries in the notebook's first cell.
linear_regrn = LinearRegression()
model = linear_regrn.fit(x_train, y_train)
predict = model.predict(x_test)  # one predicted price per row of x_test, in x_test order

In [15]:
# Predicted prices for the rows of x_test (not a forecast for a future year).
print(predict)

[362474.56033526 535111.2572276 ]


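# The two predictions are for the held-out test years (2014 and 2018), not for 2022: about Rs. 362,474.56 and Rs. 535,111.26. Extrapolating the same fitted line to 2022 gives roughly Rs. 707,748.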

# Checking the predicted values using the mean squared error and R² score metrics

In [16]:
# Score the linear model on the held-out rows.
# mean_squared_error and r2_score are imported with the other libraries in the notebook's first cell.
print('Mean squared error', mean_squared_error(y_test, predict))
# NOTE: r2_score is the coefficient of determination, a score where higher is better,
# despite the 'R2 error' label printed below.
print('R2 error', r2_score(y_test, predict))

Mean squared error 872382487.7937216
R2 error 0.7455466418905055


# Using Lasso regression

In [17]:
from sklearn.linear_model import Lasso

In [18]:
# Features and target for the Lasso experiment, taken from the per-year averaged frame.
X = df.drop(columns=['Price'])
Y = df['Price']

In [19]:
# Lasso regression with penalty strength alpha=1.0.
# NOTE: this rebinds `model`, which previously held the fitted LinearRegression.
model = Lasso(alpha=1.0)

In [20]:
# Repeated K-fold cross-validation: 8 folds, repeated 3 times, seeded for reproducibility.
# NOTE(review): the aggregated frame has 8 rows, so 8 folds leaves a single row in each
# test fold — confirm this leave-one-out style evaluation is intended.
cv = RepeatedKFold(n_splits=8, n_repeats=3, random_state=1)

In [21]:
# Cross-validate the Lasso model; scikit-learn reports MAE negated so that larger is better.
scores = cross_val_score(
    model,
    X,
    Y,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,  # use all available CPU cores
)

In [22]:
# Flip the negated scores back to positive MAE values, then report mean and spread across folds.
scores = np.abs(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))

Mean MAE: 24279.414 (10467.567)


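The Lasso model's cross-validated mean absolute error (MAE) is about 24,279. This cannot be compared directly with the linear model's mean squared error above: the metrics differ (MAE vs. MSE; the linear model's RMSE is about 29,536) and so do the evaluation schemes (repeated cross-validation vs. a single two-row test set), so these results alone do not show that Lasso reduced the error.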