# import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# read the data

In [2]:
df = pd.read_csv(r"car data.xls")

# first 5 records of data

In [3]:
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


# drop unnecessary columns

In [4]:
df.drop(["Present_Price" ,'Car_Name'] , axis = 1 , inplace = True)

# check null values

In [5]:
df.isnull().sum()

Year             0
Selling_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

# check duplicate values

In [6]:
df.duplicated().sum()

2

# check the shape of data

In [7]:
df.shape

(301, 7)

# drop the duplicate values

In [8]:
df.drop_duplicates(inplace = True)

# check duplicate values

In [9]:
df.duplicated().sum()

0

# check the shape of data

In [10]:
df.shape

(299, 7)

# check the information of the data

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299 entries, 0 to 300
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Year           299 non-null    int64  
 1   Selling_Price  299 non-null    float64
 2   Kms_Driven     299 non-null    int64  
 3   Fuel_Type      299 non-null    object 
 4   Seller_Type    299 non-null    object 
 5   Transmission   299 non-null    object 
 6   Owner          299 non-null    int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 18.7+ KB


In [12]:
df.head()

Unnamed: 0,Year,Selling_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,27000,Petrol,Dealer,Manual,0
1,2013,4.75,43000,Diesel,Dealer,Manual,0
2,2017,7.25,6900,Petrol,Dealer,Manual,0
3,2011,2.85,5200,Petrol,Dealer,Manual,0
4,2014,4.6,42450,Diesel,Dealer,Manual,0


# statistical calculation

In [13]:
df.describe()

Unnamed: 0,Year,Selling_Price,Kms_Driven,Owner
count,299.0,299.0,299.0,299.0
mean,2013.615385,4.589632,36916.752508,0.043478
std,2.896868,4.98424,39015.170352,0.24872
min,2003.0,0.1,500.0,0.0
25%,2012.0,0.85,15000.0,0.0
50%,2014.0,3.51,32000.0,0.0
75%,2016.0,6.0,48883.5,0.0
max,2018.0,35.0,500000.0,3.0


# find the age of the car

In [14]:
import datetime

# retrieve the current date and time

In [15]:
date_time = datetime.datetime.now()

# find the age of the car

In [16]:
df["age"] = date_time.year - df["Year"]

In [17]:
df.head()

Unnamed: 0,Year,Selling_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,age
0,2014,3.35,27000,Petrol,Dealer,Manual,0,10
1,2013,4.75,43000,Diesel,Dealer,Manual,0,11
2,2017,7.25,6900,Petrol,Dealer,Manual,0,7
3,2011,2.85,5200,Petrol,Dealer,Manual,0,13
4,2014,4.6,42450,Diesel,Dealer,Manual,0,10


# drop the year column

In [18]:
df.drop("Year" , axis = 1 , inplace = True)

In [19]:
df.head()

Unnamed: 0,Selling_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,age
0,3.35,27000,Petrol,Dealer,Manual,0,10
1,4.75,43000,Diesel,Dealer,Manual,0,11
2,7.25,6900,Petrol,Dealer,Manual,0,7
3,2.85,5200,Petrol,Dealer,Manual,0,13
4,4.6,42450,Diesel,Dealer,Manual,0,10


# find the unique() values of fuel-type

In [20]:
df["Fuel_Type"].unique()

array(['Petrol', 'Diesel', 'CNG'], dtype=object)

# assign the number of each category present in the fuel-type

In [21]:
df["Fuel_Type"] = df["Fuel_Type"].map({"Petrol" : 0 , "Diesel" : 1 , "CNG" : 2})

In [22]:
df["Fuel_Type"].unique()

array([0, 1, 2], dtype=int64)

# unique values of the seller type 

In [23]:
df["Seller_Type"].unique()

array(['Dealer', 'Individual'], dtype=object)

# assign the number for each category

In [24]:
df["Seller_Type"] = df["Seller_Type"].map({"Dealer" : 0 , "Individual" : 1})

In [25]:
df["Seller_Type"].unique()

array([0, 1], dtype=int64)

# unique values of transmission column

In [26]:
df["Transmission"].unique()

array(['Manual', 'Automatic'], dtype=object)

# assign the number to each category

In [27]:
df["Transmission"] = df["Transmission"].map({"Manual" : 0 , "Automatic" : 1})

In [28]:
df["Transmission"].unique()

array([0, 1], dtype=int64)

In [29]:
df.head()

Unnamed: 0,Selling_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,age
0,3.35,27000,0,0,0,0,10
1,4.75,43000,1,0,0,0,11
2,7.25,6900,0,0,0,0,7
3,2.85,5200,0,0,0,0,13
4,4.6,42450,1,0,0,0,10


# decide the dependent and independent

In [30]:
x = df.drop(["Selling_Price"] , axis = 1)
y = df["Selling_Price"]

In [31]:
x

Unnamed: 0,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,age
0,27000,0,0,0,0,10
1,43000,1,0,0,0,11
2,6900,0,0,0,0,7
3,5200,0,0,0,0,13
4,42450,1,0,0,0,10
...,...,...,...,...,...,...
296,33988,1,0,0,0,8
297,60000,0,0,0,0,9
298,87934,0,0,0,0,15
299,9000,1,0,0,0,7


In [32]:
y

0       3.35
1       4.75
2       7.25
3       2.85
4       4.60
       ...  
296     9.50
297     4.00
298     3.35
299    11.50
300     5.30
Name: Selling_Price, Length: 299, dtype: float64

# split the data into train and test

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size= 0.20 , random_state= 42)

# model selection

In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressora
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# create the object for each model

In [36]:
LR = LinearRegression()
RF = RandomForestRegressor()
GB = GradientBoostingRegressor(n_estimators = 100)
XG = XGBRegressor(learning_rate = 1 , n_estimator = 100)

# fit the model

In [37]:
LR.fit(x_train , y_train)
RF.fit(x_train , y_train)
GB.fit(x_train , y_train)
XG.fit(x_train , y_train)

Parameters: { "n_estimator" } are not used.



# predict on test data

In [38]:
y_pred_lr = LR.predict(x_test)
y_pred_rf = RF.predict(x_test)
y_pred_gb = GB.predict(x_test)
y_pred_xg = XG.predict(x_test)

# check the accuracy 

In [39]:
from sklearn.metrics import r2_score

In [40]:
print("linear regression" , r2_score(y_test , y_pred_lr)) 
print("---------------------------------------------------")

print("random forest" , r2_score(y_test , y_pred_rf)) 
print("---------------------------------------------------")

print("gradient boost" , r2_score(y_test , y_pred_gb)) 
print("---------------------------------------------------")

print("xgboost" , r2_score(y_test , y_pred_xg)) 
print("---------------------------------------------------")

linear regression 0.4809158492012464
---------------------------------------------------
random forest 0.6424282406078143
---------------------------------------------------
gradient boost 0.7117996156103449
---------------------------------------------------
xgboost 0.6242947092646042
---------------------------------------------------


In [41]:
import pickle

In [42]:
with open("car.pkl" , "wb")as file :
    pickle.dump(GB,file)