# Car price prediction

## Import the libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import the dataset

In [3]:
# Load the raw car listings; the path is relative to the current working directory.
data = pd.read_csv(filepath_or_buffer='car data.csv')

## Cleaning the dataset

In [4]:
# Remove the Car_Name column so it is not part of the modelling frame.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises KeyError.
data = data.drop(columns=['Car_Name'], errors='ignore')

## Encoding the categorical variables


In [5]:
# Earlier manual encoding, left disabled; the LabelEncoder cell further down is
# what actually encodes these columns.
# NOTE: the Transmission codes here (Manual = 0, Automatic = 1) are the reverse
# of the LabelEncoder output (Automatic = 0, Manual = 1), so the two approaches
# are not interchangeable for a model that has already been trained.
#data['Transmission'] = (data['Transmission']== 'Automatic').astype(int) ## 0 = Manual ; 1 = Automatic
#data['Selling_type'] = (data['Selling_type'] == 'Individual').astype(int) ## 0 = Dealer; 1 = Individual  

In [6]:
# Show how many rows fall into each category of the three categorical columns.
for column in ('Fuel_Type', 'Selling_type', 'Transmission'):
    print(data[column].value_counts())

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64
Selling_type
Dealer        195
Individual    106
Name: count, dtype: int64
Transmission
Manual       261
Automatic     40
Name: count, dtype: int64


In [7]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, kept under its own name so the learned
# label -> code mapping can be inspected afterwards.
le_fuel, le_trans, le_sell = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Each column is overwritten in place with its integer codes, so this cell is
# meant to run once per fresh load of `data`.
for column, encoder in (
    ('Fuel_Type', le_fuel),
    ('Transmission', le_trans),
    ('Selling_type', le_sell),
):
    data[column] = encoder.fit_transform(data[column])

In [8]:
# Display the frame after encoding to confirm that Fuel_Type, Selling_type and
# Transmission now hold integer codes.
data

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,2014,3.35,5.59,27000,2,0,1,0
1,2013,4.75,9.54,43000,1,0,1,0
2,2017,7.25,9.85,6900,2,0,1,0
3,2011,2.85,4.15,5200,2,0,1,0
4,2014,4.60,6.87,42450,1,0,1,0
...,...,...,...,...,...,...,...,...
296,2016,9.50,11.60,33988,1,0,1,0
297,2015,4.00,5.90,60000,2,0,1,0
298,2009,3.35,11.00,87934,2,0,1,0
299,2017,11.50,12.50,9000,1,0,1,0


In [9]:
# Recover the label -> integer code mapping learned by each fitted encoder by
# transforming the encoder's own class list.
fuel_map = {
    label: code
    for label, code in zip(le_fuel.classes_, le_fuel.transform(le_fuel.classes_))
}
print("Fuel: ", fuel_map)

trans_map = {
    label: code
    for label, code in zip(le_trans.classes_, le_trans.transform(le_trans.classes_))
}
print("Transmission: ", trans_map)

sell_map = {
    label: code
    for label, code in zip(le_sell.classes_, le_sell.transform(le_sell.classes_))
}
print("Selling type: ", sell_map)

Fuel:  {'CNG': 0, 'Diesel': 1, 'Petrol': 2}
Transmission:  {'Automatic': 0, 'Manual': 1}
Selling type:  {'Dealer': 0, 'Individual': 1}


## Separating features and target

In [10]:
# Target: the selling price. Features: every remaining column, in frame order.
y = data['Selling_Price'].values
X = data.drop('Selling_Price', axis='columns').values

## Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the model

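Linear-regression baseline, kept for reference; the model trained, evaluated and saved in this notebook is the XGBoost regressor in the next cell.

```python
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)
```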

In [12]:
from xgboost import XGBRegressor

# XGBoost regressor with library-default hyperparameters, fitted on the
# training split. Assigning to `regressor` replaces any earlier model bound to
# that name.
regressor = XGBRegressor()
regressor.fit(X=X_train, y=y_train)

## Predicting the results

In [13]:
# Predict on the held-out split, then print predictions (left column) beside
# the actual selling prices (right column).
y_pred = regressor.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[ 6.8380127   8.25      ]
 [ 0.43991357  0.5       ]
 [ 4.49984455  5.25      ]
 [ 8.8179369   9.5       ]
 [16.64114761 18.        ]
 [ 5.22012138  5.5       ]
 [ 2.90802574  3.75      ]
 [ 0.46455875  0.45      ]
 [ 3.85519791  4.5       ]
 [ 4.55173349  4.5       ]
 [ 2.75152612  2.7       ]
 [ 0.83044571  0.72      ]
 [ 4.6638484   5.25      ]
 [ 7.49327135  7.5       ]
 [ 7.75219011  7.75      ]
 [15.8965559  19.75      ]
 [ 6.92913723  7.4       ]
 [ 3.78073573  4.75      ]
 [ 0.73566955  0.35      ]
 [ 1.55842066  1.7       ]
 [ 3.30505061  4.4       ]
 [ 4.87911749  5.25      ]
 [ 4.99598789  5.11      ]
 [10.24263191 11.25      ]
 [ 0.15770955  0.2       ]
 [ 0.79466861  0.78      ]
 [ 0.21576412  0.38      ]
 [ 0.46160036  0.65      ]
 [ 0.4095996   0.5       ]
 [ 3.67923784  3.6       ]
 [ 2.15778613  3.25      ]
 [ 5.60725355  7.2       ]
 [ 0.45022669  0.6       ]
 [ 7.54221535  8.75      ]
 [ 3.43249369  3.        ]
 [ 1.21509147  1.15      ]
 [ 6.11109829  6.6       ]
 

In [14]:
data
# Reference for building a manual prediction row: integer codes used for the
# categorical columns, as printed from the fitted LabelEncoders earlier.
#Fuel:  {'CNG': 0, 'Diesel': 1, 'Petrol': 2}
#Transmission:  {'Automatic': 0, 'Manual': 1}
#Selling type:  {'Dealer': 0, 'Individual': 1}

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,2014,3.35,5.59,27000,2,0,1,0
1,2013,4.75,9.54,43000,1,0,1,0
2,2017,7.25,9.85,6900,2,0,1,0
3,2011,2.85,4.15,5200,2,0,1,0
4,2014,4.60,6.87,42450,1,0,1,0
...,...,...,...,...,...,...,...,...
296,2016,9.50,11.60,33988,1,0,1,0
297,2015,4.00,5.90,60000,2,0,1,0
298,2009,3.35,11.00,87934,2,0,1,0
299,2017,11.50,12.50,9000,1,0,1,0


## Single prediction

In [21]:
# One hand-built car. Values must follow the column order of X:
# Year, Present_Price, Driven_kms, Fuel_Type, Selling_type, Transmission, Owner
# (categorical entries use the LabelEncoder codes).
sample_car = [[2023, 9, 27000, 2, 0, 1, 0]]
print(regressor.predict(sample_car))

[7.6732416]


## Model evaluation

In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Error metrics for the predictions on the held-out test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE
r2 = r2_score(y_test, y_pred)


In [17]:
# Print each metric on its own line as "<label> <value>".
for label, value in (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared (R2) Score:", r2),
):
    print(label, value)


Mean Absolute Error: 0.6254185837018684
Mean Squared Error: 0.9763539218368368
Root Mean Squared Error: 0.9881062300364454
R-squared (R2) Score: 0.9613743079938505


## Saving the model

In [18]:
# Persist the trained regressor through its own save_model API, writing to
# xgboost_model.json in the current working directory.
regressor.save_model("xgboost_model.json")

In [19]:
import pickle

# Also store the fitted regressor as a pickle file.
filename = 'trained_model.sav'

# Open the file in a context manager so the handle is flushed and closed as
# soon as the dump finishes; a bare open() passed straight to pickle.dump is
# never closed explicitly.
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [20]:
# Read the pickled regressor back; the context manager closes the file handle
# once deserialization is done instead of leaving it open.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickle files that were produced by a trusted source.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)