## Importing required libraries

In [1]:
import datetime

import numpy as np
import pandas as pd

# import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

## Reading CSV file 

In [2]:
a=pd.read_csv("car_price.csv")
a.head()

Unnamed: 0.1,Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
0,0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats


## Performing EDA (Exploratory Data Analysis)

In [3]:
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5512 entries, 0 to 5511
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           5512 non-null   int64 
 1   car_name             5512 non-null   object
 2   car_prices_in_rupee  5512 non-null   object
 3   kms_driven           5512 non-null   object
 4   fuel_type            5512 non-null   object
 5   transmission         5512 non-null   object
 6   ownership            5512 non-null   object
 7   manufacture          5512 non-null   int64 
 8   engine               5512 non-null   object
 9   Seats                5512 non-null   object
dtypes: int64(2), object(8)
memory usage: 430.8+ KB


In [4]:
a.describe()

Unnamed: 0.1,Unnamed: 0,manufacture
count,5512.0,5512.0
mean,2755.5,2015.455552
std,1591.321673,3.927974
min,0.0,1995.0
25%,1377.75,2013.0
50%,2755.5,2016.0
75%,4133.25,2018.0
max,5511.0,2022.0


In [5]:
a[a.isna()].count()

Unnamed: 0             0
car_name               0
car_prices_in_rupee    0
kms_driven             0
fuel_type              0
transmission           0
ownership              0
manufacture            0
engine                 0
Seats                  0
dtype: int64

## Cleaning Dataset: 
since Unnamed: 0 column is not required we will remove it

In [6]:
a=a.drop(columns='Unnamed: 0',axis=1)

In [7]:
a

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90 Lakh,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,BMW M Series M4 Coupe,64.90 Lakh,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,Jaguar XF 2.2 Litre Luxury,13.75 Lakh,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,BMW 7 Series 730Ld,29.90 Lakh,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


## Creating new coulmn named Currency

In [8]:
a.loc[a['car_prices_in_rupee'].str.contains('Lakh'), 'Currency'] = 'Lakh'
a.loc[a['car_prices_in_rupee'].str.contains('Crore'), 'Currency'] = 'Crore'
a

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,Currency
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats,Lakh
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats,Lakh
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats,Lakh
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats,Lakh
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats,Lakh
...,...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90 Lakh,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats,Lakh
5508,BMW M Series M4 Coupe,64.90 Lakh,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats,Lakh
5509,Jaguar XF 2.2 Litre Luxury,13.75 Lakh,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats,Lakh
5510,BMW 7 Series 730Ld,29.90 Lakh,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats,Lakh


## Dropping all NULL values using dropna() method
Since our model cannot compute null values


In [9]:
a=a.dropna()
a

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,Currency
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats,Lakh
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats,Lakh
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats,Lakh
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats,Lakh
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats,Lakh
...,...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90 Lakh,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats,Lakh
5508,BMW M Series M4 Coupe,64.90 Lakh,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats,Lakh
5509,Jaguar XF 2.2 Litre Luxury,13.75 Lakh,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats,Lakh
5510,BMW 7 Series 730Ld,29.90 Lakh,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats,Lakh


## Cleaning Data:
Removing Strings attached with numbers and removing commas

In [10]:
a['kms_driven']=a['kms_driven'].str.replace(' kms','')
a['car_prices_in_rupee']=a['car_prices_in_rupee'].str.replace(' Lakh','')
a['kms_driven']=a['kms_driven'].str.replace(',','')
a['Seats']=a['Seats'].str.replace(' Seats','')
a['engine']=a['engine'].str.replace(' cc','')
a['car_prices_in_rupee']=a['car_prices_in_rupee'].str.replace(' Crore','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['kms_driven']=a['kms_driven'].str.replace(' kms','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['car_prices_in_rupee']=a['car_prices_in_rupee'].str.replace(' Lakh','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['kms_driven']=a['kms_driven'].str.replace(',','')
A value is trying to be 

In [11]:
a

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,Currency
0,Jeep Compass 2.0 Longitude Option BSIV,10.03,86226,Diesel,Manual,1st Owner,2017,1956,5,Lakh
1,Renault Duster RXZ Turbo CVT,12.83,13248,Petrol,Automatic,1st Owner,2021,1330,5,Lakh
2,Toyota Camry 2.5 G,16.40,60343,Petrol,Automatic,1st Owner,2016,2494,5,Lakh
3,Honda Jazz VX CVT,7.77,26696,Petrol,Automatic,1st Owner,2018,1199,5,Lakh
4,Volkswagen Polo 1.2 MPI Highline,5.15,69414,Petrol,Manual,1st Owner,2016,1199,5,Lakh
...,...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90,45000,Diesel,Automatic,1st Owner,2018,2995,7,Lakh
5508,BMW M Series M4 Coupe,64.90,29000,Petrol,Automatic,2nd Owner,2015,1968,5,Lakh
5509,Jaguar XF 2.2 Litre Luxury,13.75,90000,Diesel,Automatic,2nd Owner,2013,2755,5,Lakh
5510,BMW 7 Series 730Ld,29.90,79000,Diesel,Automatic,3rd Owner,2015,2967,6,Lakh


## Converting Strings to int and float type for computing

In [12]:
a['car_prices_in_rupee']=a['car_prices_in_rupee'].astype('float')
a['kms_driven']=a['kms_driven'].astype('int')
a['Seats']=a['Seats'].astype('int')
a['engine']=a['engine'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['car_prices_in_rupee']=a['car_prices_in_rupee'].astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['kms_driven']=a['kms_driven'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Seats']=a['Seats'].astype('int')
A value is trying to be set on a copy of a slice from a D

In [13]:
a

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,Currency
0,Jeep Compass 2.0 Longitude Option BSIV,10.03,86226,Diesel,Manual,1st Owner,2017,1956,5,Lakh
1,Renault Duster RXZ Turbo CVT,12.83,13248,Petrol,Automatic,1st Owner,2021,1330,5,Lakh
2,Toyota Camry 2.5 G,16.40,60343,Petrol,Automatic,1st Owner,2016,2494,5,Lakh
3,Honda Jazz VX CVT,7.77,26696,Petrol,Automatic,1st Owner,2018,1199,5,Lakh
4,Volkswagen Polo 1.2 MPI Highline,5.15,69414,Petrol,Manual,1st Owner,2016,1199,5,Lakh
...,...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90,45000,Diesel,Automatic,1st Owner,2018,2995,7,Lakh
5508,BMW M Series M4 Coupe,64.90,29000,Petrol,Automatic,2nd Owner,2015,1968,5,Lakh
5509,Jaguar XF 2.2 Litre Luxury,13.75,90000,Diesel,Automatic,2nd Owner,2013,2755,5,Lakh
5510,BMW 7 Series 730Ld,29.90,79000,Diesel,Automatic,3rd Owner,2015,2967,6,Lakh


In [14]:
a['car_prices_in_rupee']=np.where(a['Currency'] == 'Crore',
                                           a['car_prices_in_rupee'] * 100,
                                           a['car_prices_in_rupee'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['car_prices_in_rupee']=np.where(a['Currency'] == 'Crore',


In [15]:
a

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,Currency
0,Jeep Compass 2.0 Longitude Option BSIV,10.03,86226,Diesel,Manual,1st Owner,2017,1956,5,Lakh
1,Renault Duster RXZ Turbo CVT,12.83,13248,Petrol,Automatic,1st Owner,2021,1330,5,Lakh
2,Toyota Camry 2.5 G,16.40,60343,Petrol,Automatic,1st Owner,2016,2494,5,Lakh
3,Honda Jazz VX CVT,7.77,26696,Petrol,Automatic,1st Owner,2018,1199,5,Lakh
4,Volkswagen Polo 1.2 MPI Highline,5.15,69414,Petrol,Manual,1st Owner,2016,1199,5,Lakh
...,...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90,45000,Diesel,Automatic,1st Owner,2018,2995,7,Lakh
5508,BMW M Series M4 Coupe,64.90,29000,Petrol,Automatic,2nd Owner,2015,1968,5,Lakh
5509,Jaguar XF 2.2 Litre Luxury,13.75,90000,Diesel,Automatic,2nd Owner,2013,2755,5,Lakh
5510,BMW 7 Series 730Ld,29.90,79000,Diesel,Automatic,3rd Owner,2015,2967,6,Lakh


In [16]:
a['car_prices_in_rupee']=a['car_prices_in_rupee']*100000
a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['car_prices_in_rupee']=a['car_prices_in_rupee']*100000


Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,Currency
0,Jeep Compass 2.0 Longitude Option BSIV,1003000.0,86226,Diesel,Manual,1st Owner,2017,1956,5,Lakh
1,Renault Duster RXZ Turbo CVT,1283000.0,13248,Petrol,Automatic,1st Owner,2021,1330,5,Lakh
2,Toyota Camry 2.5 G,1640000.0,60343,Petrol,Automatic,1st Owner,2016,2494,5,Lakh
3,Honda Jazz VX CVT,777000.0,26696,Petrol,Automatic,1st Owner,2018,1199,5,Lakh
4,Volkswagen Polo 1.2 MPI Highline,515000.0,69414,Petrol,Manual,1st Owner,2016,1199,5,Lakh
...,...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,2890000.0,45000,Diesel,Automatic,1st Owner,2018,2995,7,Lakh
5508,BMW M Series M4 Coupe,6490000.0,29000,Petrol,Automatic,2nd Owner,2015,1968,5,Lakh
5509,Jaguar XF 2.2 Litre Luxury,1375000.0,90000,Diesel,Automatic,2nd Owner,2013,2755,5,Lakh
5510,BMW 7 Series 730Ld,2990000.0,79000,Diesel,Automatic,3rd Owner,2015,2967,6,Lakh


In [17]:
train=a.copy()

In [18]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

In [19]:
X= train.drop(columns=['car_prices_in_rupee','car_name','ownership', 'fuel_type'] ,axis=1)
Y= train['car_prices_in_rupee']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.15, random_state = 100)



In [21]:
# make_train = X_train["car_name"].str.split(" ", expand = True)
# make_test = X_test["car_name"].str.split(" ", expand = True)
# make_train

In [22]:
# X_train["Manufacturer"] = make_train[0]
# X_test["Manufacturer"] = make_test[0]
# X_train

In [23]:
X_train = pd.get_dummies(X_train,
                         columns = ['Currency' ,"transmission"],
                         drop_first = True)

X_test = pd.get_dummies(X_test,
                         columns = ['Currency', "transmission"],
                         drop_first = True)
# X_train.drop("car_name",axis=1,inplace=True)
# X_test.drop("car_name",axis=1,inplace=True)

In [24]:
model=LinearRegression()
model.fit(X_train,y_train)
ypred=model.predict(X_test)

In [25]:
X_test

Unnamed: 0,kms_driven,manufacture,engine,Seats,Currency_Lakh,transmission_Manual
1025,32272,2020,1086,5,1,0
1786,36000,2018,1498,5,1,1
3986,170290,2016,1248,5,1,0
1608,40000,2019,1968,5,1,1
296,105053,2013,1248,5,1,1
...,...,...,...,...,...,...
1289,30000,2020,1451,5,1,0
3799,55077,2009,1497,5,1,1
537,50152,2011,1461,5,1,1
3861,67000,2013,999,5,1,1


In [26]:
ypred

array([ 2.61909079e+06,  8.52166158e+05,  2.08140008e+06,  1.02367814e+06,
        2.39258061e+05,  1.18325918e+06,  4.32081674e+05,  1.19851883e+06,
        6.03635292e+05,  6.01560037e+05,  7.02135944e+05,  8.27272757e+05,
        2.54247334e+06,  1.09646946e+06,  2.75017091e+06,  2.07535147e+06,
        9.98446779e+05, -7.27144696e+04,  6.72975430e+05,  6.54742640e+05,
        8.37788281e+05,  1.03166187e+06,  6.24358208e+05,  1.34825109e+05,
        1.06949677e+06,  2.54603815e+05,  7.93217190e+05,  5.54691272e+05,
        2.87559158e+04,  3.82989126e+05,  3.53866074e+05,  2.85964783e+06,
        9.93743756e+05,  8.91264065e+05,  2.38392964e+06,  5.29765218e+05,
        2.49390043e+06,  2.78042466e+06,  1.11923566e+06,  8.99452952e+05,
       -2.16665813e+05, -2.85386875e+05,  2.88872303e+06,  8.29212568e+05,
        9.80581030e+05,  2.78695698e+06,  2.44790718e+06,  8.68877207e+05,
        4.48880006e+05,  4.01876323e+05,  7.18446206e+05,  2.18383471e+06,
        3.18176259e+04, -

In [27]:
from sklearn.metrics import r2_score

In [28]:
r2_score(y_test,ypred)

0.7332676982532722

In [29]:
c=int(input("Enter the Kilometer_driven: "))

Enter the Kilometer_driven: 200000


In [30]:
d=int(input("Enter the year of manufacturing: "))

Enter the year of manufacturing: 2003


In [31]:
e=int(input("Enter the Power of engine: "))

Enter the Power of engine: 800


In [32]:
f=int(input("Enter the Number of seat you want: "))

Enter the Number of seat you want: 4


In [33]:
g=int(input("Enter the Currency in 0 or 1 1 for lakh and 0 for crore: "))

Enter the Currency in 0 or 1 1 for lakh and 0 for crore: 0


In [34]:
h=int(input("Enter the type of vehicle 0 for automatic and 1 for manual: "))

Enter the type of vehicle 0 for automatic and 1 for manual: 1


In [35]:
model.predict([[c,d,e,f,g,h]])



array([11342551.25276157])