In [61]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras as ks
from sklearn.model_selection import train_test_split
import numpy as np
import warnings
# Suppress UserWarning-category warnings whose originating module matches 'pandas',
# so they do not clutter the cell outputs.
warnings.filterwarnings('ignore', module='pandas', category=UserWarning)

In [62]:
# Read the raw listings from disk; the bare `df` on the last line makes the
# notebook render the frame as this cell's output.
CSV_PATH = "car_price.csv"
df = pd.read_csv(CSV_PATH)
df

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90 Lakh,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,BMW M Series M4 Coupe,64.90 Lakh,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,Jaguar XF 2.2 Litre Luxury,13.75 Lakh,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,BMW 7 Series 730Ld,29.90 Lakh,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


Checking null values

In [63]:
# Display every row that has a missing value in at least one column.
rows_with_missing = df.isna().any(axis=1)
df.loc[rows_with_missing]

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats


Duplicates

In [64]:
# Remove rows that are identical to an earlier row across all columns,
# keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")

In [65]:
# Survey the textual content of the columns before parsing them.
# For columns flagged True, the first run of ASCII letters in each value is
# extracted (e.g. a unit suffix) and its distinct values are printed; for the
# others the distinct raw values are printed directly.
FIRST_LETTERS = r"([a-zA-Z]+)"

columns_to_inspect = (
    ("car_prices_in_rupee", True),
    ("kms_driven", True),
    ("fuel_type", False),
    ("engine", True),
    ("transmission", False),
    ("ownership", False),
    ("Seats", True),
)

for column_name, extract_letters in columns_to_inspect:
    column_values = df[column_name]
    if extract_letters:
        column_values = column_values.str.extract(FIRST_LETTERS)[0]
    print(column_values.unique())

['Lakh' 'Crore' nan]
['kms']
['Diesel' 'Petrol' 'Cng' 'Electric' 'Lpg']
['cc']
['Manual' 'Automatic']
['1st Owner' '2nd Owner' '3rd Owner' '4th Owner' '5th Owner' '0th Owner']
['Seats']


In [66]:
# Count how many rows carry each ownership label.
df["ownership"].value_counts()

ownership
1st Owner    3678
2nd Owner    1298
3rd Owner     358
4th Owner      84
5th Owner      12
0th Owner       6
Name: count, dtype: int64

In [67]:
# Remove the rows labelled "0th Owner".
zero_owner_index = df.index[df["ownership"] == "0th Owner"]
df = df.drop(index=zero_owner_index)

Convert prices from rupees to euros

In [68]:
def rupee_to_euro(price, rate=0.01123):  # default: 1 rupee = 0.01123 euro
    """Convert a rupee price string to a euro amount.

    Three textual forms are handled:

    * ``"<number> Lakh"``  -> number * 100,000 rupees
    * ``"<number> Crore"`` -> number * 10,000,000 rupees
    * a bare amount such as ``"95,000"`` -> that many rupees, where commas
      are treated as thousands separators

    Parameters
    ----------
    price : str
        Price text in one of the forms above.
    rate : float, optional
        Euros per rupee. Defaults to 0.01123.

    Returns
    -------
    float
        The price in euros.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed by ``float``.
    """
    if "Lakh" in price:
        # A comma inside a Lakh/Crore figure is read as a decimal mark, as the
        # original code did. NOTE(review): confirm the data never uses it as
        # a thousands separator here.
        number = price.replace(" Lakh", "").replace(",", ".")
        return float(number) * 100000 * rate
    if "Crore" in price:
        number = price.replace(" Crore", "").replace(",", ".")
        return float(number) * 10000000 * rate
    # Bare rupee amount: strip thousands separators and apply the exchange
    # rate. The previous code turned "95,000" into 95.0 and returned it
    # without converting to euros.
    return float(price.replace(",", "")) * rate


# Replace each price string with the float returned by rupee_to_euro.
price_text = df["car_prices_in_rupee"]
df["car_prices_in_rupee"] = price_text.map(rupee_to_euro)

In [69]:
# Give the price column its new name.
price_column_rename = {"car_prices_in_rupee": "price_euro"}
df = df.rename(columns=price_column_rename)

Convert kms_driven from string (object dtype) to float

In [70]:
def convert_kms(kms):
    """Convert an odometer string such as ``"86,226 kms"`` to thousands of km.

    The `` kms`` suffix and every comma are removed, and the remaining number
    is divided by 1000. This keeps the scale the previous implementation
    produced for five-digit readings ("86,226 kms" -> 86.226) and applies the
    same unit to every input: the old first-comma-to-dot trick returned 1.2
    for "1,20,000 kms" and 500.0 for "500 kms".

    Parameters
    ----------
    kms : str
        Distance text, optionally with comma grouping and a `` kms`` suffix.

    Returns
    -------
    float
        The distance in thousands of kilometres.

    Raises
    ------
    ValueError
        If the remaining text cannot be parsed by ``float``.
    """
    digits = kms.replace(" kms", "").replace(",", "")
    return float(digits) / 1000

# Replace each kms_driven string with the float returned by convert_kms.
kms_text = df["kms_driven"]
df["kms_driven"] = kms_text.map(convert_kms)

In [71]:
# Strip the ordinal suffix and the word "Owner" so that only the leading
# number remains (e.g. "1st Owner" -> 1), then cast to int.
owner_number = df["ownership"].str.replace(r"(st|nd|rd|th) Owner", "", regex=True)
df["ownership"] = owner_number.astype(int)

# Remove the trailing unit text from the remaining columns and cast to int.
for unit_column, unit_suffix in (("engine", " cc"), ("Seats", " Seats")):
    df[unit_column] = df[unit_column].str.replace(unit_suffix, "").astype(int)

Encoding

In [72]:
# Binary-encode the gearbox: 1 where the value is "Automatic", 0 otherwise,
# then rename the column to reflect its new meaning and display the frame.
automatic_flag = (df["transmission"] == "Automatic").astype("int64")
df["transmission"] = automatic_flag
df = df.rename(columns={"transmission": "is_automatic"})
df

Unnamed: 0,car_name,price_euro,kms_driven,fuel_type,is_automatic,ownership,manufacture,engine,Seats
0,Jeep Compass 2.0 Longitude Option BSIV,11263.69,86.226,Diesel,0,1,2017,1956,5
1,Renault Duster RXZ Turbo CVT,14408.09,13.248,Petrol,1,1,2021,1330,5
2,Toyota Camry 2.5 G,18417.20,60.343,Petrol,1,1,2016,2494,5
3,Honda Jazz VX CVT,8725.71,26.696,Petrol,1,1,2018,1199,5
4,Volkswagen Polo 1.2 MPI Highline,5783.45,69.414,Petrol,0,1,2016,1199,5
...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,32454.70,45.000,Diesel,1,1,2018,2995,7
5508,BMW M Series M4 Coupe,72882.70,29.000,Petrol,1,2,2015,1968,5
5509,Jaguar XF 2.2 Litre Luxury,15441.25,90.000,Diesel,1,2,2013,2755,5
5510,BMW 7 Series 730Ld,33577.70,79.000,Diesel,1,3,2015,2967,6
