# Data Ingestion

In [1]:
import pandas as pd
data = pd.read_csv("cars.csv")
data.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


# Feature Engineering

In [2]:
data.shape

(8128, 5)

### Check for duplicate records

In [3]:
## check for duplicated records
data.duplicated().sum()

1678

In [4]:
data = data.drop_duplicates()
data.duplicated().sum()

0

### Finding numerical and categorical columns

In [5]:
numerical_columns=data.columns[data.dtypes!='object']
categorical_columns=data.columns[data.dtypes=='object']
print("Numerical columns:",numerical_columns)
print('Categorical Columns:',categorical_columns)

Numerical columns: Index(['km_driven', 'selling_price'], dtype='object')
Categorical Columns: Index(['brand', 'fuel', 'owner'], dtype='object')


### Converting categorical columns into numerical ones

In [6]:
data['brand'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Fiat', 'Datsun', 'Jeep',
       'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW',
       'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
       'Kia', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel', 'Peugeot'],
      dtype=object)

In [7]:
data['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [8]:
data['owner'].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [9]:
import numpy as np
#Using One hot encoding
d1 = pd.get_dummies(data['fuel'])
d2 = pd.get_dummies(data['brand'])

df = pd.concat([data,d1,d2],axis='columns')

In [10]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price,CNG,Diesel,LPG,Petrol,Ambassador,...,Mitsubishi,Nissan,Opel,Peugeot,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo
0,Maruti,145500,Diesel,First Owner,450000,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Skoda,120000,Diesel,Second Owner,370000,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Honda,140000,Petrol,Third Owner,158000,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Hyundai,127000,Diesel,First Owner,225000,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Maruti,120000,Petrol,First Owner,130000,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#Using Label Encoder
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
df['owner']= enc.fit_transform(df['owner'])

df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price,CNG,Diesel,LPG,Petrol,Ambassador,...,Mitsubishi,Nissan,Opel,Peugeot,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo
0,Maruti,145500,Diesel,0,450000,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Skoda,120000,Diesel,2,370000,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Honda,140000,Petrol,4,158000,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Hyundai,127000,Diesel,0,225000,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Maruti,120000,Petrol,0,130000,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Droping the categorical columns
df=df.drop(['brand', 'fuel'], axis=1)
df.head()

Unnamed: 0,km_driven,owner,selling_price,CNG,Diesel,LPG,Petrol,Ambassador,Ashok,Audi,...,Mitsubishi,Nissan,Opel,Peugeot,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo
0,145500,0,450000,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,120000,2,370000,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,140000,4,158000,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,127000,0,225000,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,120000,0,130000,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df.isnull().sum()

km_driven        0
owner            0
selling_price    0
CNG              0
Diesel           0
LPG              0
Petrol           0
Ambassador       0
Ashok            0
Audi             0
BMW              0
Chevrolet        0
Daewoo           0
Datsun           0
Fiat             0
Force            0
Ford             0
Honda            0
Hyundai          0
Isuzu            0
Jaguar           0
Jeep             0
Kia              0
Land             0
Lexus            0
MG               0
Mahindra         0
Maruti           0
Mercedes-Benz    0
Mitsubishi       0
Nissan           0
Opel             0
Peugeot          0
Renault          0
Skoda            0
Tata             0
Toyota           0
Volkswagen       0
Volvo            0
dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6450 entries, 0 to 8125
Data columns (total 39 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   km_driven      6450 non-null   int64
 1   owner          6450 non-null   int32
 2   selling_price  6450 non-null   int64
 3   CNG            6450 non-null   uint8
 4   Diesel         6450 non-null   uint8
 5   LPG            6450 non-null   uint8
 6   Petrol         6450 non-null   uint8
 7   Ambassador     6450 non-null   uint8
 8   Ashok          6450 non-null   uint8
 9   Audi           6450 non-null   uint8
 10  BMW            6450 non-null   uint8
 11  Chevrolet      6450 non-null   uint8
 12  Daewoo         6450 non-null   uint8
 13  Datsun         6450 non-null   uint8
 14  Fiat           6450 non-null   uint8
 15  Force          6450 non-null   uint8
 16  Ford           6450 non-null   uint8
 17  Honda          6450 non-null   uint8
 18  Hyundai        6450 non-null   uint8
 19  Isuzu 

In [93]:
# TO DELETE ROWS HAVING NULL VALUES

# data.dropna(how='all', axis=1, inplace=True)

#data = data.dropna(axis=0, subset=['column_name'])


In [94]:
#FILLING NULL WITH VALUES

#data['column_name'] = data['column_name'].fillna(data['column_name'].mode()[0])

# for numerical values :

#sns.boxplot(data.column_name)
#data.fillna(data.mean())
#data.fillna(data.median())