# Data Preprocessing on Toyota Dataset from Kaggle
1. Binning
2. Smoothing
3. Encoding
4. Scaling
5. Outlier analysis

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [42]:
# Read the raw Toyota listings from disk and preview the first five rows.
toyota_csv_path = r'C:\SEM5\de\datasets\Toyota.csv'
df = pd.read_csv(toyota_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


In [43]:
# Summarise each column's dtype and non-null count to spot missing values
# and numeric columns that were read in as `object`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1436 non-null   int64  
 1   Price       1436 non-null   int64  
 2   Age         1336 non-null   float64
 3   KM          1436 non-null   object 
 4   FuelType    1336 non-null   object 
 5   HP          1436 non-null   object 
 6   MetColor    1286 non-null   float64
 7   Automatic   1436 non-null   int64  
 8   CC          1436 non-null   int64  
 9   Doors       1436 non-null   object 
 10  Weight      1436 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 123.5+ KB


In [44]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0      0
Price           0
Age           100
KM              0
FuelType      100
HP              0
MetColor      150
Automatic       0
CC              0
Doors           0
Weight          0
dtype: int64

In [45]:
# Tally rows flagged as exact duplicates of an earlier row (True) versus not
# (False); seeing only a False bucket means there are no duplicated rows.
df.duplicated().value_counts()

False    1436
Name: count, dtype: int64

In [46]:
# List the distinct raw values stored in Doors to check how door counts are encoded.
df["Doors"].unique()

array(['three', '3', '5', '4', 'four', 'five', '2'], dtype=object)

The `Doors` column has inconsistent values: the same door count appears both as a digit string and as a word (`'three'`, `'3'`, `'5'`, `'4'`, `'four'`, `'five'`, `'2'`). We need to make it consistent by mapping every value to its integer door count.

In [47]:

# Normalise Doors to integer door counts.
# Only the spelled-out counts need an explicit lookup; digit strings are parsed
# by pd.to_numeric. A bare Series.map with a dict silently turns every value
# that is not a key into NaN (including already-converted integers when the
# cell is re-run), whereas this version is safe to re-run and raises a
# ValueError on an unexpected spelling instead of hiding it.
door_words = {'three': 3, 'four': 4, 'five': 5}
df["Doors"] = pd.to_numeric(df["Doors"].map(lambda value: door_words.get(value, value)))


In [48]:
# Peek at five random rows. Sampling five rows directly avoids shuffling the
# whole frame, and the fixed random_state makes the preview reproducible on
# Restart & Run All.
df.sample(n=5, random_state=42)


Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
984,984,9450,63.0,45000,Petrol,110,1.0,0,1600,4,1035
106,106,18800,14.0,11500,Petrol,110,1.0,0,1600,3,1045
1286,1286,7950,79.0,72328,Petrol,110,1.0,0,1600,5,1075
1214,1214,8750,79.0,86000,,86,1.0,0,1300,5,1040
1019,1019,9750,64.0,34114,Petrol,110,0.0,0,1600,3,1050


In [49]:
# Re-inspect dtypes and non-null counts after the Doors mapping to confirm the
# column is now numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1436 non-null   int64  
 1   Price       1436 non-null   int64  
 2   Age         1336 non-null   float64
 3   KM          1436 non-null   object 
 4   FuelType    1336 non-null   object 
 5   HP          1436 non-null   object 
 6   MetColor    1286 non-null   float64
 7   Automatic   1436 non-null   int64  
 8   CC          1436 non-null   int64  
 9   Doors       1436 non-null   int64  
 10  Weight      1436 non-null   int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 123.5+ KB


The `Unnamed: 0` column just repeats the row index, so it is redundant. We can drop it.

In [50]:
# Drop the redundant copy of the row index.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [51]:
# Fill missing Age values with the mean of the observed ages.
age_mean = df["Age"].mean()
df["Age"] = df["Age"].fillna(age_mean)


In [52]:
# Peek at five random rows after the Age imputation. Sampling five rows
# directly avoids shuffling the whole frame, and the fixed random_state makes
# the preview reproducible on Restart & Run All.
df.sample(n=5, random_state=42)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
56,15250,28.0,43210,Petrol,97,0.0,0,1400,5,1110
16,22750,30.0,34000,Petrol,192,1.0,0,1800,3,1185
456,10950,55.672156,71725,Petrol,110,1.0,0,1600,4,1035
6,16900,27.0,??,Diesel,????,,0,2000,3,1245
561,10750,54.0,39000,Petrol,97,0.0,0,1400,3,1025


In [53]:
# Frequency of the most common FuelType (value_counts sorts in descending
# order, so the first entry is the largest count).
fuel_type_counts = df["FuelType"].value_counts()
fuel_type_counts.iloc[0]

np.int64(1177)

In [54]:
# Show every row whose KM holds the "??" placeholder instead of a reading.
km_is_placeholder = df["KM"].eq("??")
df.loc[km_is_placeholder]

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
6,16900,27.0,??,Diesel,????,,0,2000,3,1245
64,17950,27.0,??,Petrol,97,1.0,0,1400,5,1110
91,22250,20.0,??,Diesel,90,1.0,0,2000,3,1260
112,24950,8.0,??,Diesel,116,1.0,0,2000,5,1320
125,21750,16.0,??,Petrol,110,1.0,0,1600,5,1130
131,15950,20.0,??,Petrol,97,1.0,0,1400,5,1110
398,10500,55.0,??,Petrol,110,1.0,0,1600,4,1035
491,10750,54.0,??,Petrol,110,1.0,0,1600,5,1075
1049,6750,77.0,??,Diesel,72,1.0,0,2000,4,1100
1120,7350,72.0,??,Petrol,86,1.0,0,1300,3,1015


In [59]:
# Turn the "??" placeholder into a proper missing value and convert KM to a
# numeric dtype. Replacing the placeholder alone leaves the column as `object`
# holding digit strings, which cannot be averaged, binned or scaled.
# pd.to_numeric raises a ValueError if any other non-numeric token is still
# present, so an unknown placeholder is surfaced here rather than later.
df["KM"] = pd.to_numeric(df["KM"].replace("??", np.nan))

In [58]:
df[df["KM"] ==

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.000000,46986,Diesel,90,1.0,0,2000,3,1165
1,13750,23.000000,72937,Diesel,90,1.0,0,2000,3,1165
2,13950,24.000000,41711,Diesel,90,,0,2000,3,1165
3,14950,26.000000,48000,Diesel,90,0.0,0,2000,3,1165
4,13750,30.000000,38500,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...
1431,7500,55.672156,20544,Petrol,86,1.0,0,1300,3,1025
1432,10845,72.000000,,Petrol,86,0.0,0,1300,3,1015
1433,8500,55.672156,17016,Petrol,86,0.0,0,1300,3,1015
1434,7250,70.000000,,,86,1.0,0,1300,3,1015
