# Data Preprocessing on Toyota Dataset
1. Binning
2. Smoothening
3. Encoding
4. Scaling
5. Outlier analysis

In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [114]:
df = pd.read_csv('Toyota.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1436 non-null   int64  
 1   Price       1436 non-null   int64  
 2   Age         1336 non-null   float64
 3   KM          1436 non-null   object 
 4   FuelType    1336 non-null   object 
 5   HP          1436 non-null   object 
 6   MetColor    1286 non-null   float64
 7   Automatic   1436 non-null   int64  
 8   CC          1436 non-null   int64  
 9   Doors       1436 non-null   object 
 10  Weight      1436 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 123.5+ KB


In [116]:
df.isnull().sum()

Unnamed: 0      0
Price           0
Age           100
KM              0
FuelType      100
HP              0
MetColor      150
Automatic       0
CC              0
Doors           0
Weight          0
dtype: int64

In [117]:
df.duplicated().value_counts()

False    1436
Name: count, dtype: int64

In [118]:
df["Doors"].unique()

array(['three', '3', '5', '4', 'four', 'five', '2'], dtype=object)

Doors column has inconsistent data. We need to make it consistent by mapping it to correct values : array(['three', '3', '5', '4', 'four', 'five', '2'], dtype=object)

In [119]:

df["Doors"] = df["Doors"].map({'three' : 3, 'four' : 4, 'five' : 5, '3' : 3, '2' : 2, '1' : 1, '0' : 0, '5' : 5, '4' : 4})


In [120]:
df.sample(frac=1).head()


Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
1135,1135,5950,72.0,105856,Petrol,86,0.0,0,1300,3,1015
1030,1030,9450,68.0,30300,Petrol,86,1.0,0,1300,4,1000
969,969,10500,62.0,47750,Petrol,86,1.0,0,1300,5,1040
615,615,6250,67.0,149000,Diesel,90,1.0,0,2000,5,1135
1402,1402,10500,,47400,Petrol,86,1.0,0,1300,5,1040


In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1436 non-null   int64  
 1   Price       1436 non-null   int64  
 2   Age         1336 non-null   float64
 3   KM          1436 non-null   object 
 4   FuelType    1336 non-null   object 
 5   HP          1436 non-null   object 
 6   MetColor    1286 non-null   float64
 7   Automatic   1436 non-null   int64  
 8   CC          1436 non-null   int64  
 9   Doors       1436 non-null   int64  
 10  Weight      1436 non-null   int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 123.5+ KB


In [122]:
df.drop('Unnamed: 0', axis = 1, inplace=True)
df.columns

Index(['Price', 'Age', 'KM', 'FuelType', 'HP', 'MetColor', 'Automatic', 'CC',
       'Doors', 'Weight'],
      dtype='object')

In [123]:
df["Age"] = df["Age"].fillna(df["Age"].mean())

In [124]:
df.sample(frac=1).head(20)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
1120,7350,72.0,??,Petrol,86,1.0,0,1300,3,1015
76,18750,31.0,25266,Petrol,110,1.0,0,1600,5,1130
553,10995,55.672156,41273,Petrol,97,0.0,0,1400,5,1060
617,6900,59.0,144521,Diesel,72,1.0,0,2000,5,1135
1415,6950,72.0,42000,Petrol,110,,0,1600,3,1050
1300,7950,76.0,71000,Petrol,110,1.0,0,1600,3,1050
712,8750,55.672156,91246,Petrol,86,1.0,0,1300,3,1015
116,21950,8.0,10841,,90,1.0,0,2000,5,1270
330,11000,41.0,33000,Petrol,110,1.0,1,1600,5,1075
595,10500,56.0,19313,Petrol,97,0.0,0,1400,3,1025


In [125]:
df["FuelType"].value_counts().iloc[0]

1177

In [126]:
df['FuelType'] = df['FuelType'].fillna(df['FuelType'].mode()[0])

In [127]:
df.sample(frac=1).head(20)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
694,8950,66.0,97046,Petrol,86,1.0,0,1300,4,1000
295,10950,38.0,41754,Petrol,110,0.0,0,1600,3,1040
1158,6950,72.0,99245,Petrol,110,1.0,0,1600,5,1114
1256,8500,76.0,76268,Petrol,86,1.0,0,1300,3,1015
408,9950,50.0,102030,Petrol,97,1.0,0,1400,5,1060
1286,7950,79.0,72328,Petrol,110,1.0,0,1600,5,1075
472,9950,52.0,65170,Petrol,97,1.0,0,1400,3,1025
509,10900,46.0,53700,Petrol,97,1.0,0,1400,5,1060
1150,9500,79.0,100719,Petrol,110,0.0,0,1600,5,1070
668,9750,64.0,106784,Petrol,110,0.0,1,1600,3,1050


In [128]:
df['FuelType'].isna().sum()

0

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      1436 non-null   int64  
 1   Age        1436 non-null   float64
 2   KM         1436 non-null   object 
 3   FuelType   1436 non-null   object 
 4   HP         1436 non-null   object 
 5   MetColor   1286 non-null   float64
 6   Automatic  1436 non-null   int64  
 7   CC         1436 non-null   int64  
 8   Doors      1436 non-null   int64  
 9   Weight     1436 non-null   int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 112.3+ KB


In [130]:
df['MetColor'] = df['MetColor'].mode()[0]

In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      1436 non-null   int64  
 1   Age        1436 non-null   float64
 2   KM         1436 non-null   object 
 3   FuelType   1436 non-null   object 
 4   HP         1436 non-null   object 
 5   MetColor   1436 non-null   float64
 6   Automatic  1436 non-null   int64  
 7   CC         1436 non-null   int64  
 8   Doors      1436 non-null   int64  
 9   Weight     1436 non-null   int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 112.3+ KB


# Will perform binning on Age

In [132]:
df['Age'].max()

80.0

In [133]:
df['Age'].min()

1.0

In [134]:
df = df.drop('younger_than',axis=1)

KeyError: "['younger_than'] not found in axis"

In [None]:
df['time_period'] = pd.cut(df['Age'], 5, labels=['New', 'Recent', 'Mid', 'Old', 'Ancient'])

In [None]:
df[df['Age'] == 1.0]

In [None]:
df.sample(frac=1).head(20)

In [None]:
df['time_period'].value_counts().plot(kind='bar', xlabel="Period", ylabel="Frequency")

# Data Smoothening will be done on Price

In [None]:
df['Price'].min()

In [None]:
df['Price'].max()

## Explanation
- We are sorting the price before applying the smoothening function (rolling mean) so that the groups that are formed have values that are closer to each other. By doing so, the mean produced and assigned will make more sense.

- Later we are reassigning the smoothened prices with the proper indices

In [None]:
sorted_prices = df['Price'].sort_values().reset_index()
sorted_prices['Smooth_Price'] = sorted_prices['Price'].rolling(window=10).mean()

In [None]:
sorted_prices

In [None]:
index_prices = sorted_prices.sort_values(by='index').set_index('index')
index_prices

In [136]:
df['Smooth_Price'] = index_prices['Smooth_Price']

In [137]:
df.sample(frac=1).head(20)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight,Smooth_Price
1435,6950,76.0,1,Petrol,110,1.0,0,1600,5,1114,6950.0
1299,8950,72.0,71054,Petrol,86,1.0,1,1300,3,1045,8950.0
782,8950,59.0,76162,Petrol,110,1.0,0,1600,5,1075,8950.0
1240,8750,80.0,82021,Petrol,86,1.0,0,1300,5,1035,8750.0
1404,8500,77.0,45507,Petrol,110,1.0,0,1600,3,1050,8498.0
1128,7500,55.672156,109263,Petrol,110,1.0,0,1600,5,1070,7500.0
551,10750,52.0,41700,Petrol,97,1.0,0,1400,3,1025,10750.0
1288,8950,71.0,72128,Petrol,107,1.0,1,1600,5,1100,8950.0
1370,8900,78.0,57628,Petrol,110,1.0,0,1600,5,1075,8900.0
435,10895,53.0,79800,Petrol,110,1.0,0,1600,5,1075,10799.0
