## <span style="color : green"> Data Pre-Processing </span>

# <center> Table of Contents </center>

1. To find the percentage of missing values, duplicate values
2. Impute the missing data - interpolation, mean, mode and Droping column
1. Compute Z-Score for an attribute 
1. Apply the normalization technique - MinMax Scalar, Standard Scalar, Z-Score Normalization
1. Convert the categorical to numerical by LabelEncoder and One-Hot Encoder 

In [32]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from warnings import filterwarnings

filterwarnings('ignore')

In [5]:
df = pd.read_csv('../Datasets/Toyato.csv')
df.head()

Unnamed: 0,Price,Age,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,Automatic,cc,Doors,Gears,Quarterly_Tax,Weight,Mfr_Guarantee,BOVAG_Guarantee,Guarantee_Period
0,12950,23,10,2002,46986,Diesel,90,1,0,2000,3.0,Five,210,1165,0.0,1,3
1,12950,23,10,2002,72937,Diesel,90,1,0,2000,3.0,Five,210,1165,0.0,1,3
2,12950,24,9,2002,41711,Diesel,90,1,0,2000,3.0,Five,210,1165,1.0,1,3
3,13950,26,7,2002,48000,Diesel,90,0,0,2000,3.0,Five,210,1165,1.0,1,4
4,17950,30,3,2002,38500,Diesel,90,0,0,2000,3.0,Five,210,1170,1.0,1,5


### To find the percentage of missing values, duplicate values

In [9]:
# Percentage of missing values
missingPercent = (df.isna().sum() / len(df)) *100
missingPercent[missingPercent > 0 ]

Doors            12.0
Mfr_Guarantee    46.0
dtype: float64

In [30]:
# Percentage of duplicate values

duplicatePercent = df.duplicated().sum() * 100 / len(df)
print(f'The percentage of duplicated values - {duplicatePercent}')

The percentage of duplicated values - 0.0


### Impute the missing data - interpolation, mean, mode and Droping column

In [21]:
# Using mean
tf = df.copy()
tf['Doors'].fillna(tf['Doors'].mean(), inplace=True)
tf.isna().sum()

Price                0
Age                  0
Mfg_Month            0
Mfg_Year             0
KM                   0
Fuel_Type            0
HP                   0
Met_Color            0
Automatic            0
cc                   0
Doors                0
Gears                0
Quarterly_Tax        0
Weight               0
Mfr_Guarantee       23
BOVAG_Guarantee      0
Guarantee_Period     0
dtype: int64

In [22]:
# Using mode 
tf = df.copy()
tf['Doors'].fillna(tf['Doors'].mode()[0], inplace=True)
tf.isna().sum()

Price                0
Age                  0
Mfg_Month            0
Mfg_Year             0
KM                   0
Fuel_Type            0
HP                   0
Met_Color            0
Automatic            0
cc                   0
Doors                0
Gears                0
Quarterly_Tax        0
Weight               0
Mfr_Guarantee       23
BOVAG_Guarantee      0
Guarantee_Period     0
dtype: int64

In [24]:
# Nearest value
tf = df.copy()
tf['Doors'].fillna(tf['Doors'].interpolate(), inplace=True)
tf.isna().sum()

Price                0
Age                  0
Mfg_Month            0
Mfg_Year             0
KM                   0
Fuel_Type            0
HP                   0
Met_Color            0
Automatic            0
cc                   0
Doors                0
Gears                0
Quarterly_Tax        0
Weight               0
Mfr_Guarantee       23
BOVAG_Guarantee      0
Guarantee_Period     0
dtype: int64

### Compute Z-Score for an attribute

In [75]:
df['Z-Score-Price'] = (df.Price - df.Price.mean()) / df.Price.std()
df.head(2)

Unnamed: 0,Price,Age,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,Automatic,cc,Doors,Gears,Quarterly_Tax,Weight,Mfr_Guarantee,BOVAG_Guarantee,Guarantee_Period,Z-Score-Price
0,12950,23,10,2002,46986,Diesel,90,1,0,2000,3.0,Five,210,1165,0.0,1,3,-0.849159
1,12950,23,10,2002,72937,Diesel,90,1,0,2000,3.0,Five,210,1165,0.0,1,3,-0.849159


### Apply the normalization technique - MinMax Scalar, Standard Scalar, Z-Score Normalization

In [50]:
features = df.drop(['Price','Fuel_Type','Gears'], axis=1)
print(list(features.columns))

['Age', 'Mfg_Month', 'Mfg_Year', 'KM', 'HP', 'Met_Color', 'Automatic', 'cc', 'Doors', 'Quarterly_Tax', 'Weight', 'Mfr_Guarantee', 'BOVAG_Guarantee', 'Guarantee_Period']


In [73]:
# MinMax Scalar 

scalar = MinMaxScaler()
scaled_features = scalar.fit_transform(features)
scaled_features[:1]

array([[0.1       , 0.9       , 0.        , 0.36994059, 0.17073171,
        1.        , 0.        , 1.        , 0.        , 0.88837209,
        0.48780488, 0.        , 1.        , 0.        ]])

In [72]:
# Standard Scalar 

scalar = StandardScaler()
scaled_features = scalar.fit_transform(features)
scaled_features[:1]

array([[-1.04767205,  1.04767205,  0.        ,  0.04379574, -0.6497924 ,
         0.81649658, -0.14285714,  1.30049203, -0.39735971,  1.34767273,
         0.25416539, -1.11803399,  0.25264558, -0.46852129]])

In [74]:
# Z-Score Normalization 

scaled_features = (features - features.mean())/features.std()
scaled_features.head(2)

Unnamed: 0,Age,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,cc,Doors,Quarterly_Tax,Weight,Mfr_Guarantee,BOVAG_Guarantee,Guarantee_Period
0,-1.037142,1.037142,,0.043356,-0.643262,0.80829,-0.141421,1.287421,-0.392818,1.334128,0.251611,-1.097134,0.250106,-0.463812
1,-1.037142,1.037142,,1.127391,-0.643262,0.80829,-0.141421,1.287421,-0.392818,1.334128,0.251611,-1.097134,0.250106,-0.463812


### Convert the categorical to numerical by LabelEncoder and One-Hot Encoder

In [56]:
labels = df[['Fuel_Type']]

In [70]:
# Label Encoder

encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
print(encoded_labels)

[0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 0 0 0 1 0 0]


In [71]:
# One Hot Encoder 
encoded_result = []

encoder = OneHotEncoder()
encoded_labels = encoder.fit_transform(labels)

for i in range(len( encoded_labels.toarray())):
    encoded_result.append(np.argmax(encoded_labels.toarray()[i]))

print(encoded_result)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0]
