# Data Preprocessing Tools

## Importing the libraries

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [0]:
dataset = pd.read_csv('Demo.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [0]:
print(X)

[['andhra' 40 5]
 ['maharastra' 52 4]
 ['delhi' 54 5]
 ['tamilnadu' 51 2]
 ['kerala' 41 8]
 ['tamilnadu' 41 6]
 ['kerala' 52 5]
 ['kerala' 53 7]
 ['maharastra' 51 9]
 ['delhi' 54 7]
 ['andhra' 51 5]
 ['andhra' 52 8]
 ['tamilnadu' 54 6]
 ['maharastra' 51 8]
 ['kerala' 23 8]
 ['delhi' 25 7]
 ['delhi' 56 5]
 ['delhi' 26 2]
 ['tamilnadu' 29 1]]


In [0]:
print(y)

['y' 'n' 'y' 'y' 'n' 'y' 'y' 'y' 'n' 'n' 'y' 'y' 'y' 'y' 'n' 'n' 'n' 'y'
 'n']


## Taking care of missing data

In [0]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [0]:
print(X)

[['andhra' 40.0 5.0]
 ['maharastra' 52.0 4.0]
 ['delhi' 54.0 5.0]
 ['tamilnadu' 51.0 2.0]
 ['kerala' 41.0 8.0]
 ['tamilnadu' 41.0 6.0]
 ['kerala' 52.0 5.0]
 ['kerala' 53.0 7.0]
 ['maharastra' 51.0 9.0]
 ['delhi' 54.0 7.0]
 ['andhra' 51.0 5.0]
 ['andhra' 52.0 8.0]
 ['tamilnadu' 54.0 6.0]
 ['maharastra' 51.0 8.0]
 ['kerala' 23.0 8.0]
 ['delhi' 25.0 7.0]
 ['delhi' 56.0 5.0]
 ['delhi' 26.0 2.0]
 ['tamilnadu' 29.0 1.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [0]:
print(X)

[[1.0 0.0 0.0 0.0 0.0 40.0 5.0]
 [0.0 0.0 0.0 1.0 0.0 52.0 4.0]
 [0.0 1.0 0.0 0.0 0.0 54.0 5.0]
 [0.0 0.0 0.0 0.0 1.0 51.0 2.0]
 [0.0 0.0 1.0 0.0 0.0 41.0 8.0]
 [0.0 0.0 0.0 0.0 1.0 41.0 6.0]
 [0.0 0.0 1.0 0.0 0.0 52.0 5.0]
 [0.0 0.0 1.0 0.0 0.0 53.0 7.0]
 [0.0 0.0 0.0 1.0 0.0 51.0 9.0]
 [0.0 1.0 0.0 0.0 0.0 54.0 7.0]
 [1.0 0.0 0.0 0.0 0.0 51.0 5.0]
 [1.0 0.0 0.0 0.0 0.0 52.0 8.0]
 [0.0 0.0 0.0 0.0 1.0 54.0 6.0]
 [0.0 0.0 0.0 1.0 0.0 51.0 8.0]
 [0.0 0.0 1.0 0.0 0.0 23.0 8.0]
 [0.0 1.0 0.0 0.0 0.0 25.0 7.0]
 [0.0 1.0 0.0 0.0 0.0 56.0 5.0]
 [0.0 1.0 0.0 0.0 0.0 26.0 2.0]
 [0.0 0.0 0.0 0.0 1.0 29.0 1.0]]


### Encoding the Dependent Variable

In [0]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [0]:
print(y)

[1 0 1 1 0 1 1 1 0 0 1 1 1 1 0 0 0 1 0]


## Splitting the dataset into the Training set and Test set

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)

In [0]:
print(X_train)

[[0.0 0.0 0.0 1.0 0.0 51.0 9.0]
 [0.0 0.0 0.0 0.0 1.0 29.0 1.0]
 [0.0 0.0 1.0 0.0 0.0 23.0 8.0]
 [0.0 1.0 0.0 0.0 0.0 56.0 5.0]
 [0.0 0.0 1.0 0.0 0.0 52.0 5.0]
 [0.0 0.0 1.0 0.0 0.0 41.0 8.0]
 [0.0 1.0 0.0 0.0 0.0 54.0 5.0]
 [0.0 0.0 0.0 0.0 1.0 41.0 6.0]
 [0.0 0.0 0.0 1.0 0.0 51.0 8.0]
 [0.0 1.0 0.0 0.0 0.0 54.0 7.0]
 [0.0 0.0 1.0 0.0 0.0 53.0 7.0]
 [0.0 1.0 0.0 0.0 0.0 26.0 2.0]
 [1.0 0.0 0.0 0.0 0.0 52.0 8.0]
 [0.0 0.0 0.0 0.0 1.0 51.0 2.0]
 [1.0 0.0 0.0 0.0 0.0 40.0 5.0]
 [0.0 1.0 0.0 0.0 0.0 25.0 7.0]
 [0.0 0.0 0.0 0.0 1.0 54.0 6.0]]


In [0]:
print(X_test)

[[1.0 0.0 0.0 0.0 0.0 51.0 5.0]
 [0.0 0.0 0.0 1.0 0.0 52.0 4.0]]


In [0]:
print(y_train)

[0 0 0 0 1 0 1 1 1 0 1 1 1 1 1 0 1]


In [0]:
print(y_test)

[1 0]


## Feature Scaling

In [0]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [0]:
print(X)

[[ 2.30940108 -0.5976143  -0.51639778 -0.4330127  -0.51639778 -0.46154913
  -0.31075944]
 [-0.4330127  -0.5976143  -0.51639778  2.30940108 -0.51639778  0.63463005
  -0.76494631]
 [-0.4330127   1.67332005 -0.51639778 -0.4330127  -0.51639778  0.81732658
  -0.31075944]
 [-0.4330127  -0.5976143  -0.51639778 -0.4330127   1.93649167  0.54328179
  -1.67332005]
 [-0.4330127  -0.5976143   1.93649167 -0.4330127  -0.51639778 -0.37020086
   1.05180118]
 [-0.4330127  -0.5976143  -0.51639778 -0.4330127   1.93649167 -0.37020086
   0.14342743]
 [-0.4330127  -0.5976143   1.93649167 -0.4330127  -0.51639778  0.63463005
  -0.31075944]
 [-0.4330127  -0.5976143   1.93649167 -0.4330127  -0.51639778  0.72597832
   0.5976143 ]
 [-0.4330127  -0.5976143  -0.51639778  2.30940108 -0.51639778  0.54328179
   1.50598805]
 [-0.4330127   1.67332005 -0.51639778 -0.4330127  -0.51639778  0.81732658
   0.5976143 ]
 [ 2.30940108 -0.5976143  -0.51639778 -0.4330127  -0.51639778  0.54328179
  -0.31075944]
 [ 2.30940108 -0.5976