# Data Preprocessing Tools

## Importing the libraries

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

## Importing the dataset

In [23]:
dataset = pd.read_csv('iris.csv')
X = dataset.iloc[:, :-1].values  
y = dataset.iloc[:, -1].values 

## Taking care of missing data

In [26]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

## Encoding categorical data

### Encoding the Independent Variable

In [27]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X).toarray() 

### Encoding the Dependent Variable

In [28]:
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(y)

## Splitting the dataset into the Training set and Test set

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [30]:
print(X_train)

[[0.  0.  0.  ... 3.6 1.  0.2]
 [0.  0.  0.  ... 4.4 1.5 0.4]
 [0.  0.  0.  ... 3.1 4.4 1.4]
 ...
 [0.  0.  0.  ... 4.  1.2 0.2]
 [0.  0.  0.  ... 2.6 4.  1.2]
 [0.  0.  0.  ... 3.  5.9 2.1]]


In [31]:
print(X_test)

[[0.  0.  0.  ... 2.8 4.7 1.2]
 [0.  0.  0.  ... 3.8 1.7 0.3]
 [0.  0.  0.  ... 2.6 6.9 2.3]
 ...
 [0.  0.  0.  ... 3.2 5.9 2.3]
 [0.  0.  0.  ... 3.  1.4 0.3]
 [0.  0.  0.  ... 3.1 1.6 0.2]]


In [32]:
print(y_train)

[0 0 1 0 0 2 1 0 0 0 2 1 1 0 0 1 2 2 1 2 1 2 1 0 2 1 0 0 0 1 2 0 0 0 1 0 1
 2 0 1 2 0 2 2 1 1 2 1 0 1 2 0 0 1 1 0 2 0 0 1 1 2 1 2 2 1 0 0 2 2 0 0 0 1
 2 0 2 2 0 1 1 2 1 2 0 2 1 2 1 1 1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1
 1 2 2 0 1 2 0 1 2]


In [33]:
print(y_test)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


## Feature Scaling

In [34]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [35]:
print(X_train)

[[ 0.          0.          0.         ...  1.20365799 -1.56253475
  -1.31260282]
 [ 0.          0.          0.         ...  2.99237573 -1.27600637
  -1.04563275]
 [ 0.          0.          0.         ...  0.08570939  0.38585821
   0.28921757]
 ...
 [ 0.          0.          0.         ...  2.09801686 -1.4479234
  -1.31260282]
 [ 0.          0.          0.         ... -1.0322392   0.15663551
   0.02224751]
 [ 0.          0.          0.         ... -0.13788033  1.24544335
   1.22361279]]


In [36]:
print(X_test)

[[ 0.          0.          0.         ... -0.58505976  0.55777524
   0.02224751]
 [ 0.          0.          0.         ...  1.65083742 -1.16139502
  -1.17911778]
 [ 0.          0.          0.         ... -1.0322392   1.8185001
   1.49058286]
 ...
 [ 0.          0.          0.         ...  0.30929911  1.24544335
   1.49058286]
 [ 0.          0.          0.         ... -0.13788033 -1.33331205
  -1.17911778]
 [ 0.          0.          0.         ...  0.08570939 -1.2187007
  -1.31260282]]
