# Data Preprocessing Tools

## Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [3]:
dataset = pd.read_csv("Data.csv")
# iloc is indexed as [rows, columns]: ":" keeps every row, ":-1" keeps every column
# except the last one (the features) and "-1" picks only the last column (the target).
# .copy() gives X its own data, so the in-place imputation done on X later in the
# notebook neither warns about nor writes through to a slice of `dataset`.
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]
print(X)
print(y)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0
0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


## Taking care of missing data

In [4]:
from sklearn.impute import SimpleImputer

# Replace every NaN in the numeric columns (positions 1 and 2: Age, Salary) with that
# column's mean. numpy is already available as `np` from the notebook's import cell.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
X.iloc[:, 1:3] = imputer.fit_transform(X.iloc[:, 1:3])
print(X)


   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


In [5]:
# Show the column dtypes before casting.
print(X.dtypes)

# Cast the two numeric columns to integers.
# NOTE: astype("int") drops the fractional part rather than rounding, so the imputed
# means 38.777778 and 63777.777778 become 38 and 63777.
X = X.astype(dict.fromkeys(["Age", "Salary"], "int"))
print(X)

Country     object
Age        float64
Salary     float64
dtype: object
   Country  Age  Salary
0   France   44   72000
1    Spain   27   48000
2  Germany   30   54000
3    Spain   38   61000
4  Germany   40   63777
5   France   35   58000
6    Spain   38   52000
7   France   48   79000
8  Germany   50   83000
9   France   37   67000


## Encoding categorical data

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (Country) and pass the remaining columns through unchanged.
# The encoded indicator columns come first in the result, followed by Age and Salary.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
)
X = np.array(ct.fit_transform(X))

In [7]:
# Show the feature matrix after one-hot encoding: three country indicator columns
# followed by Age and Salary.
print(X)

[[1.0000e+00 0.0000e+00 0.0000e+00 4.4000e+01 7.2000e+04]
 [0.0000e+00 0.0000e+00 1.0000e+00 2.7000e+01 4.8000e+04]
 [0.0000e+00 1.0000e+00 0.0000e+00 3.0000e+01 5.4000e+04]
 [0.0000e+00 0.0000e+00 1.0000e+00 3.8000e+01 6.1000e+04]
 [0.0000e+00 1.0000e+00 0.0000e+00 4.0000e+01 6.3777e+04]
 [1.0000e+00 0.0000e+00 0.0000e+00 3.5000e+01 5.8000e+04]
 [0.0000e+00 0.0000e+00 1.0000e+00 3.8000e+01 5.2000e+04]
 [1.0000e+00 0.0000e+00 0.0000e+00 4.8000e+01 7.9000e+04]
 [0.0000e+00 1.0000e+00 0.0000e+00 5.0000e+01 8.3000e+04]
 [1.0000e+00 0.0000e+00 0.0000e+00 3.7000e+01 6.7000e+04]]


### Encoding the Dependent Variable


In [8]:
from sklearn.preprocessing import LabelEncoder

# Turn the 'No' / 'Yes' target labels into integers (here 'No' -> 0, 'Yes' -> 1).
# `le` keeps the fitted label classes.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [9]:
# `y` was already label-encoded by the previous cell. Fitting a fresh LabelEncoder on
# the encoded integers would overwrite `le`, replacing its learned classes
# ('No', 'Yes') with (0, 1) and losing the mapping back to the original labels.
# So the encoded target is only displayed here, not re-encoded.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. A fixed random_state makes the shuffle,
# and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
from sklearn.model_selection import train_test_split

# NOTE: this repeats the split from the preceding cell with the same arguments.
# Because random_state is fixed, it yields the same train/test partition again.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1, test_size=0.2)

In [12]:
# Show the training features (8 of the 10 rows), still unscaled at this point.
print(X_train)


[[0.0000e+00 0.0000e+00 1.0000e+00 3.8000e+01 5.2000e+04]
 [0.0000e+00 1.0000e+00 0.0000e+00 4.0000e+01 6.3777e+04]
 [1.0000e+00 0.0000e+00 0.0000e+00 4.4000e+01 7.2000e+04]
 [0.0000e+00 0.0000e+00 1.0000e+00 3.8000e+01 6.1000e+04]
 [0.0000e+00 0.0000e+00 1.0000e+00 2.7000e+01 4.8000e+04]
 [1.0000e+00 0.0000e+00 0.0000e+00 4.8000e+01 7.9000e+04]
 [0.0000e+00 1.0000e+00 0.0000e+00 5.0000e+01 8.3000e+04]
 [1.0000e+00 0.0000e+00 0.0000e+00 3.5000e+01 5.8000e+04]]


In [13]:
# Show the held-out test features (the remaining 2 rows), still unscaled at this point.
print(X_test)

[[0.0e+00 1.0e+00 0.0e+00 3.0e+01 5.4e+04]
 [1.0e+00 0.0e+00 0.0e+00 3.7e+01 6.7e+04]]


In [14]:
# Show the encoded target values for the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [15]:
# Show the encoded target values for the test rows.
print(y_test)

[0 1]


## Feature Scaling

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardize only the numeric columns (index 3 onward: Age and Salary); the one-hot
# country columns 0-2 are left as 0/1 indicators.
# The scaler is fitted on the training rows only and then reused for the test rows,
# so no statistics from the test set are used.
sc = StandardScaler()
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [17]:
# Show the training features after scaling: the last two columns (Age, Salary) are
# standardized, the three country indicator columns are unchanged.
print(X_train)

[[ 0.          0.          1.         -0.28942984 -1.07811699]
 [ 0.          1.          0.          0.         -0.07018988]
 [ 1.          0.          0.          0.57885968  0.63357038]
 [ 0.          0.          1.         -0.28942984 -0.30785767]
 [ 0.          0.          1.         -1.88129397 -1.42045447]
 [ 1.          0.          0.          1.15771937  1.23266096]
 [ 0.          1.          0.          1.44714921  1.57499844]
 [ 1.          0.          0.         -0.72357461 -0.56461078]]


In [23]:
# Show columns 1 and 2 of the training features, i.e. the second and third country
# indicator columns, which the scaling step did not modify.
print(X_train[:, 1:3])

[[0. 1.]
 [1. 0.]
 [0. 0.]
 [0. 1.]
 [0. 1.]
 [0. 0.]
 [1. 0.]
 [0. 0.]]


In [18]:

# Show the test features after applying the scaler that was fitted on the training rows.
print(X_test)

[[ 0.          1.          0.         -1.44714921 -0.90694825]
 [ 1.          0.          0.         -0.43414476  0.20564854]]
