## Data Preprocessing

In [1]:
# 1. Getting Dataset
# 2. Importing Libraries
# 3. Importing Datasets
# 4. Finding Missing Values
# 5. Encoding Categorical Data
# 6. Splitting Dataset into Training & Test Set
# 7. Feature Scaling

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("data.csv")

In [8]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [10]:
df.shape

(10, 4)

In [16]:
# Separate Dependent and Independent Variables
x = df[['Country', 'Age', 'Salary']].values
y = df[['Purchased']]

## Handling Missing Values

In [17]:
# 1. Dropping rows containing Missing Values
# 2. Replacing NaN by calculating Mean, Median, Mode

from sklearn.impute import SimpleImputer

In [24]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [26]:
imp = imp.fit(x[:,1:3])

In [21]:
x[:,1:3] = imp.transform(x[:,1:3])

In [22]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding Categorical Data

In [27]:
from sklearn.preprocessing import LabelEncoder

In [28]:
label_encode_x = LabelEncoder()

In [29]:
x[:,0] = label_encode_x.fit_transform(x[:,0])

In [30]:
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [31]:
from sklearn.preprocessing import OneHotEncoder

In [32]:
oneHotEncoder = OneHotEncoder()

In [33]:
oneHotEncoder.fit_transform(df.Country.values.reshape(-1,1)).toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [34]:
label_encoder_y = LabelEncoder()

In [40]:
y = label_encoder_y.fit_transform(y)

In [41]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

## Splitting Dataset into Training & Test Set

In [42]:
from sklearn.model_selection import train_test_split

In [46]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)

## Feature Scaling

In [47]:
from sklearn.preprocessing import StandardScaler

In [48]:
sc_x = StandardScaler()

In [50]:
X_train = sc_x.fit_transform(X_train)

In [53]:
X_test = sc_x.fit_transform(X_test)

In [54]:
X_test

array([[ 0., -1., -1.],
       [ 0.,  1.,  1.]])