In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
dataFrame = pd.read_csv("Data.csv")
dataFrame

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [13]:
x = dataFrame.iloc[:, :-1].values
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [14]:
y = dataFrame.iloc[:,-1].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Missing data

In [17]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.NaN, strategy="mean")
imputer.fit(x[:,1:3])

x[:,1:3] = imputer.transform(x[:,1:3])

In [18]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Encoding categorical data

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [32]:
ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [0])], remainder="passthrough")

In [33]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [34]:
x = np.array(ct.fit_transform(x))

In [35]:
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
labelEncoder = LabelEncoder()

In [38]:
y = labelEncoder.fit_transform(y)

In [39]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Training and testing dataset

In [40]:
from sklearn.model_selection import train_test_split

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25)

In [46]:
x_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 0.0, 1.0, 38.0, 61000.0]], dtype=object)

In [47]:
x_test

array([[0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0]], dtype=object)

In [48]:
y_train

array([0, 0, 1, 1, 0, 1, 0])

## Feature Scaling

In [49]:
# Standardization is a recommended scaling applied after train test split
# std = x - mean(x)/stdDev(x)

In [50]:
from sklearn.preprocessing import StandardScaler

In [51]:
ss = StandardScaler()

In [52]:
x_train[:, 3:] = ss.fit_transform(x_train[:,3:])
x_test[:, 3:] = ss.fit_transform(x_test[:,3:])

In [53]:
x_train

array([[0.0, 0.0, 1.0, 0.43614234845757466, -0.9565180032646371],
       [1.0, 0.0, 0.0, 1.3927478994078555, 1.5335908482094183],
       [1.0, 0.0, 0.0, 0.11048939494258525, 0.9110636353409045],
       [0.0, 0.0, 1.0, -1.7213084685792293, -1.454539773559448],
       [0.0, 1.0, 0.0, -1.171769109522685, -0.7075071181172315],
       [0.0, 1.0, 0.0, 0.6600287539991296, 0.5098794314923071],
       [0.0, 0.0, 1.0, 0.2936691812947667, 0.16403097989868784]],
      dtype=object)

In [54]:
x_test

array([[0.0, 1.0, 0.0, 0.8521330020990451, 0.8816259554501988],
       [1.0, 0.0, 0.0, -1.40351317992784, -1.3984411707141073],
       [1.0, 0.0, 0.0, 0.5513801778287938, 0.5168152152639098]],
      dtype=object)