In [1]:
#importing Libraries and dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values

In [2]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [4]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [5]:
#Taking care of missing data
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
#Encoding Categorical data using LabelEncoder
from sklearn.preprocessing import LabelEncoder
labelEncoder_X = LabelEncoder()
labelEncoder_X.fit_transform(X[:,0])

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

In [7]:
X[:, 0] = labelEncoder_X.fit_transform(X[:, 0]) 
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [8]:
#Encoding Categorical data using OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [0])],    # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                         # Leave the rest of the columns untouched
)
# X = ct.fit_transform(X)
X = np.array(ct.fit_transform(X), dtype=np.long)

In [9]:
X

array([[    1,     0,     0,    44, 72000],
       [    0,     0,     1,    27, 48000],
       [    0,     1,     0,    30, 54000],
       [    0,     0,     1,    38, 61000],
       [    0,     1,     0,    40, 63777],
       [    1,     0,     0,    35, 58000],
       [    0,     0,     1,    38, 52000],
       [    1,     0,     0,    48, 79000],
       [    0,     1,     0,    50, 83000],
       [    1,     0,     0,    37, 67000]])

In [10]:
labelEncoder_Y = LabelEncoder()
Y = labelEncoder_X.fit_transform(Y) 
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [11]:
#Splitting the dataset into Trainning set & Test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [12]:
X_train

array([[    0,     1,     0,    40, 63777],
       [    1,     0,     0,    37, 67000],
       [    0,     0,     1,    27, 48000],
       [    0,     0,     1,    38, 52000],
       [    1,     0,     0,    48, 79000],
       [    0,     0,     1,    38, 61000],
       [    1,     0,     0,    44, 72000],
       [    1,     0,     0,    35, 58000]])

In [13]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [14]:
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.27978024,  0.12374357],
       [ 1.        , -0.37796447, -0.77459667, -0.23673712,  0.4617671 ],
       [-1.        , -0.37796447,  1.29099445, -1.95846165, -1.53092514],
       [-1.        , -0.37796447,  1.29099445, -0.06456467, -1.11141099],
       [ 1.        , -0.37796447, -0.77459667,  1.65715986,  1.72030956],
       [-1.        , -0.37796447,  1.29099445, -0.06456467, -0.16750414],
       [ 1.        , -0.37796447, -0.77459667,  0.96847005,  0.98615979],
       [ 1.        , -0.37796447, -0.77459667, -0.58108203, -0.48213975]])

In [15]:
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.44194429, -0.90165391],
       [-1.        ,  2.64575131, -0.77459667,  2.00150476,  2.13982372]])