In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Reading dataset

In [2]:
dataset = pd.read_csv('Data.csv')

In [3]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes



**Features and independent variables**

After importing the dataset, we will have to do a very importing thing. Which is differenciating between the matrix of features and the dependent variable vector. Depending on the features or the independent variables, we will predict the outcome of the dependent variable. This is a very important principle in machine learning.

In [4]:
# FEATURES / DEPENDENT VARIABLE MATRICES
# x = dataset.loc[:,['Country','Age','Salary']].values
# # or
x = dataset.iloc[:,:-1].values
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
# INDEPENDENT VARIABLE VECTOR
y = dataset.loc[:,'Purchased'].values
# or
y = dataset.iloc[:,-1].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

Above the .values method will turn the dataset into np arrays.

# Taking care of missing data

In [6]:
from sklearn.impute import SimpleImputer

In [7]:
#1
'''First step is to create an object in which we will define 
what we will replace and how will we replace it.'''

imputer = SimpleImputer(missing_values=np.nan,strategy='mean')

#2
'''In step 2 we will have to apply this object to the matrix of
features.'''

x[:,1:-1] = imputer.fit_transform(x[:,1:-1])
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)


# Encode categorical data
In our dataset we have some columns which contains non-numeric data. For our machine it is not possible to create co-relation between these non-numeric and numeric data. That is why we need to encode categorcal data. Meaning we will have to turn the non-numeric values into numbers. We can generally map different countries with different numbers but the problem we will face if we do that is, then our ML model will think that there are numerical order between countries. It will think that order matters. That is why we will have to encode this country column in to vector columns. Which will help us to avoid this problem.

**Encoding the independent variables**

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [10]:
ct = ColumnTransformer(transformers=[("encoder",OneHotEncoder(),[0])],remainder='passthrough')

x = ct.fit_transform(x)


In [11]:
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)


**Encoding dependent variables**


In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [13]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [14]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes



# Feature Scaling
Feature scaling is a technique which will put all the values of the independent variable in the same range. If we look at the age and the salary column, we will see that those are not in the same range. Age is in the range of 0-100 and salary is in the range of 0-100000. We will have to do it because while training our machine learning model, some of them will create bias while creating corelation computation. Meaning, the features that have higher numbers, will dominates the other features and the other features might not be considered. Things to note that some models automatically fixes this issue. So, we will not have to apply it all the time. 

There are two ways of feature scaling -->

**Standardisation**

Xstand = (x-mean(x))/standard_Deviation(x)

Standardisation will put all the values of all the different features in the range of -3 to +3 (usually).

**Normalisation**

Xnorm = (x-min(x))/(max(x)-min(x))

This will put all the values of all the features between o and 1.

In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

In [16]:
x

array([[ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
         7.58874362e-01,  7.49473254e-01],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
        -1.71150388e+00, -1.43817841e+00],
       [-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
        -1.27555478e+00, -8.91265492e-01],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
        -1.13023841e-01, -2.53200424e-01],
       [-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
         1.77608893e-01,  6.63219199e-16],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
        -5.48972942e-01, -5.26656882e-01],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
         0.00000000e+00, -1.07356980e+00],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
         1.34013983e+00,  1.38753832e+00],
       [-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
         1.63077256e+00,  1.75214693e+00],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
        -2.58340208e-01



# Spliting the dataset into Training set and Test set
With training set we will train out ML model and with test set we will evaluate our model's predictive power on new observations.

In [17]:
from sklearn.model_selection import train_test_split

In [19]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

That random_state argument in optional, it is to make sure to get the same value as the tutorial, which i'm following.

In [23]:
x_train

array([[-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
         1.77608893e-01,  6.63219199e-16],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
        -2.58340208e-01,  2.93712492e-01],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
        -1.71150388e+00, -1.43817841e+00],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
         0.00000000e+00, -1.07356980e+00],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
         1.34013983e+00,  1.38753832e+00],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
        -1.13023841e-01, -2.53200424e-01],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
         7.58874362e-01,  7.49473254e-01],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
        -5.48972942e-01, -5.26656882e-01]])

In [20]:
x_test

array([[-0.81649658,  1.52752523, -0.65465367, -1.27555478, -0.89126549],
       [-0.81649658,  1.52752523, -0.65465367,  1.63077256,  1.75214693]])

In [21]:
y_train

array([1, 1, 1, 0, 1, 0, 0, 1])

In [22]:
y_test

array([0, 0])