In [201]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

IMPORTING THE DATASET

In [202]:
dataset = pd.read_csv('Data.csv')
#if the dataset file is seperated by ; use a delimiter or sep e.g
#dataset = pd.read_csv('Data.csv', delimiter=';')

#create X & y entities. note that : indicates all values in the range, -1 indicates last column and upper range is excluded 
# so always increase by 1

#seperating the features
X = dataset.iloc[: , :-1].values
# or if aware of the target you can do this e.g
# dataset.drop('Purchased', axis=1)- where axis 1 is column

#seperating the target
y = dataset.iloc[:, -1].values
#or if you are aware of the target name 
# y = dataset['Purchased']

In [203]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [204]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


TAKING CARE OF MISSING DATA

In [205]:
#checking for null values. true means there is a null value there
dataset.isnull()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [206]:
#calculating total null values in a column
dataset.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [207]:
#calculating the total null values in the whole dataset
dataset.isnull().sum().sum()

2

In [208]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy='mean')

#applying the imputer object on the matrix of features x
imputer.fit(X[:, 1:3])

#transform will apply the transform strategy in this case 'mean'
X[:, 1:3] = imputer.transform(X[:, 1:3]) 

In [209]:
#printing the new matrix of features x
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


ENCODING CATEGORICAL DATA

- Encoding the independent varaible

In [210]:
#importing column transformer class from scikit-learn library
from sklearn.compose import ColumnTransformer

#importing one hot encoding class from the pre-processing module
from sklearn.preprocessing import OneHotEncoder

#you can create a list containing the categorical features and call it in ColumnTransformer instance e.g
# categorical_features = ['Country']

#Implementing an instance from the column transformer class using a variable ct
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[0])], remainder= 'passthrough')
#NOTE- the [0] can be replaced with a list that contains selected categorical features

#Applying the fit_transform method on the instance of ColumnTransformer
X = ct.fit_transform(X)
# OR X = ct.fit_transform(dataset))

#remember to convert the output into a numpy array for independent variables
X = np.array(X)  

In [211]:
#printing the matrix of features
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


- Encoding the dependent variable

In [212]:
#using LabelEncoder to encode binary categorical data
from sklearn.preprocessing import LabelEncoder 

#since its only one column in y nothing will be written in the parenthesis
le = LabelEncoder()

#output does not need to be converted into a numpy array because its a dependent variable
y = le.fit_transform(y)
#Note- it can also be written by selecting the column with dependent variable e.g
# y = le.fit_transform(dataset['purchased'])

In [213]:
#printing the dependent variable vector
print(y)

[0 1 0 0 1 1 0 1 0 1]


SPLITTING THE DATASET INTO THE TRAINING AND TEST SET

In [214]:
from sklearn.model_selection import train_test_split

#Note- test size is 20%, if random state has an integer usually 42, it will produce same result whenever the dataset is splitted
# else if 0, it will produce a different result each time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [215]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [216]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [217]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [218]:
print(y_test)

[0 1]


FEATURE SCALING -
This is not always used in every machine learning model.
Note feature scaling will not be applied to the dummy column(country)

In [219]:
#Applying feature scaling on the training and test sets using standardscaler

In [226]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.fit_transform(X_test[:, 3:])                            


In [227]:
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [228]:
print(X_test)

[[0.0 1.0 0.0 -1.0 -1.0]
 [1.0 0.0 0.0 1.0 1.0]]
