In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Read the dataset and split into X, y arrays

In [2]:
# Load the raw CSV into a DataFrame. 'Data.csv' is a relative path, so it is resolved
# against the current working directory.
dataset = pd.read_csv('Data.csv')

In [3]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Split the frame positionally: every column except the last is a feature,
# the last column is the target. Both are exposed as plain NumPy arrays.
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X, y = feature_frame.values, target_column.values

In [5]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Handle Missing Data

In [7]:
from sklearn.preprocessing import Imputer

In [8]:
#Create imputer object, that replaces missing values, with the average along columns(axis=0)
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis=0)

'''
Calculate statistics avg/mean/mode and store it internally in the imputer object
This case it calculates separate averages for columns 1 and 2
(Note: Only passing float-columns, as imputer dosen't accept string columns)
'''
imputer = imputer.fit(X[:,1:3])

'''
Apply those statistics to transform data (In this case replace nan's with avgs calculated in previous step)
Note: If you apply transform to some other data, it would replace that value with avg value for that column
calculated in the previous step, thus we can use this technique to transform test data based on avgs/stats 
calculated on the training data
'''
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [9]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Process Categorical data

In [10]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [11]:
# Step 1 of categorical handling: map each distinct string in column 0 to an
# integer code in the range 0 .. num_classes - 1.
labelencoder_X = LabelEncoder()
labelencoder_X.fit(X[:, 0])
# Overwrite column 0 in place with its integer codes.
X[:, 0] = labelencoder_X.transform(X[:, 0])

In [12]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [13]:
# One-hot encode the country codes so the model cannot learn a false ranking between
# categories (e.g. 2 > 0 would read as Spain > France, 1 < 2 as Germany < Spain).
#
# `OneHotEncoder(categorical_features=[0])` was deprecated in scikit-learn 0.20 and removed
# in 0.22. Instead, only the categorical column is handed to the encoder and the numeric
# columns are stitched back on afterwards; this form works on old and new releases alike.
onehotencoder_X = OneHotEncoder()

# One-hot data is mostly zeros, so by default the encoder returns a sparse matrix;
# .toarray() converts it to a dense ndarray (note: no underscore between 'to' and 'array').
# X[:, [0]] (list index) keeps the column two-dimensional, which the encoder requires.
country_onehot = onehotencoder_X.fit_transform(X[:, [0]]).toarray()

# The result has a different shape than the input (one dummy column per category), so X is
# rebuilt: dummy columns first, followed by the remaining columns cast from object to float.
X = np.hstack([country_onehot, X[:, 1:].astype(float)])

In [17]:
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [14]:
# How the target is represented depends on the model. One-hot encoding would suit e.g. a
# neural network with 2 output neurons, but with only two classes ('No' / 'Yes') a single
# 0/1 label is sufficient for a binary classifier.
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)

In [15]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

### Create Train-Test Split

In [18]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the shuffle,
# and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [23]:
X_train, y_train

(array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
         6.37777778e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
         6.70000000e+04],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
         4.80000000e+04],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
         5.20000000e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
         7.90000000e+04],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
         6.10000000e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
         7.20000000e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
         5.80000000e+04]]), array([1, 1, 1, 0, 1, 0, 0, 1]))

In [24]:
X_test, y_test

(array([[0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
        [0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04]]), array([0, 0]))

### Feature Scaling

In [25]:
'''
Different features/columns here are on very different scales,
e.g. Age is roughly in the range 27 to 50, whereas Salary is roughly in the range 48000 to 83000.
Many machine learning algorithms rely on the Euclidean distance (a sum of squared differences),
so features/columns with larger values will dominate those with smaller values and the model might be biased
towards those columns.
Hence it's better to bring all values onto the same scale (say 0 to 1, -1 to 1, etc.)

Why, When and How of feature Scaling!
https://medium.com/greyatom/why-how-and-when-to-scale-your-features-4b30ab09db5e
'''
from sklearn.preprocessing import StandardScaler

In [28]:
# Standardise every column: z = (x - mean) / std_dev.
# The mean and std_dev are learned from the training set only and then reused for the
# test set, so both sets are scaled with exactly the same parameters.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [29]:
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [30]:
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])