In [140]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [141]:
# Load the raw data; the relative path is resolved against the kernel's working directory.
dataset = pd.read_csv('Data.csv')

In [142]:
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [143]:
dataset.shape

(10, 4)

In [144]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [145]:
# Feature matrix: every row, all columns except the last one.
X = dataset.iloc[:, :-1].to_numpy()
# Target vector (dependent variable): every row, last column only.
y = dataset.iloc[:, -1].to_numpy()

In [146]:
# Disabled experiment: uncommenting this overwrites X[3][0] (row 3, column 0 -- the
# Country entry) with NaN, presumably to test how the imputer treats non-numeric data.
# X[3][0] = np.nan

In [147]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [148]:
type(X)

numpy.ndarray

### Replace missing data with mean of values in column:

In [149]:
from sklearn.impute import SimpleImputer

Initialize the imputer object, which is used to fill in missing values in a NumPy array:

In [150]:
# Entries equal to NaN are treated as missing; each is filled with its column's mean.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

The mean of each selected column is computed and stored during the FIT step (here it is computed on the full dataset, because no train/test split has been made yet):

In [151]:
# Learn the per-column means of columns 1 and 2 (Age, Salary).
# NOTE(review): this fits on all 10 rows, before the train/test split further down, so the
# rows that later form the test set contribute to the stored means. Fitting on the
# training rows only would avoid that leakage.
imputer.fit(X[:, 1:3])  # fit(X, y=None)

Stored data in imputer:

In [152]:
imputer.statistics_

array([3.87777778e+01, 6.37777778e+04])

Mean statistic is applied to the data during TRANSFORM step:

In [153]:
# Overwrite columns 1 and 2 (Age, Salary) of X in place with their imputed versions:
# NaN entries are replaced by the means stored in imputer.statistics_.
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [154]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

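Is the imputer only applied to numerical data? Here, yes: the `mean` strategy only works on numeric columns, which is why only columns 1 and 2 (Age, Salary) were passed to it. For categorical columns, `SimpleImputer` can be used with `strategy='most_frequent'` or `strategy='constant'` instead.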

### Replace categorical data with encoded values:

#### Encode the independent categorical values (countries):

In [155]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [156]:
# One-hot encode column 0 (Country); every other column is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [0]),
    ],
    remainder='passthrough',
)

In [157]:
# Fit the column transformer and apply it in one step, wrapping the result in a NumPy array.
# NOTE(review): X is overwritten, so this cell is not idempotent -- running it a second time
# would one-hot encode the new column 0 again. Re-run from the cell that defines X instead.
X = np.array(ct.fit_transform(X))

#### Encode the dependent categorical values (purchased yes/no):

In [158]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [159]:
from sklearn.preprocessing import LabelEncoder

In [160]:
# LabelEncoder is meant for the target vector (y), not for feature columns; it maps each
# distinct label to an integer (here 'No' -> 0 and 'Yes' -> 1).
le = LabelEncoder()
y = le.fit_transform(y)

In [161]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [162]:
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Should we apply feature scaling before splitting into training/test set or after splitting? AFTER!

This prevents information leakage when testing your model on the test set. 

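If the scaler is fitted before splitting, its statistics (e.g. the mean and standard deviation) are computed from the test rows as well, so information about the test set leaks into training and the test set is no longer representative of unseen future data. Instead, fit the scaler on the training set only and use that fitted scaler to transform both the training set and the test set. (Strictly speaking, the same reasoning applies to the mean imputation above, which was fitted on the full dataset.)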

### Split data into training and test set:

In [163]:
from sklearn.model_selection import train_test_split

In [164]:
# Hold out 20% of the rows (2 of the 10) as the test set; the fixed random_state
# makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [165]:
X_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [166]:
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [167]:
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [168]:
y_test

array([0, 1])