In [1]:
import numpy as np
import pandas as pd

# importing the data set

In [2]:
# Load the raw data; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='Data.csv')

In [3]:
# Preview the first five rows of the loaded data.
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


# Extracting dependent and independent variables

In [4]:
# Independent variables (features): every column except the last one.
X = dataset.iloc[:, :-1].values
# Dependent variable (target): the last column. It is selected with -1 rather
# than a hard-coded position so it always complements the feature slice above.
y = dataset.iloc[:, -1].values

In [5]:
# Echo the feature matrix to check the extraction (Country, Age, Salary).
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
# Echo the target vector (the Purchased column).
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Now we have to take care of the missing data.

'''
To handle missing values in the dataset, we replace each missing entry with the mean of its column.
This preprocessing step can be done with scikit-learn (sklearn). Older versions provided a sub-library called
preprocessing for this kind of preprocessing work,
and from that sub-library we imported a class 'Imputer', which handled missing values in the dataset.
That class has since been replaced by SimpleImputer, which lives in another sub-library called sklearn.impute, and that is the class we use here.
'''

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
# Mean imputation: every np.nan entry will be replaced by the mean of its column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

In [9]:
# Fit on columns 1 and 2 (Age and Salary) so the imputer learns the values
# that will replace the NaNs. The result is assigned back to `imputer`, which
# the transform step in the following cell uses.
imputer = imputer.fit(X[:, 1:3])

In [10]:
# Replace the NaNs in columns 1 and 2 using the fitted imputer, then write the
# completed columns back into X.
numeric_filled = imputer.transform(X[:, 1:3])
X[:, 1:3] = numeric_filled

In [11]:
# Verify the imputation: the two numeric columns should no longer contain NaN.
X[:, 1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, 63777.77777777778],
       [35.0, 58000.0],
       [38.77777777777778, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

# Categorical Data

In many cases we need to encode categorical data as numerical data, because machine learning algorithms
and their underlying equations operate on numbers.
For that we use the same sklearn library to encode the categorical data.

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# Map each country name in column 0 to an integer code.
# fit_transform() performs the fit and the transform in a single call; it is
# equivalent to calling encode_x.fit(...) followed by encode_x.transform(...).
encode_x = LabelEncoder()
country_codes = encode_x.fit_transform(X[:, 0])
X[:, 0] = country_codes

In [14]:
# Inspect the country column after label encoding (now integer codes).
X[:, 0]

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=object)

As we can see, the country column is now encoded as numbers, but this introduces an ambiguity:
some countries are assigned 2, some 1, and some 0, so the machine learning equations may assume that
one country has a greater value than another, or that one country is better than another, which is not the case.

We must therefore prevent the algorithm from treating one country as greater or less than another, which does not
make any sense.
To prevent this we use dummy variables: each category gets its own column containing only 0 or 1, so the country is
represented by three columns instead of one single column.

We will use another class, called OneHotEncoder, to create these dummy variables.

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [16]:
# One-hot encode column 0 (the country codes). The column list is [0] here but
# could name several columns, e.g. [0, 1, 3]. All remaining columns are passed
# through untouched.
one_hot_step = ('one_hot_encoder', OneHotEncoder(categories='auto'), [0])
ct = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')

# NOTE: X is overwritten with the encoded matrix, so this cell should be run
# only once per kernel session; running it again would encode the new column 0.
X = ct.fit_transform(X)

In [17]:
# Inspect X after one-hot encoding: the dummy columns come first, followed by
# the passthrough columns.
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [18]:
# Encode the target labels as integers: learn the label set first, then map
# every entry of y to its integer code.
encode_y = LabelEncoder().fit(y)
y = encode_y.transform(y)

In [19]:
# Inspect the integer-encoded target.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Splitting the dataset into training and test sets

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# The training target used to be bound to the misspelled name `y_trian`. The
# alias is kept so any cell that still refers to the old name keeps working.
y_trian = y_train

In [26]:
# Inspect the training features.
X_train

array([[0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [27]:
# Inspect the held-out test features.
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0]], dtype=object)