**Importing the Libraries**

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


**Import DataSet** 

In [11]:
dataset= pd.read_csv("Data.csv")
x= dataset.iloc[:, :-1].values
y= dataset.iloc[:, -1].values

print (x)
print(y)


[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


**Calculating the Missing Data**

In [15]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(x[:, 1:3])
x[:, 1:3] =imp_mean.transform(x[:, 1:3])


print (x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


**Encoding Independent Variable**

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x), dtype = np.str)

print (x)

#https://towardsdatascience.com/columntransformer-in-scikit-for-labelencoding-and-onehotencoding-in-machine-learning-c6255952731b
#Use this link for better uniderstanding.

[['1.0' '0.0' '0.0' '44.0' '72000.0']
 ['0.0' '0.0' '1.0' '27.0' '48000.0']
 ['0.0' '1.0' '0.0' '30.0' '54000.0']
 ['0.0' '0.0' '1.0' '38.0' '61000.0']
 ['0.0' '1.0' '0.0' '40.0' '63777.77777777778']
 ['1.0' '0.0' '0.0' '35.0' '58000.0']
 ['0.0' '0.0' '1.0' '38.77777777777778' '52000.0']
 ['1.0' '0.0' '0.0' '48.0' '79000.0']
 ['0.0' '1.0' '0.0' '50.0' '83000.0']
 ['1.0' '0.0' '0.0' '37.0' '67000.0']]


**Encoding Dependent Variable**

In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y= le.fit_transform(y)

print (y)

[0 1 0 0 1 1 0 1 0 1]


**Feature Scaling**

In [19]:
#What is Feature Scaling and Why ?
# Feature Scaling is a technique to standardize the independent features present in the data in a fixed range. It is performed during the data pre-processing to handle highly varying magnitudes or values or units. If feature scaling is not done, then a machine learning algorithm tends to weigh greater values, higher and consider smaller values as the lower values, regardless of the unit of the values.

# Example: If an algorithm is not using feature scaling method then it can consider the value 3000 meter to be greater than 5 km but that’s actually not true and in this case, the algorithm will give wrong predictions. So, we use Feature Scaling to bring all values to same magnitudes and thus, tackle this issue.

# There are to ways to do Feature Scaling. 1> Standardisation and anothier is 2> Normalisation. 

# Here we are implementing Standardization as it is a bit more efficient. 

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

print (x)


[[ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01  7.58874362e-01
   7.49473254e-01]
 [-8.16496581e-01 -6.54653671e-01  1.52752523e+00 -1.71150388e+00
  -1.43817841e+00]
 [-8.16496581e-01  1.52752523e+00 -6.54653671e-01 -1.27555478e+00
  -8.91265492e-01]
 [-8.16496581e-01 -6.54653671e-01  1.52752523e+00 -1.13023841e-01
  -2.53200424e-01]
 [-8.16496581e-01  1.52752523e+00 -6.54653671e-01  1.77608893e-01
   2.35783334e-16]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01 -5.48972942e-01
  -5.26656882e-01]
 [-8.16496581e-01 -6.54653671e-01  1.52752523e+00  8.88178420e-17
  -1.07356980e+00]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01  1.34013983e+00
   1.38753832e+00]
 [-8.16496581e-01  1.52752523e+00 -6.54653671e-01  1.63077256e+00
   1.75214693e+00]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01 -2.58340208e-01
   2.93712492e-01]]


**Split Dataset into Test Set and Training Set**

In [21]:
from sklearn.model_selection import train_test_split
x_train, y_train , x_test, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

print(x_train)
print(y_train)
print(x_test)
print(y_test)

[[-8.16496581e-01  1.52752523e+00 -6.54653671e-01  1.77608893e-01
   2.35783334e-16]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01 -2.58340208e-01
   2.93712492e-01]
 [-8.16496581e-01 -6.54653671e-01  1.52752523e+00 -1.71150388e+00
  -1.43817841e+00]
 [-8.16496581e-01 -6.54653671e-01  1.52752523e+00  8.88178420e-17
  -1.07356980e+00]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01  1.34013983e+00
   1.38753832e+00]
 [-8.16496581e-01 -6.54653671e-01  1.52752523e+00 -1.13023841e-01
  -2.53200424e-01]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01  7.58874362e-01
   7.49473254e-01]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01 -5.48972942e-01
  -5.26656882e-01]]
[[-0.81649658  1.52752523 -0.65465367 -1.27555478 -0.89126549]
 [-0.81649658  1.52752523 -0.65465367  1.63077256  1.75214693]]
[1 1 1 0 1 0 0 1]
[0 0]
