# **Step 1: Import the necessary libraries**

In [None]:
import pandas as pd
import scipy
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

# **Step 2: Data Importing**

In [None]:
# Read the sample dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='SampleData.csv')

# **Step 3: Segregation of dataset into Matrix of Features & Dependent Variable Vector**

In [None]:
# Every column except the last forms the matrix of features; the last column
# is the dependent variable vector.
# .to_numpy() is used instead of .values so both results are guaranteed to be
# plain NumPy arrays, which the in-place slice assignments in later cells
# (imputation, scaling) rely on.
x_features = df.iloc[:, :-1].to_numpy()
y_dependent = df.iloc[:, -1].to_numpy()

In [None]:
# Show the feature matrix as extracted, before imputation and encoding.
print(x_features)

[['USA' 34.0 70000.0]
 ['Spain' 28.0 55000.0]
 ['Germany' 31.0 60000.0]
 ['USA' 39.0 nan]
 ['Germany' 41.0 72000.0]
 ['USA' 36.0 65000.0]
 ['Spain' nan 59000.0]
 ['USA' 49.0 85000.0]
 ['Germany' 51.0 90000.0]
 ['USA' 38.0 71000.0]]


In [None]:
# Show the dependent variable vector as extracted, before label encoding.
print(y_dependent)

['No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes']


# **Step 4 : Handling missing Data**

In [None]:
# Count the missing values in each column of the raw DataFrame.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [None]:
from sklearn.impute import SimpleImputer

# Replace NaN entries in feature columns 1 and 2 with the mean of their column.
# NOTE(review): the imputer is fitted on the full feature matrix before the
# train/test split, so test rows contribute to the imputed means — consider
# fitting it on the training split only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_features[:, 1:3] = imputer.fit_transform(x_features[:, 1:3])

In [None]:
# Show the feature matrix after the missing values have been imputed.
print(x_features)

[['USA' 34.0 70000.0]
 ['Spain' 28.0 55000.0]
 ['Germany' 31.0 60000.0]
 ['USA' 39.0 69666.66666666667]
 ['Germany' 41.0 72000.0]
 ['USA' 36.0 65000.0]
 ['Spain' 38.55555555555556 59000.0]
 ['USA' 49.0 85000.0]
 ['Germany' 51.0 90000.0]
 ['USA' 38.0 71000.0]]


# **Step 5 : Encoding Categorical Data : One Hot Encoding**

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the country) and pass the remaining columns through
# unchanged; the encoded columns are placed first in the result.
# sparse_threshold=0 forces a dense result. Without it the transformer may
# return a SciPy sparse matrix when the stacked output is mostly zeros, and
# np.array() would then wrap that matrix in a 0-d object array instead of
# producing a 2-D feature matrix.
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x_features = np.array(column_transformer.fit_transform(x_features))

In [None]:
# Show the feature matrix after one-hot encoding of column 0.
print(x_features)

[[0.0 0.0 1.0 34.0 70000.0]
 [0.0 1.0 0.0 28.0 55000.0]
 [1.0 0.0 0.0 31.0 60000.0]
 [0.0 0.0 1.0 39.0 69666.66666666667]
 [1.0 0.0 0.0 41.0 72000.0]
 [0.0 0.0 1.0 36.0 65000.0]
 [0.0 1.0 0.0 38.55555555555556 59000.0]
 [0.0 0.0 1.0 49.0 85000.0]
 [1.0 0.0 0.0 51.0 90000.0]
 [0.0 0.0 1.0 38.0 71000.0]]


# **Step 6 : Encoding Categorical Data : Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct target labels, then map each label to its integer code.
label_encoding = LabelEncoder().fit(y_dependent)
y_dependent = label_encoding.transform(y_dependent)

In [None]:
# Show the dependent variable vector after label encoding.
print(y_dependent)

[0 1 0 1 0 1 0 1 0 1]


# **Step 7 : Splitting dataset into Training and Test Dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
x_train_set, x_test_set, y_train_set, y_test_set = train_test_split(
    x_features,
    y_dependent,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Show the training features produced by the split (not yet scaled).
print(x_train_set)

[[0.0 1.0 0.0 38.55555555555556 59000.0]
 [1.0 0.0 0.0 41.0 72000.0]
 [0.0 0.0 1.0 34.0 70000.0]
 [0.0 0.0 1.0 39.0 69666.66666666667]
 [0.0 1.0 0.0 28.0 55000.0]
 [0.0 0.0 1.0 49.0 85000.0]
 [1.0 0.0 0.0 51.0 90000.0]
 [0.0 0.0 1.0 36.0 65000.0]]


In [None]:
# Show the test features produced by the split (not yet scaled).
print(x_test_set)

[[1.0 0.0 0.0 31.0 60000.0]
 [0.0 0.0 1.0 38.0 71000.0]]


In [None]:
# Show the encoded target values for the training rows.
print(y_train_set)

[0 0 0 1 1 1 0 1]


In [None]:
# Show the encoded target values for the test rows.
print(y_test_set)

[0 1]


# **Step 8 : Feature Scaling**

In [18]:
from sklearn.preprocessing import StandardScaler

# Standardize only the columns from index 3 onward; columns 0-2 hold the
# one-hot encoded values and are left untouched.
# The scaler is fitted on the training set alone and then reused for the test
# set, so no statistics from the test rows enter the scaling parameters.
sc = StandardScaler()
sc.fit(x_train_set[:, 3:])
x_train_set[:, 3:] = sc.transform(x_train_set[:, 3:])
x_test_set[:, 3:] = sc.transform(x_test_set[:, 3:])


In [19]:
# Show the training features after standardization.
print(x_train_set)

[[0.0 1.0 0.0 -0.14327751628179441 -1.0492747141653387]
 [1.0 0.0 0.0 0.20215868735650533 0.11575628519261646]
 [0.0 0.0 1.0 -0.7870449866986264 -0.06347925317014586]
 [0.0 0.0 1.0 -0.08047093380210375 -0.09335184289727247]
 [0.0 1.0 0.0 -1.6349338501744537 -1.4077457908908633]
 [0.0 0.0 1.0 1.3326771719909416 1.2807872845505714]
 [1.0 0.0 0.0 1.6153067931495506 1.7288761304574773]
 [0.0 0.0 1.0 -0.5044153655400173 -0.5115680990770517]]


In [20]:
# Show the test features after applying the scaler fitted on the training set.
print(x_test_set)

[[1.0 0.0 0.0 -1.21098941843654 -0.9596569449839575]
 [0.0 0.0 1.0 -0.22178574438140827 0.0261385160112353]]
