Step 1: Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

Step 2: Importing the dataset

In [2]:
# Read the raw dataset from the notebook-relative datasets folder.
csv_path = './datasets/Data.csv'
data = pd.read_csv(csv_path)

Step 3: Handling the Missing Data

In [3]:
# Preview the first 10 rows to see where values are missing (NaN).
data.head(10)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Count the missing values in each column.
data.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [5]:
# Fill the missing Age / Salary values with the mean of their column.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. Prefer its replacement, `SimpleImputer`, and only fall back
# to the legacy class (imported at the top of the notebook) on old installs.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
except ImportError:  # scikit-learn < 0.20
    imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)

# Select the numeric columns by name rather than by position, so this step
# keeps targeting the right columns if the CSV column order ever changes.
numeric_cols = ["Age", "Salary"]
data[numeric_cols] = imputer.fit_transform(data[numeric_cols])



In [6]:
# Show the first rows again to confirm the missing values were filled in.
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes


Step 4: Encoding categorical data

In [7]:
# Encode the Purchased column (No / Yes) as integer labels (0 / 1).
labelencoder_X = LabelEncoder()
# Assign by column name so the column is replaced and takes the encoder's
# integer dtype. Positional `data.iloc[:, 3] = ...` writes into the existing
# object-dtype column in place on recent pandas versions, leaving the labels
# stored as generic Python objects instead of integers.
data['Purchased'] = labelencoder_X.fit_transform(data['Purchased'])


In [8]:
# Display up to 25 rows to inspect the encoded Purchased column.
data.head(25)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63777.777778,1
5,France,35.0,58000.0,1
6,Spain,38.777778,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


In [9]:
# One-hot encode Country: build one indicator column per country, then swap
# the original text column for those indicator columns.
dummies_df = pd.get_dummies(data['Country'])
data = pd.concat(
    [data.drop('Country', axis=1), dummies_df],
    axis=1,
)
data.head(10)

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,38.777778,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


Step 5: Train and test split

In [10]:
# Separate the features (every column except the target) from the target.
target_column = 'Purchased'
X = data.drop(target_column, axis=1)
y = data[target_column]

In [11]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

Step 6: Feature Scaling

In [12]:
# Standardize the features. The scaler learns its mean and standard deviation
# from the training set only; the test set is then transformed with those same
# statistics so no information from the test data leaks into preprocessing.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
# Scale the test features into X_test. The training labels in Y_train must
# stay untouched, and the scaler must not be re-fitted on the test set.
X_test = ss.transform(X_test)