In [1]:
#Data.csv

**Step 1: Importing the libraries**

In [45]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

**Step 2: Importing dataset**

In [46]:
# Read the raw dataset from disk and preview its first three rows.
csv_path = "/content/Data.csv"
data = pd.read_csv(csv_path)
data.head(3)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No


In [47]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(10, 4)

**Step 3: Handling the missing data**

In [48]:
# Count the missing (NaN) entries in each column.
data.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

The `Age` and `Salary` features each have one missing value.

In [49]:
# Summary statistics for the numeric columns; missing entries are left out of each statistic.
data.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


The average `Age` is **38.78** and the average `Salary` is **63,777.78**.

In [50]:
# Fill the missing Age entry with the column mean.
# NOTE(review): the mean is computed over every row, including the rows that
# are later held out as the test split.
age_mean = data["Age"].mean()
data["Age"] = data["Age"].fillna(age_mean)

In [51]:
# Fill the missing Salary entry with the column mean.
# NOTE(review): the mean is computed over every row, including the rows that
# are later held out as the test split.
salary_mean = data["Salary"].mean()
data["Salary"] = data["Salary"].fillna(salary_mean)

In [52]:
# Re-count missing entries per column; after imputation each count should be zero.
data.isna().sum()

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64

All missing values have been imputed with the corresponding column means.

**Step 4: Encoding categorical data**

In [53]:
# Distinct categories present in the Country column.
data["Country"].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [54]:
# Distinct labels present in the Purchased column.
data["Purchased"].unique()

array(['No', 'Yes'], dtype=object)

**Step 5: Creating a dummy variable**

In [55]:
# Class balance: number of rows carrying each Purchased label.
data["Purchased"].value_counts()

No     5
Yes    5
Name: Purchased, dtype: int64

In [56]:
# Encode the binary Purchased label as integers: No -> 0, Yes -> 1.
# Gotcha: running this cell a second time maps the already-encoded 0/1 values
# to NaN, because they are no longer keys of the mapping.
purchased_codes = {'No': 0, 'Yes': 1}
data['Purchased'] = data['Purchased'].map(purchased_codes)

In [57]:
# Preview the first five rows with Purchased encoded as 0/1.
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63777.777778,1


In [58]:
# One-hot encode the remaining categorical (object-typed) columns; here that
# is Country, which expands into one indicator column per category.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 defaults to
# bool, which would display True/False instead of the 0/1 values shown below.
# Columns are still selected implicitly, so re-running the cell stays a no-op.
data = pd.get_dummies(data, dtype=int)

In [59]:
# Preview the first five rows with Country expanded into indicator columns.
data.head()

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0


**Step 6: Splitting the dataset into training and test sets**

In [69]:
# 70:30 train:test split by row position: the first 7 rows are used for
# training and the remaining rows are held out. No shuffling is applied.
n_train = 7
train = data.iloc[:n_train]
test = data.iloc[n_train:]

# Separate the feature columns from the Purchased target in each partition.
x_train, y_train = train.drop(columns='Purchased'), train["Purchased"]
x_test, y_test = test.drop(columns='Purchased'), test["Purchased"]

**Step 7: Feature Scaling**

In [70]:
# Standardize the training features to zero mean and unit variance.
# The scaler learns its statistics from x_train only; every column of x_train
# is scaled, including the one-hot Country indicators.
# x_test is not transformed in this cell.
sc = StandardScaler()
x_train_scales = sc.fit_transform(x_train)

In [71]:
# Inspect the standardized training features (one row per training sample).
x_train_scales

array([[ 1.44157926,  1.82413934,  1.58113883, -0.63245553, -0.8660254 ],
       [-1.66492253, -1.39417885, -0.63245553, -0.63245553,  1.15470054],
       [-1.11671633, -0.5895993 , -0.63245553,  1.58113883, -0.8660254 ],
       [ 0.34516687,  0.34907684, -0.63245553, -0.63245553,  1.15470054],
       [ 0.71063766,  0.72156737, -0.63245553,  1.58113883, -0.8660254 ],
       [-0.20303933, -0.05321293,  1.58113883, -0.63245553, -0.8660254 ],
       [ 0.4872944 , -0.85779248, -0.63245553, -0.63245553,  1.15470054]])

In [72]:
# Alternative scaling: min-max normalization, which rescales each training
# feature linearly so that its minimum maps to 0 and its maximum to 1.
# MinMaxScaler is imported in the notebook's import cell at the top.
scaler = MinMaxScaler()
# Fit the scaler on the training features and transform them in one step.
scaled = scaler.fit_transform(x_train)
print(scaled)

[[1.         1.         1.         0.         0.        ]
 [0.         0.         0.         0.         1.        ]
 [0.17647059 0.25       0.         1.         0.        ]
 [0.64705882 0.54166667 0.         0.         1.        ]
 [0.76470588 0.65740741 0.         1.         0.        ]
 [0.47058824 0.41666667 1.         0.         0.        ]
 [0.69281046 0.16666667 0.         0.         1.        ]]
