# Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset

In [69]:
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [70]:
X = df.iloc[:, :-1]
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [71]:
y = df.iloc[:, -1]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

# Taking care of missing data

In [72]:
X['Age'] = df['Age'].fillna(df['Age'].mean())
X['Salary'] = df['Salary'].fillna(df['Salary'].mean())

In [73]:
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


# Encoding categorical data

### Encoding the independent variable

In [23]:
df['Country'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [74]:
dummies = pd.get_dummies(df['Country'])

In [75]:
X = pd.concat([X, dummies], axis='columns')
X

Unnamed: 0,Country,Age,Salary,France,Germany,Spain
0,France,44.0,72000.0,1,0,0
1,Spain,27.0,48000.0,0,0,1
2,Germany,30.0,54000.0,0,1,0
3,Spain,38.0,61000.0,0,0,1
4,Germany,40.0,63777.777778,0,1,0
5,France,35.0,58000.0,1,0,0
6,Spain,38.777778,52000.0,0,0,1
7,France,48.0,79000.0,1,0,0
8,Germany,50.0,83000.0,0,1,0
9,France,37.0,67000.0,1,0,0


In [76]:
# To handle dummy variable trap, just remove one dummy column from the df

In [79]:
#X.drop(['Spain'], axis=1, inplace=True)
#X.drop(['Country'], axis=1, inplace=True)
X

Unnamed: 0,Age,Salary,France,Germany
0,44.0,72000.0,1,0
1,27.0,48000.0,0,0
2,30.0,54000.0,0,1
3,38.0,61000.0,0,0
4,40.0,63777.777778,0,1
5,35.0,58000.0,1,0
6,38.777778,52000.0,0,0
7,48.0,79000.0,1,0
8,50.0,83000.0,0,1
9,37.0,67000.0,1,0


### Encoding the Dependent Variable

In [80]:
dummies = pd.get_dummies(df['Purchased'])

In [82]:
y = pd.concat([y, dummies], axis='columns')
y

Unnamed: 0,Purchased,No,Yes
0,No,1,0
1,Yes,0,1
2,No,1,0
3,No,1,0
4,Yes,0,1
5,Yes,0,1
6,No,1,0
7,Yes,0,1
8,No,1,0
9,Yes,0,1


In [83]:
y.drop(['Purchased'], axis=1, inplace=True)

In [84]:
y.drop(['No'], axis=1, inplace=True)

In [85]:
y

Unnamed: 0,Yes
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


# Splitting the dataset into the Training set and Test set

In [135]:
from sklearn.model_selection import train_test_split

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [92]:
y_train

Unnamed: 0,Yes
6,0
4,1
0,0
3,0
1,1
7,1
8,0
5,1


# Feature Scaling

In [94]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [114]:
X_test

Unnamed: 0,Age,Salary,France,Germany
2,30.0,54000.0,0,1
9,37.0,67000.0,1,0


In [133]:
#pd.options.mode.chained_assignment = None
X_train.iloc[:, :2] = sc.fit_transform(X_train.iloc[:, :2])
X_train


Unnamed: 0,Age,Salary,France,Germany
6,-0.191592,-1.078126,0,0
4,-0.014117,-0.070132,0,1
0,0.566709,0.633562,1,0
3,-0.30453,-0.307866,0,0
1,-1.901801,-1.420464,0,0
7,1.147534,1.232653,1,0
8,1.437947,1.574991,0,1
5,-0.74015,-0.564619,1,0


In [134]:
X_test.iloc[:, :2] = sc.transform(X_test.iloc[:, :2])
X_test

Unnamed: 0,Age,Salary,France,Germany
2,-6.035274,-5.528593,0,1
9,-5.88768,-5.528498,1,0
