In [121]:
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [122]:
# import the dataset
dataset = pd.read_csv('Data.csv')

In [123]:
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [124]:
X = dataset.iloc[:,:-1]

In [125]:
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [126]:
y = dataset.iloc[:, 3]

In [127]:
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [128]:
pd.DataFrame(y)

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


In [129]:
X[['Age', 'Salary']] # Or X.iloc[:,1:3]

Unnamed: 0,Age,Salary
0,44.0,72000.0
1,27.0,48000.0
2,30.0,54000.0
3,38.0,61000.0
4,40.0,
5,35.0,58000.0
6,,52000.0
7,48.0,79000.0
8,50.0,83000.0
9,37.0,67000.0


In [130]:
# Taking care of missing values
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean = imp_mean.fit(X[['Age', 'Salary']])


In [131]:
 X[['Age', 'Salary']] = imp_mean.transform(X[['Age', 'Salary']])

In [132]:
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [133]:
# Encoding categorical data
# Use only LabelEncoder for dependent vector (y) and for any ranked independent matrix (X)
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# labelencoder_X = LabelEncoder()
# X['Country'] = labelencoder_X.fit_transform(X['Country'])
# onehotencoder = OneHotEncoder(categorical_features = [0])
# X = pd.DataFrame(onehotencoder.fit_transform(X).toarray())

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [0])],    # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                         # Leave the rest of the columns untouched
)

X = pd.DataFrame(np.array(ct.fit_transform(X), dtype=np.int))

le_y = LabelEncoder()
y = pd.DataFrame(le_y.fit_transform(y), columns =['Purchased'])

In [134]:
X

Unnamed: 0,0,1,2,3,4
0,1,0,0,44,72000
1,0,0,1,27,48000
2,0,1,0,30,54000
3,0,0,1,38,61000
4,0,1,0,40,63777
5,1,0,0,35,58000
6,0,0,1,38,52000
7,1,0,0,48,79000
8,0,1,0,50,83000
9,1,0,0,37,67000


In [135]:
X.columns = ['France', 'Spain', 'Germany', 'Age', 'Salary']

In [136]:
X

Unnamed: 0,France,Spain,Germany,Age,Salary
0,1,0,0,44,72000
1,0,0,1,27,48000
2,0,1,0,30,54000
3,0,0,1,38,61000
4,0,1,0,40,63777
5,1,0,0,35,58000
6,0,0,1,38,52000
7,1,0,0,48,79000
8,0,1,0,50,83000
9,1,0,0,37,67000


In [137]:
y

Unnamed: 0,Purchased
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


In [138]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split # Used to be sklearn.cross_validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [139]:
X_train

Unnamed: 0,France,Spain,Germany,Age,Salary
6,0,0,1,38,52000
0,1,0,0,44,72000
2,0,1,0,30,54000
7,1,0,0,48,79000
5,1,0,0,35,58000
9,1,0,0,37,67000
3,0,0,1,38,61000
1,0,0,1,27,48000


In [140]:
X_test

Unnamed: 0,France,Spain,Germany,Age,Salary
4,0,1,0,40,63777
8,0,1,0,50,83000


In [141]:
y_train

Unnamed: 0,Purchased
6,0
0,0
2,0
7,1
5,1
9,1
3,0
1,1


In [142]:
y_test

Unnamed: 0,Purchased
4,1
8,0


In [143]:
# Feature Scaling 
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test) # Only transform 

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  """


In [144]:
X_train

array([[-1.        , -0.37796447,  1.29099445,  0.1373077 , -0.94468628],
       [ 1.        , -0.37796447, -0.77459667,  1.07884623,  1.07064446],
       [-1.        ,  2.64575131, -0.77459667, -1.11807701, -0.74315321],
       [ 1.        , -0.37796447, -0.77459667,  1.70653859,  1.77601021],
       [ 1.        , -0.37796447, -0.77459667, -0.33346156, -0.34008706],
       [ 1.        , -0.37796447, -0.77459667, -0.01961539,  0.56681177],
       [-1.        , -0.37796447,  1.29099445,  0.1373077 , -0.03778745],
       [-1.        , -0.37796447,  1.29099445, -1.58884627, -1.34775243]])

In [145]:
X_test

array([[-1.        ,  2.64575131, -0.77459667,  0.45115388,  0.24204122],
       [-1.        ,  2.64575131, -0.77459667,  2.02038476,  2.17907636]])