# Importing Libraries

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

# Importing Datasets

In [8]:
# Load the raw data set.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a configurable data directory.
dataset = pd.read_csv('/Users/user/Desktop/data/Data.csv')
# Feature matrix: every column except the last. An explicit copy is taken so
# that the later in-place assignment into x (imputed values written back with
# x.iloc[...] = ...) works on an independent frame instead of a slice of
# `dataset`, which avoids pandas' SettingWithCopyWarning.
x = dataset.iloc[:, :-1].copy()
# Target vector: the last column.
y = dataset.iloc[:, -1]

# Printing x (Independent Variables) and y (Dependent Variable)

In [10]:
# Show the feature matrix (every column of the data set except the last).
print(x)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [11]:
# Show the target column (the last column of the data set).
print(y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


# Taking Care Of Missing Data

In [13]:
# Build a SimpleImputer configured to replace every NaN entry using the
# 'mean' strategy.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='mean',
)

In [14]:
imputer.fit(x.iloc[ :, 1:3])
# fit() connects the imputer to the selected columns of the feature matrix x.
# Only the columns at positions 1 and 2 (Age and Salary) are passed in, because column 0 (Country) holds text.
# .iloc is used because it selects DataFrame rows/columns by integer position.

In [20]:
imputer.transform(x.iloc[ :, 1:3]) # apply the fitted imputer to the Age/Salary columns and display the result; the result is not assigned back to x in this cell

array([[4.40000000e+01, 7.20000000e+04],
       [2.70000000e+01, 4.80000000e+04],
       [3.00000000e+01, 5.40000000e+04],
       [3.80000000e+01, 6.10000000e+04],
       [4.00000000e+01, 6.37777778e+04],
       [3.50000000e+01, 5.80000000e+04],
       [3.87777778e+01, 5.20000000e+04],
       [4.80000000e+01, 7.90000000e+04],
       [5.00000000e+01, 8.30000000e+04],
       [3.70000000e+01, 6.70000000e+04]])

In [23]:
x.iloc[ :, 1:3]=imputer.transform(x.iloc[ :, 1:3]) # write the imputed Age/Salary values back into the same columns of x

In [25]:
# Round the numeric columns to one decimal place, then show the imputed
# feature matrix.
x = x.round(decimals=1)
print(x)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0  63777.8
5   France  35.0  58000.0
6    Spain  38.8  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


# Encoding Categorical Data

In [28]:
# Encoding the independent (feature) columns
# Importing the classes required for the encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [30]:
# Column transformer `ct`: one-hot encode column 0 (Country) and pass the
# remaining columns through unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
# Fit the transformer on x and replace x with the transformed result,
# converted to a NumPy array.
x = np.array(ct.fit_transform(x))

In [32]:
# Round every entry to one decimal place, then show the encoded feature matrix.
x = x.round(decimals=1)
print(x)

[[1.00000e+00 0.00000e+00 0.00000e+00 4.40000e+01 7.20000e+04]
 [0.00000e+00 0.00000e+00 1.00000e+00 2.70000e+01 4.80000e+04]
 [0.00000e+00 1.00000e+00 0.00000e+00 3.00000e+01 5.40000e+04]
 [0.00000e+00 0.00000e+00 1.00000e+00 3.80000e+01 6.10000e+04]
 [0.00000e+00 1.00000e+00 0.00000e+00 4.00000e+01 6.37778e+04]
 [1.00000e+00 0.00000e+00 0.00000e+00 3.50000e+01 5.80000e+04]
 [0.00000e+00 0.00000e+00 1.00000e+00 3.88000e+01 5.20000e+04]
 [1.00000e+00 0.00000e+00 0.00000e+00 4.80000e+01 7.90000e+04]
 [0.00000e+00 1.00000e+00 0.00000e+00 5.00000e+01 8.30000e+04]
 [1.00000e+00 0.00000e+00 0.00000e+00 3.70000e+01 6.70000e+04]]


In [34]:
from sklearn.preprocessing import LabelEncoder
# Create a LabelEncoder instance named le.
le = LabelEncoder()
# Fit the encoder on y and replace y with its encoded labels
# (printed in the next cell as an array of 0s and 1s).
y = le.fit_transform(y)

In [36]:
# Show the encoded target labels.
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting Data Into Training And Test Set

In [39]:
# train_test_split divides the features and labels into training and test subsets.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set (80% remain for training);
# random_state is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [41]:
# Show the training features.
print(x_train)

[[0.00000e+00 0.00000e+00 1.00000e+00 3.88000e+01 5.20000e+04]
 [0.00000e+00 1.00000e+00 0.00000e+00 4.00000e+01 6.37778e+04]
 [1.00000e+00 0.00000e+00 0.00000e+00 4.40000e+01 7.20000e+04]
 [0.00000e+00 0.00000e+00 1.00000e+00 3.80000e+01 6.10000e+04]
 [0.00000e+00 0.00000e+00 1.00000e+00 2.70000e+01 4.80000e+04]
 [1.00000e+00 0.00000e+00 0.00000e+00 4.80000e+01 7.90000e+04]
 [0.00000e+00 1.00000e+00 0.00000e+00 5.00000e+01 8.30000e+04]
 [1.00000e+00 0.00000e+00 0.00000e+00 3.50000e+01 5.80000e+04]]


In [43]:
# Show the training labels.
print(y_train)

[0 1 0 0 1 1 0 1]


In [45]:
# Show the test features.
print(x_test)

[[0.0e+00 1.0e+00 0.0e+00 3.0e+01 5.4e+04]
 [1.0e+00 0.0e+00 0.0e+00 3.7e+01 6.7e+04]]


In [47]:
# Show the test labels.
print(y_test)

[0 1]


# Feature Scaling Of Training And Test Set

In [50]:
# StandardScaler provides the standardisation used below.
from sklearn.preprocessing import StandardScaler
# Scaler instance named sc.
sc = StandardScaler()
# Learn the scaling parameters from the training features only, then apply
# them to the training features.
# NOTE(review): every column is scaled, including the three one-hot Country
# indicator columns (positions 0-2); confirm whether only the numeric columns
# (Age, Salary) were meant to be standardised.
x_train = sc.fit(x_train).transform(x_train)
# Scale the test features with the parameters learned from the training set.
x_test = sc.transform(x_test)

In [52]:
# Show the standardised training features.
print(x_train)

[[-0.77459667 -0.57735027  1.29099445 -0.18878287 -1.0781262 ]
 [-0.77459667  1.73205081 -0.77459667 -0.01452176 -0.07013001]
 [ 1.29099445 -0.57735027 -0.77459667  0.5663486   0.63356221]
 [-0.77459667 -0.57735027  1.29099445 -0.30495694 -0.30786642]
 [-0.77459667 -0.57735027  1.29099445 -1.90235043 -1.42046388]
 [ 1.29099445 -0.57735027 -0.77459667  1.14721896  1.23265315]
 [-0.77459667  1.73205081 -0.77459667  1.43765414  1.57499083]
 [ 1.29099445 -0.57735027 -0.77459667 -0.74060971 -0.56461968]]


In [54]:
# Show the test features after scaling with the training-set parameters.
print(x_test)

[[-0.77459667  1.73205081 -0.77459667 -1.46669766 -0.90695736]
 [ 1.29099445 -0.57735027 -0.77459667 -0.45017453  0.20564011]]
