In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Importing Dataset
# Load Data.csv into a DataFrame; the path is relative, so the file must sit in
# the notebook's working directory.
dataset = pd.read_csv('Data.csv')

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:
# Feature matrix: every column except the last one (Country, Age, Salary).
# .values converts the selection to a NumPy array; text and numeric columns
# are mixed, so the array holds generic Python objects.
X = dataset.iloc[:, :-1].values

# Target vector: the last column (Purchased). It is indexed as -1 so that it
# always complements the `:-1` slice above instead of relying on a hard-coded
# column position.
y = dataset.iloc[:, -1].values

In [5]:
# Inspect the raw arrays; note the nan entries in the Age and Salary columns.
print("Independent values: ",X)
print("Dependent values: ",y)

Independent values:  [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
Dependent values:  ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [6]:
## Handle missing values in the numeric columns
# Columns 1 and 2 of X (Age, Salary) contain NaNs; each one is replaced with
# the mean of its own column (axis=0 -> column-wise statistics).
# NOTE(review): `Imputer` only exists in older scikit-learn releases; newer
# versions provide sklearn.impute.SimpleImputer instead — confirm the pinned version.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [7]:
# Check the result: the NaN entries are now filled with the column means
# (the rows themselves are kept; nothing is dropped).
print("Now the missing data is removed: ",X)

Now the missing data is removed:  [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [8]:
# Convert the categorical Country column (column 0 of X) from text to integer codes.
labelEncoder_x = LabelEncoder()
# labelEncoder_x is a LabelEncoder instance; fit_transform learns the distinct
# country names and replaces each one with its integer code.
X[:, 0] = labelEncoder_x.fit_transform(X[:,0])

In [9]:
# Per the printed output: France -> 0, Germany -> 1, Spain -> 2.
print("Now the data in the first column is: ",X[:,0])

Now the data in the first column is:  [0 2 1 2 1 0 2 0 1 0]


In [10]:
# The integer codes imply an ordering that does not exist: a model could treat
# Spain (2) as "greater than" Germany (1) and France (0).
# To avoid that, replace the single Country column with one 0/1 indicator
# column per country (one-hot encoding); exactly one of them is 1 in each row.
# NOTE(review): this cell overwrites X, so re-running it without re-running the
# cells above would encode column 0 a second time.
# NOTE(review): `categorical_features` is only accepted by older scikit-learn
# releases; newer versions use sklearn.compose.ColumnTransformer — confirm the pinned version.
oneHotEncoder = OneHotEncoder(categorical_features= [0])
X = oneHotEncoder.fit_transform(X).toarray()

In [11]:
# Column order per the printed output: France, Germany, Spain indicators, then Age and Salary.
print("Now X has 5 columns: ",X)

Now X has 5 columns:  [[  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.40000000e+01
    7.20000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    4.80000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.87777778e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]]


In [12]:
# Encode the dependent variable (the Purchased labels held in y) as integers.
labelEncoder_y = LabelEncoder()
# labelEncoder_y is a separate LabelEncoder instance, fitted on the target labels only
y = labelEncoder_y.fit_transform(y)

In [13]:
# Per the printed output: 'No' -> 0, 'Yes' -> 1.
print("Now the dependent data is transformed into numbers: ",y)

Now the dependent data is transformed into numbers:  [0 1 0 0 1 1 0 1 0 1]


In [14]:
## Now we need to split the dataset into training set and testing set
# test_size=0.2 holds out 20% of the rows for testing; random_state=0 fixes the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling

 We do feature scaling because many machine learning algorithms are based on
 the Euclidean distance between two points, d = sqrt((x2-x1)^2 + (y2-y1)^2)
 In this case we can see that salary is the dominating feature (its values are far larger than age), which will affect our results.
 So we apply feature scaling to bring all features onto a comparable scale.
 It can be done in two ways: -> Standardisation and Normalisation.
 ### Standardisation is x = (x-mean(x))/standard deviation(x)
 and 
 ### Normalisation is x = (x-min(x))/(max(x)-min(x))

In [15]:
# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transformation to both sets, so the test rows do not
# influence the scaling parameters.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
# NOTE(review): the three one-hot country columns are standardised as well
# (see the first three columns of the printed arrays).
print("Now we will check the columns of X_train and X_test",X_train,X_test)

Now we will check the columns of X_train and X_test [[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]] [[-1.          2.64575131 -0.77459667 -1.45882927 -0.90166297]
 [-1.          2.64575131 -0.77459667  1.98496442  2.13981082]]
