In [1]:
#Importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the raw data set from the CSV file into a DataFrame and preview the first rows.
csv_path = 'Data.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [3]:
# Feature matrix: every column of the data set except the last one, as a NumPy array.
feature_columns = dataset.iloc[:, :-1]
X = feature_columns.values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [4]:
# Target vector: the last column of the data set (Purchased), as a NumPy array.
# The column is selected by relative position (-1) rather than the hard-coded
# index 3, so it is always exactly the column that the feature slice `[:, :-1]`
# leaves out, even if columns are added to or removed from the CSV.
y = dataset.iloc[:, -1].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [5]:
# Missing data: replace NaN in the numeric columns of X (columns 1 and 2, i.e.
# Age and Salary) with the mean of the respective column.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22; `sklearn.impute.SimpleImputer` is its replacement. SimpleImputer always
# imputes column-wise, so the old `axis=0` argument is gone, and the missing-value
# marker is `np.nan` rather than the string 'NaN'.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])            # learn the per-column means
X[:, 1:3] = imputer.transform(X[:, 1:3])    # write the imputed values back into X
X




array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
# Encode the categorical first column of X (Country) as integer labels.
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
labelencoder_X.fit(X[:, 0])                      # learn the set of distinct labels
X[:, 0] = labelencoder_X.transform(X[:, 0])      # replace each label by its integer code
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [7]:
# Dummy (one-hot) encoding: the countries have no natural order, but with plain
# integer labels a model could treat e.g. Spain (2) as "greater than" Germany (1)
# and France (0). To avoid that, column 0 is replaced by one 0/1 indicator column
# per country.
#
# `OneHotEncoder(categorical_features=[0])` was deprecated in scikit-learn 0.20 and
# removed in 0.22; selecting the column through a ColumnTransformer is the
# replacement. It accepts column 0 either as integer codes or as the raw strings.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
column_transformer = ColumnTransformer(
    transformers=[('country', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough',    # keep the other columns, appended after the dummy columns
    sparse_threshold=0,         # always return a dense array (replaces the old .toarray())
)
# The passthrough columns arrive with dtype=object, so cast the result to float
# to obtain the same float array the old encoder produced.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)
# ColumnTransformer fits a clone of the encoder; expose the fitted one under the
# original variable name so later cells can still refer to `onehotencoder`.
onehotencoder = column_transformer.named_transformers_['country']
X

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [8]:
# y is the dependent variable, so integer class labels are sufficient here;
# no one-hot encoding is applied to it.
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)                 # learn the distinct class labels
y = labelencoder_y.transform(y)       # map each label to its integer code
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [9]:
# Split the data set into a training set and a test set (25% of the rows held out).
# random_state is fixed so the same split is produced on every run.
# Note: train_test_split lives in sklearn.model_selection (formerly cross_validation).
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts
X_train, y_train, X_test, y_test

(array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
         6.70000000e+04],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
         4.80000000e+04],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
         5.20000000e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
         7.90000000e+04],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
         6.10000000e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
         7.20000000e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
         5.80000000e+04]]),
 array([1, 1, 0, 1, 0, 0, 1]),
 array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
         5.40000000e+04],
        [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
         8.30000000e+04],
        [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
   

#### Feature Scaling:-
Variables in the dataset are not on the same scale. This causes issues in machine learning models because
many machine learning models are based on Euclidean distance.
#### Methods of Feature Scaling:-
1. Standardization                                  (x-mean(x))/standard deviation(x)
2. Normalization                                    (x-min(x))/(max(x)-min(x))


In [10]:
# Standardise the features. The scaler is fitted on the training set only and the
# same fitted transformation is then applied to both the training and the test set.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [13]:
X_train                             # training features after standardisation with sc_X: all variables are on the same scale now

array([[ 0.8660254 ,  0.        , -0.8660254 , -0.2029809 ,  0.44897083],
       [-1.15470054,  0.        ,  1.15470054, -1.82168936, -1.41706417],
       [-1.15470054,  0.        ,  1.15470054,  0.08478949, -1.0242147 ],
       [ 0.8660254 ,  0.        , -0.8660254 ,  1.5775984 ,  1.62751925],
       [-1.15470054,  0.        ,  1.15470054, -0.04111006, -0.14030338],
       [ 0.8660254 ,  0.        , -0.8660254 ,  0.93011502,  0.94003267],
       [ 0.8660254 ,  0.        , -0.8660254 , -0.52672259, -0.43494049]])

In [14]:
X_test                              # test features transformed by the scaler fitted on X_train, so they are on the same scale too

array([[-1.15470054,  1.        , -0.8660254 , -1.33607682, -0.82778996],
       [-1.15470054,  1.        , -0.8660254 ,  1.90134009,  2.02036872],
       [-1.15470054,  1.        , -0.8660254 ,  0.28263164,  0.13250875]])