### Check if you are in right directory

In [1]:
import os
os.listdir()

['.ipynb_checkpoints',
 'CleanupData.csv',
 'Data Preprocessing.html',
 'Data Preprocessing.ipynb',
 'Revision DataPreprocessing.ipynb']

### Import required libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# To show plotting
%matplotlib inline

### Load the dataset using pandas

In [3]:
dataFrame = pd.read_csv('CleanupData.csv')

In [4]:
dataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   State             10 non-null     object 
 1   Age               9 non-null      float64
 2   Pocket Money      9 non-null      float64
 3   Course Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [5]:
dataFrame

Unnamed: 0,State,Age,Pocket Money,Course Purchased
0,Delhi,34.0,7200.0,No
1,Mumbai,17.0,4800.0,Yes
2,Banglore,20.0,5400.0,No
3,Mumbai,28.0,6100.0,No
4,Banglore,30.0,,Yes
5,Delhi,25.0,5800.0,Yes
6,Mumbai,,5200.0,No
7,Delhi,38.0,7900.0,Yes
8,Banglore,40.0,8300.0,No
9,Delhi,27.0,6700.0,Yes


### Get Independent and Dependent Matrix from dataset

In [6]:
X = dataFrame.iloc[ : , :-1]  # Column 'Course Purchased' is 3 or -1

In [7]:
X

Unnamed: 0,State,Age,Pocket Money
0,Delhi,34.0,7200.0
1,Mumbai,17.0,4800.0
2,Banglore,20.0,5400.0
3,Mumbai,28.0,6100.0
4,Banglore,30.0,
5,Delhi,25.0,5800.0
6,Mumbai,,5200.0
7,Delhi,38.0,7900.0
8,Banglore,40.0,8300.0
9,Delhi,27.0,6700.0


In [8]:
X = dataFrame.iloc[ : , :-1].values # To convert dataframe to Matrix /Array

### Matrix of Independent Variables

In [9]:
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, nan],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', nan, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [10]:
y = dataFrame.iloc[ : , 3].values

### Matrix of Dependent Variable

In [11]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Step 1 : Handling of Missing Data

In [12]:
from sklearn.impute import SimpleImputer

In [13]:
simpleimputer = SimpleImputer(missing_values=float(np.NaN), strategy='mean')

In [14]:
simpleimputer.fit(X[ : , 1:3 ])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [15]:
X[ : , 1:3 ] =  simpleimputer.transform(X[ : , 1:3 ])

In [16]:
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, 6377.777777777777],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', 28.77777777777778, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [17]:
simpleimputer.statistics_

array([  28.77777778, 6377.77777778])

### Step 2 : Handling of Categorical Data as ML algo understands number better than charcters

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
labelencode_X = LabelEncoder()

In [20]:
X[ : , 0]  = labelencode_X.fit_transform(X[ :, 0])

In [21]:
X

array([[1, 34.0, 7200.0],
       [2, 17.0, 4800.0],
       [0, 20.0, 5400.0],
       [2, 28.0, 6100.0],
       [0, 30.0, 6377.777777777777],
       [1, 25.0, 5800.0],
       [2, 28.77777777777778, 5200.0],
       [1, 38.0, 7900.0],
       [0, 40.0, 8300.0],
       [1, 27.0, 6700.0]], dtype=object)

### Step 3 : Dummy Matrix for Label Encoded Categorical Data so that numberical comparison don't create mess

In [22]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
onehotencoder = OneHotEncoder()

In [24]:
X_cat = X[ : , 0]

In [25]:
X_cat = X_cat.reshape([10,1])

In [26]:
X_cat = onehotencoder.fit_transform(X_cat).toarray()

In [27]:
X_cat

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [28]:
X

array([[1, 34.0, 7200.0],
       [2, 17.0, 4800.0],
       [0, 20.0, 5400.0],
       [2, 28.0, 6100.0],
       [0, 30.0, 6377.777777777777],
       [1, 25.0, 5800.0],
       [2, 28.77777777777778, 5200.0],
       [1, 38.0, 7900.0],
       [0, 40.0, 8300.0],
       [1, 27.0, 6700.0]], dtype=object)

In [29]:
Dummy = np.zeros([10, 5])

In [30]:
Dummy[ : , :3] = X_cat 

In [31]:
Dummy

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [32]:
Dummy[ : , 3:] = X[:,1:]

In [33]:
Dummy

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.40000000e+01,
        7.20000000e+03],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.70000000e+01,
        4.80000000e+03],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.00000000e+01,
        5.40000000e+03],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.80000000e+01,
        6.10000000e+03],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        6.37777778e+03],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 2.50000000e+01,
        5.80000000e+03],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.87777778e+01,
        5.20000000e+03],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.80000000e+01,
        7.90000000e+03],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        8.30000000e+03],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 2.70000000e+01,
        6.70000000e+03]])

In [34]:
X = Dummy

In [35]:
X

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.40000000e+01,
        7.20000000e+03],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.70000000e+01,
        4.80000000e+03],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.00000000e+01,
        5.40000000e+03],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.80000000e+01,
        6.10000000e+03],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        6.37777778e+03],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 2.50000000e+01,
        5.80000000e+03],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.87777778e+01,
        5.20000000e+03],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.80000000e+01,
        7.90000000e+03],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        8.30000000e+03],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 2.70000000e+01,
        6.70000000e+03]])

In [36]:
np.set_printoptions(suppress=True)

In [37]:
X

array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ]])

### Handling Matrix y Categorical Data

In [38]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [39]:
# Step 1 : LabelEncoding 
labelencode_y = LabelEncoder()
y = labelencode_y.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [40]:
# No OneHotEncoding '.' it requires if values in array contains more than two distinct values (0, 1) after label encoding
# # Step 2 : OneHotEncoding
# y = y.reshape(10, 1)
# y = onehotencoder.fit_transform(y).toarray()
# y

### Step 4 : Prepare Training and Testing data sets

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [43]:
X_train

array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ]])

In [44]:
y_test

array([0, 1])

### Step 5 (Optional) : Feature Scaling [ Standard Deviation or Normalization]


In [45]:
from sklearn.preprocessing import StandardScaler

In [46]:
scale_X = StandardScaler()

In [47]:
X_train = scale_X.fit_transform(X_train)

In [48]:
X_test = scale_X.transform(X_test)

In [49]:
X_train

array([[-0.57735027,  1.29099445, -0.77459667,  0.5426211 ,  0.54945728],
       [-0.57735027,  1.29099445, -0.77459667, -0.49723074,  0.11183644],
       [-0.57735027,  1.29099445, -0.77459667,  1.13682215,  1.16212645],
       [-0.57735027, -0.77459667,  1.29099445, -0.23314138, -1.20102608],
       [-0.57735027, -0.77459667,  1.29099445, -0.34868048, -0.41330857],
       [ 1.73205081, -0.77459667, -0.77459667, -0.05157995, -0.17018588],
       [ 1.73205081, -0.77459667, -0.77459667,  1.43392268,  1.51222312],
       [-0.57735027, -0.77459667,  1.29099445, -1.98273337, -1.55112275]])

In [50]:
X_test

array([[ 1.73205081, -0.77459667, -0.77459667, -1.53708258, -1.02597775],
       [-0.57735027,  1.29099445, -0.77459667, -0.79433127, -0.67588107]])