## Data Preprocessing

### Importing libraries

In [306]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

### Importing dataset

In [307]:
dataset = pd.read_csv('./Data.csv')
print(dataset)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [308]:
dataset.values # dataframe values as np array

array([['France', 44.0, 72000.0, 'No'],
       ['Spain', 27.0, 48000.0, 'Yes'],
       ['Germany', 30.0, 54000.0, 'No'],
       ['Spain', 38.0, 61000.0, 'No'],
       ['Germany', 40.0, nan, 'Yes'],
       ['France', 35.0, 58000.0, 'Yes'],
       ['Spain', nan, 52000.0, 'No'],
       ['France', 48.0, 79000.0, 'Yes'],
       ['Germany', 50.0, 83000.0, 'No'],
       ['France', 37.0, 67000.0, 'Yes']], dtype=object)

### Seperate inputs and outputs in form of matrix

In [309]:
# x is featured vector (inputs) and y is dependent variable vector (outputs)
X = dataset.iloc[:, :-1].values # iloc used for row-col postions
y = dataset.iloc[:, -1:].values # X -> Features and y -> Labels
print(type(X))
print(X)
print(y)

<class 'numpy.ndarray'>
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
[['No']
 ['Yes']
 ['No']
 ['No']
 ['Yes']
 ['Yes']
 ['No']
 ['Yes']
 ['No']
 ['Yes']]


### Hanling missing values

#### Approach-1 Handling missing values from dataset

In [310]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


#### Finding missing values 

In [311]:
dataset.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

#### Fill the missing values

In [312]:
avarage_age = dataset.Age.median()
avarage_age = round(avarage_age)
avarage_age

38

In [313]:
# dataset.Age.fillna(avarage_age, inplace=True)

In [314]:
dataset.head(2)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes


#### Approach-2 Scikit-Learn

In [315]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan , strategy='median')
imputer.fit(X[:,1:3]) # only numeric cols
X[:,1:3] = imputer.transform(X[:,1:3])


In [316]:
type(X)

numpy.ndarray

In [317]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 61000.0],
       ['France', 35.0, 58000.0],
       ['Spain', 38.0, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Encoding categorical data

#### Encode independent values

In [318]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)

In [319]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 61000.0]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.0 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


#### Encoding dependent values

In [320]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [321]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


### Splitting dataset into training and testing sets

In [322]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [323]:
print(X_train ,'\n')
print(X_test ,'\n')
print(y_train ,'\n')
print(y_test ,'\n')

[[0.0 0.0 1.0 38.0 52000.0]
 [0.0 1.0 0.0 40.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]] 

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]] 

[0 1 0 0 1 1 0 1] 

[0 1] 



### Features scaling (Need to apply for some of the ML Model) <br>
Same scaling of features

There are 2 main features scaling techniques:
1. Standardisation -> (x - mean(x)) / standard deviation(x) <br>
Range in most cases -> [-3, 3]

2. Normalisation   -> (x - min(x)) / (max(x) - min(x)) <br>
Range in most cases -> [0, 1]

In [324]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:]) # not transform the dummy variables as they are already in the range of [-3,3]
X_test[:, 3:] = sc.transform(X_test[:, 3:]) # not transform the dummy variables as they are already in the range of [-3,3]

In [325]:
print(X_train)

[[0.0 0.0 1.0 -0.28942984211696865 -1.0430254692900243]
 [0.0 1.0 0.0 0.0 -0.2767210428728636]
 [1.0 0.0 0.0 0.5788596842339373 0.659873256081444]
 [0.0 0.0 1.0 -0.28942984211696865 -0.2767210428728636]
 [0.0 0.0 1.0 -1.8812939737602963 -1.383605214364318]
 [1.0 0.0 0.0 1.1577193684678746 1.2558878099614579]
 [0.0 1.0 0.0 1.4471492105848434 1.5964675550357514]
 [1.0 0.0 0.0 -0.7235746052924217 -0.5321558516785838]]


In [326]:
print(X_test)

[[0.0 1.0 0.0 -1.4471492105848434 -0.8727355967528775]
 [1.0 0.0 0.0 -0.434144763175453 0.2341485747385769]]
