# **Data Preprocessing Tools**

In [48]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the retail-company data set: one row per customer with the columns
# Country, Age, Salary and Purchased (did the customer buy the product?).
DATA_URL = 'https://raw.githubusercontent.com/ProgramSKAN/Sample-Data-For-Machine-Learning/master/Data.csv'
dataset = pd.read_csv(DATA_URL)

# Feature matrix: every row, every column except the last one.
X = dataset.iloc[:, :-1].values
# Target vector: every row of the last column only.
y = dataset.iloc[:, -1].values

print(dataset)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [49]:
print(X)  # independent variables (feature matrix): the Country, Age and Salary columns

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [50]:
print(y)  # dependent variable (target): the Purchased column, still as 'Yes'/'No' strings

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Take care of missing values

In [51]:
# np.nan is NumPy's "not a number" value; it prints as `nan`, which is how the
# missing Age and Salary entries show up in X above.
print(np.nan)

nan


In [52]:
from sklearn.impute import SimpleImputer

# Replace each missing (NaN) entry in the numeric columns with the mean of
# the observed values in that column.
# NOTE(review): the imputer is fitted on the whole of X, before the
# train/test split further down, so the column means include rows that
# later land in the test set -- consider fitting on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numeric_cols = slice(1, 3)  # columns 1 and 2 of X: Age and Salary
X[:, numeric_cols] = imputer.fit_transform(X[:, numeric_cols])

In [53]:
# Feature matrix after imputation: the former NaN entries in Age and Salary
# now hold the mean of their column.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Encoding Categorical Data

Encoding the Independent variable

In [54]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the Country column (column 0) and pass the remaining
# columns (Age, Salary) through unchanged; the encoded columns come first
# in the result.
# sparse_threshold=0 forces a dense result. With the default threshold the
# transformer may return a SciPy sparse matrix when the stacked output is
# mostly zeros (e.g. a categorical column with many distinct values), and
# np.array() on a sparse matrix yields a 0-d object array rather than a 2-D
# feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [55]:
# Feature matrix after one-hot encoding: the Country column has become three
# 0/1 columns placed first, followed by Age and Salary.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


Encoding the Dependent variable

In [56]:
from sklearn.preprocessing import LabelEncoder

# Turn the 'No'/'Yes' purchase labels into integer class ids (0/1).
le = LabelEncoder().fit(y)
y = le.transform(y)

In [57]:
# Encoded target: 'No' became 0 and 'Yes' became 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


### Splitting the dataset into Train and Test set

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set and train on the other 80%.
# Rows are shuffled before splitting; random_state=1 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [59]:
# Training features: 8 of the 10 rows, in shuffled order.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [60]:
# Test features: the 2 held-out rows.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [61]:
# Training labels, one per row of X_train.
print(y_train)

[0 1 0 0 1 1 0 1]


In [62]:
# Test labels, one per row of X_test.
print(y_test)

[0 1]


### Feature Scaling
Feature scaling puts all our variables/features on the same scale. This prevents one feature from dominating the others, in which case the dominated features would effectively be neglected by the ML model. This matters because many ML models depend on the Euclidean distance, i.e.

Euclidean distance between points P1 and P2 = sqrt((x2-x1)^2 + (y2-y1)^2)
So, if we plot salary against age, the salary term will dominate the Euclidean distance simply because salary values are numerically much larger. We therefore need both features to be on the same scale.

Feature scaling should be applied after splitting the dataset, because the test set is supposed to be a brand-new set on which we evaluate the ML model. Feature scaling works by computing the mean and standard deviation of each feature, so the scaler must be fitted on the training set only; the test set is then transformed with the training-set mean and standard deviation, but is never used to compute them.

If we apply feature scaling before the split, the mean and standard deviation are computed from all the values, including those in the test set. The test set stands in for future data in production, which we are not supposed to have yet, so applying feature scaling before the split causes information leakage from the test set, which we are not supposed to see until training is done.

We don't use feature scaling for all models, even when the features take very different values. Ex: in the multiple linear regression equation (y = b0 + b1*x1 + ... + bn*xn), each variable xn is multiplied by a coefficient bn; if some variables take much higher values than others, the learned coefficients simply compensate by taking small values for the variables that take high values.


std dev=sqrt(variance)

**Standardisation:** rescales each feature to mean 0 and standard deviation 1, so most values fall roughly between -3 and +3. Standardisation works well in most cases.

Xstand=(x-mean(x))/std dev(x)   


**Normalisation:** makes all the values of a feature lie between 0 and 1. Normalisation is recommended when you have a normal distribution in most of your features.

Xnorm=(x-min(x))/(max(x)-min(x))

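* The scaler should not be fitted on the test set; the test set is only transformed with the scaler that was fitted on the training set.
* Do we have to apply feature scaling (standardisation) to the dummy variables (the binary country columns) in the matrix of features? No, because we would lose the information of the country column: the dummy values would no longer be 0/1. Scaling them sometimes increases the performance of the model, but it is still not recommended.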

In [63]:
from sklearn.preprocessing import StandardScaler

# Standardise only the numeric columns (index 3 onwards: Age and Salary);
# the one-hot Country columns 0-2 are left as they are.
sc = StandardScaler()
numeric_part = slice(3, None)

# Learn each column's mean and standard deviation from the training set
# only, then scale the training set with them.
sc.fit(X_train[:, numeric_part])
X_train[:, numeric_part] = sc.transform(X_train[:, numeric_part])

# Scale the test set with the statistics learned from the training set;
# the scaler is never re-fitted on test data.
X_test[:, numeric_part] = sc.transform(X_test[:, numeric_part])

In [66]:
print(X_train)  # Age and Salary are now standardised; the one-hot Country columns are unchanged

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [67]:
# Test features after scaling Age and Salary with the scaler fitted on the
# training set.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
