# Data Preprocessing Tools

# Importing the libraries

In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

# Importing the dataset

Notes:
    
1. Features or independant variables are the columns with which you will predict the dependant variable.
   In this case, in the file named "data.csv", country,age,salary are the dependant variables, while the Purchase column
   is the independant variable.

2. In most data sets, features are generally occupied by the columns in first and the dependant variable is generally the last column

In [23]:
import pandas as pd
dataset=pd.read_csv("Data.csv")

**Splitting the entire data set into 2 parts: Input and Output**

In [24]:
x1=dataset.drop(columns='Purchased') #  input
y1=dataset['Purchased'] # o/p

**Another way of splitting data is using iloc function, by location of index**

In [25]:
x=dataset.iloc[:,:-1].values
y=dataset.iloc[:,-1:].values

In [26]:
print(x)
print(y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
[['No']
 ['Yes']
 ['No']
 ['No']
 ['Yes']
 ['Yes']
 ['No']
 ['Yes']
 ['No']
 ['Yes']]


# Taking care of missing data

In [27]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Notes:
1. In Large data set, for example 1000 rows, if single data is missing in a particular row, we can ignore it..because it will      not effect the insights or preparation of model.
  
2. But, if the data is small, we have to replace the data by either most frequently available, avg or median value of that  
   column   
3. In statistics, imputation is the process of replacing missing data with substituted values.
  
   **Refer** : https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
   
  
 
    

In [28]:
import numpy as np
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan, strategy='mean')

**Now, we have to apply this object to our dataframe**

In [29]:
# Step1: connect this object to the data frame
    
imputer.fit(x[:,1:3])  # as an argument it expects all the columns with numerical values, not the columns with strings

# Step2: apply the transform function
    
x[:,1:3]=imputer.transform(x[:,1:3])  # this transfrom function does not update x, it returns a new object

print(x)
# the missing data have been replaced

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoding Categorical Data

# Encoding the Independant Variable 

In [30]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct=ColumnTransformer(transformers=[('encoding',OneHotEncoder(),[0])], remainder='passthrough')
x=ct.fit_transform(x)

Notes:
1. transformers : takes a tuple inside a list contining 3 elements in that tuple
    
2. element1: kind of transformation, which is encoding
    
3. element2: what kind of encoding we want to use
    
4. element3: indexes of columns that we want to encode remainder
    
5. why are we assigning passthrough to the remainder parameter?
   Ans:The reason is that, we want to keep the columns that are not one hot encoded

In [31]:
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

# Encoding the Dependent Variable

In [32]:
from sklearn.preprocessing import LabelEncoder
#This transformer should be used to encode target values, i.e. y, and not the input X.
le=LabelEncoder()
y=le.fit_transform(y)


In [33]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

**Encoding the Country column by LabelEnocdeR**


In [34]:
COUNTRY=x1['Country'] 

In [35]:
from sklearn.preprocessing import LabelEncoder
#This transformer should be used to encode target values, i.e. y, and not the input X.
le=LabelEncoder()
COUNTRY=le.fit_transform(COUNTRY)


In [36]:
COUNTRY

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

# Feature Scaling

**Why do we need scaling?**

1. While training our machine learning model,if we have a feature of a lot higher range than the other feature, it can create a    bias.The feautre that have high values than the other features might dominate.In other words, the other features might have    a signifcant impact in predicting the dependant variable, so it should not be neglected.

2. Another reason of scaling, height and weight are in diff units, to make that unit less
   

 We have 2 techniques for scaling:
**Standardization and Normalisation**


In [37]:
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [38]:
from sklearn.preprocessing import StandardScaler
st=StandardScaler()
x=st.fit_transform(x)

In [39]:
print(x)

[[ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01  7.58874362e-01
   7.49473254e-01]
 [-8.16496581e-01 -6.54653671e-01  1.52752523e+00 -1.71150388e+00
  -1.43817841e+00]
 [-8.16496581e-01  1.52752523e+00 -6.54653671e-01 -1.27555478e+00
  -8.91265492e-01]
 [-8.16496581e-01 -6.54653671e-01  1.52752523e+00 -1.13023841e-01
  -2.53200424e-01]
 [-8.16496581e-01  1.52752523e+00 -6.54653671e-01  1.77608893e-01
   6.63219199e-16]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01 -5.48972942e-01
  -5.26656882e-01]
 [-8.16496581e-01 -6.54653671e-01  1.52752523e+00  0.00000000e+00
  -1.07356980e+00]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01  1.34013983e+00
   1.38753832e+00]
 [-8.16496581e-01  1.52752523e+00 -6.54653671e-01  1.63077256e+00
   1.75214693e+00]
 [ 1.22474487e+00 -6.54653671e-01 -6.54653671e-01 -2.58340208e-01
   2.93712492e-01]]


# Splitting the data into train and test

In [40]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.2)


In [41]:
print(X_train)

[[ 1.22474487 -0.65465367 -0.65465367 -0.54897294 -0.52665688]
 [-0.81649658 -0.65465367  1.52752523  0.         -1.0735698 ]
 [-0.81649658  1.52752523 -0.65465367  1.63077256  1.75214693]
 [-0.81649658 -0.65465367  1.52752523 -1.71150388 -1.43817841]
 [ 1.22474487 -0.65465367 -0.65465367  1.34013983  1.38753832]
 [ 1.22474487 -0.65465367 -0.65465367  0.75887436  0.74947325]
 [-0.81649658 -0.65465367  1.52752523 -0.11302384 -0.25320042]
 [ 1.22474487 -0.65465367 -0.65465367 -0.25834021  0.29371249]]


In [42]:
print(X_test)

[[-8.16496581e-01  1.52752523e+00 -6.54653671e-01  1.77608893e-01
   6.63219199e-16]
 [-8.16496581e-01  1.52752523e+00 -6.54653671e-01 -1.27555478e+00
  -8.91265492e-01]]


In [43]:
print(Y_train)

[1 0 0 1 1 0 0 1]


In [44]:
print(Y_test)

[1 0]
