# Data Preprocessing Tools

### Importing libraries

In [2]:
# Import and preprocess data, creates matrix of features and vectors of dependent var
import pandas as pd

# ML models require to work with arrays, np is used to convert data into desirable arrays and also other operations
import numpy as np

# Data visualization
import matplotlib.pyplot as plt

### Importing dataset

In [22]:
# All dataset should be of 2 sets - Features or independent variables (X) and dependent variable (Y)
df = pd.read_csv('Data.csv')

# Numpy array of matrices of Features
X = df.iloc[:, :-1].values
X1 = df.iloc[:, :-1].values

# Numpy array of vector of dependent var
y = df.iloc[:, -1].values

In [None]:
# iloc vs loc

In [23]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
print(type(X))
print(type(y))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


### Handling missing data

In [24]:
# Mostly delete or remove row if missing data is less
# Other option is to find average of all rows of the coloumn and put it into the missing value cell
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [25]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Encoding categorical data

#### Encoding independent variable

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer
# transformers is a list for all the coloumns which needs transformation
# it takes name, transformer and columns to transform
ct = ColumnTransformer(remainder='passthrough', transformers=[('encoder', OneHotEncoder(), [0])])
X = ct.fit_transform(X)

In [34]:
X

array([[1.0, 0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [38]:
print(type(X))

<class 'numpy.ndarray'>


In [36]:
from sklearn.preprocessing import LabelEncoder

y = LabelEncoder().fit_transform(y)

In [37]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

### Split data

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

##### Note - Feature sclaing is always done after splitting data, because feature scaling involves calculating mean, median, etc of all the features provided and then scaling them. As we want our test dataset to be totally new for the model, we will prevent information leakage by spliting data first and then scaling all train features only.

### Feature scaling