# Data Transformations

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style(style='whitegrid')

## Function Transformer - `FunctionTransformer()`
Transforms data based on user-defined function

In [4]:
from sklearn.preprocessing import FunctionTransformer

In [6]:
X = np.array([[128, 2], [2, 256], [4, 1], [512, 64]])
ft = FunctionTransformer(np.log2)
ft.fit_transform(X)

array([[7., 1.],
       [1., 8.],
       [2., 0.],
       [9., 6.]])

In [7]:
X = np.array([[128, 2], [2, 256], [4, 1], [512, 64]])
ft = FunctionTransformer(np.mean)
ft.fit_transform(X)

121.125

## Polynomial Transformation - `PolynomialFeatures()`
Creates all polynomial transformation

In [8]:
from sklearn.preprocessing import PolynomialFeatures

In [21]:
X = np.array([[2, 3],[4, 5],[6,7]])
pf = PolynomialFeatures(degree=2)
res = pf.fit_transform(X)
print(X)
res

[[2 3]
 [4 5]
 [6 7]]


array([[ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.],
       [ 1.,  6.,  7., 36., 42., 49.]])

## Categorical Transformers

### One Hot Encoder - `OneHotEncoder()`

In [24]:
from sklearn.preprocessing import OneHotEncoder

In [39]:
X = np.array([1, 2, 6, 1, 8, 6]).reshape(6, 1) 
print(X, "\nUnique Values : ", np.unique(X)) # Each unique value forms a column in the One Hot Encoder Transformation 
on = OneHotEncoder(sparse=False)
on.fit_transform(X)

[[1]
 [2]
 [6]
 [1]
 [8]
 [6]] 
Unique Values :  [1 2 6 8]


array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.]])

### Label Encoder - `LabelEncoder()`

In [41]:
from sklearn.preprocessing import LabelEncoder

#### Example 1

In [77]:
X = np.array([1, 2, 6, 1, 8, 6])
print("Original Array : ", X)
print("\nUnique Values : ", np.unique(X), "\n") # Encode target labels with value between 0 and n_classes-1.
le1 = LabelEncoder()
res1 = le1.fit_transform(X)
print("Label Encoded : ", res1)
res1_decoded = le1.inverse_transform(res1)
print("\nLabel Decoded : ", res1_decoded)

Original Array :  [1 2 6 1 8 6]

Unique Values :  [1 2 6 8] 

Label Encoded :  [0 1 2 0 3 2]

Label Decoded :  [1 2 6 1 8 6]


#### Example 2

In [76]:
Y = np.array(['apple', 'orange', 'banana', 'mango', 'peach', 'apple', 'banana', 'orange', 'peach', 'peach', 'plum', 'grapes'])
print("Original Array : ", Y)
print("\nUnique Values : ", np.unique(Y), "\n")
le2 = LabelEncoder()
res2 = le2.fit_transform(Y)
print("Label Encoded : ", res2)
res2_decoded = le2.inverse_transform(res2)
print("\nLabel Decoded : ", res2_decoded)

Original Array :  ['apple' 'orange' 'banana' 'mango' 'peach' 'apple' 'banana' 'orange'
 'peach' 'peach' 'plum' 'grapes']

Unique Values :  ['apple' 'banana' 'grapes' 'mango' 'orange' 'peach' 'plum'] 

Label Encoded :  [0 4 1 3 5 0 1 4 5 5 6 2]

Label Decoded :  ['apple' 'orange' 'banana' 'mango' 'peach' 'apple' 'banana' 'orange'
 'peach' 'peach' 'plum' 'grapes']


### Ordinal Encoder - `OrdinalEncoder()`
OrdinalEncoder can operate multi dimensional data, while LabelEncoder can transform only 1D data

In [66]:
from sklearn.preprocessing import OrdinalEncoder

In [75]:
X = np.array([[1, 2, 6, 1, 8, 6], ['male', 'female', 'female', 'male', 'male', 'female']]).T
print("Original Array : \n", X)
print("\nUnique Values : ", np.unique(X), "\n")
oe = OrdinalEncoder()
res = oe.fit_transform(X)
print("Label Encoded : \n", res)
res_decoded = oe.inverse_transform(res)
print("\nLabel Decoded : \n", res_decoded)

Original Array : 
 [['1' 'male']
 ['2' 'female']
 ['6' 'female']
 ['1' 'male']
 ['8' 'male']
 ['6' 'female']]

Unique Values :  ['1' '2' '6' '8' 'female' 'male'] 

Label Encoded : 
 [[0. 1.]
 [1. 0.]
 [2. 0.]
 [0. 1.]
 [3. 1.]
 [2. 0.]]

Label Decoded : 
 [['1' 'male']
 ['2' 'female']
 ['6' 'female']
 ['1' 'male']
 ['8' 'male']
 ['6' 'female']]


### Label Binarizer - `LabelBinarizer()`
The only difference between OneHotEncoder and LabelBinarizer appears to be that the former return a sparse matrix by default, while the latter returns a dense matrix by default.

In [78]:
from sklearn.preprocessing import LabelBinarizer

In [79]:
X = np.array(['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']).reshape(10, 1)
print("Original Array : \n", X)
print("\nUnique Values : ", np.unique(X), "\n")
lb = LabelBinarizer()
res = lb.fit_transform(X)
print("Label Encoded : \n", res)
res_decoded = lb.inverse_transform(res)
print("\nLabel Decoded : \n", res_decoded)

Original Array : 
 [['cold']
 ['cold']
 ['warm']
 ['cold']
 ['hot']
 ['hot']
 ['warm']
 ['cold']
 ['warm']
 ['hot']]

Unique Values :  ['cold' 'hot' 'warm'] 

Label Encoded : 
 [[1 0 0]
 [1 0 0]
 [0 0 1]
 [1 0 0]
 [0 1 0]
 [0 1 0]
 [0 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 0]]

Label Decoded : 
 ['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']


### Multi Label Binarizer - `MultiLabelBinarizer()`

In [81]:
from sklearn.preprocessing import MultiLabelBinarizer

In [92]:
X = np.array([{'action', 'comedy' }, {'comedy'}, {'action', 'thriller'}, {'science-fiction', 'action', 'thriller'}])
print("Original Array : \n", X)
mlb = MultiLabelBinarizer()
res = mlb.fit_transform(X) # The columns are 'action', 'comedy', 'science-fiction' and 'thriller'
print("\nUnique Values : \n", mlb.classes_, "\n")
print("Label Encoded : \n", res)
res_decoded = lb.inverse_transform(res)
print("\nLabel Decoded : \n", res_decoded)

Original Array : 
 [{'action', 'comedy'} {'comedy'} {'thriller', 'action'}
 {'thriller', 'action', 'science-fiction'}]

Unique Values : 
 ['action' 'comedy' 'science-fiction' 'thriller'] 

Label Encoded : 
 [[1 1 0 0]
 [0 1 0 0]
 [1 0 0 1]
 [1 0 1 1]]

Label Decoded : 
 ['cold' 'hot' 'cold' 'cold']


### K Bins Discretizer (Binning + One hot/ordinal encoding) - `KBinsDiscretizer()`

In [1]:
from sklearn.preprocessing import KBinsDiscretizer

In [4]:
X = np.array([0, 0.125, 0.25, 0.375, 0.5, 0.675, 0.75, 0.875, 1.0]).reshape(9,1)
KBD = KBinsDiscretizer(n_bins=5, strategy='uniform', encode='onehot')
X_fit = KBD.fit_transform(X) # Return sparse matrix
X_fit.toarray() # Convert to numpy array

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])

### Other Useful Methods

#### Add Dummy Featuer - `add_dummy_feature()`

In [94]:
from sklearn.preprocessing import add_dummy_feature

Using `numpy` arrays and `sklearn` package

In [97]:
X = np.array([[7, 1], [1, 8], [2, 0], [9, 6]])
print("Original Array : \n", X)
res = add_dummy_feature(X)
print("Modified Array : \n", res)

Original Array : 
 [[7 1]
 [1 8]
 [2 0]
 [9 6]]
Modified Array : 
 [[1. 7. 1.]
 [1. 1. 8.]
 [1. 2. 0.]
 [1. 9. 6.]]


Using `pandas` dataframe

In [98]:
# Create separate columns for each unique value
X = np.array(['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot'])
pd.get_dummies(X, prefix='weather') # Specify columns parameter, when used on a dataframe.

Unnamed: 0,weather_cold,weather_hot,weather_warm
0,1,0,0
1,1,0,0
2,0,0,1
3,1,0,0
4,0,1,0
5,0,1,0
6,0,0,1
7,1,0,0
8,0,0,1
9,0,1,0


### Column Transformer - `ColumnTransformer()`
Applies transformers to columns of an array or pandas DataFrame. This estimator allows different columns or column subsets of the input
to be transformed separately and the features generated by each transformer
will be concatenated to form a single feature space.

**NOTE** : To make the column pass through without getting transformed, use `('some_name', 'passthrough', [column_index])` in the `ColumnTransformer()` object

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder

In [5]:
X = np.array([[20.0, 11.2, 15.6, 13.0, 18.6, 16.4], ['male', 'female', 'female', 'male', 'male', 'female']]).T
print("Original Array : \n", X)

# 'ageScaler' and 'genderEncoder' are arbitary names
cl = ColumnTransformer(
    [
        ('ageScaler', MaxAbsScaler(), [0]),
        ('genderEncoder', OneHotEncoder(dtype='int'), [1])
    ]
)

column_trans = ColumnTransformer([('ageScaler', MaxAbsScaler(), [0]), ('pass', 'passthrough', [0]), ('genderEncoder', OneHotEncoder(dtype='int'), [1])])

res = cl.fit_transform(X) 
print("\nColumn Transformed : \n", res)

Original Array : 
 [['20.0' 'male']
 ['11.2' 'female']
 ['15.6' 'female']
 ['13.0' 'male']
 ['18.6' 'male']
 ['16.4' 'female']]

Column Transformed : 
 [[1.   0.   1.  ]
 [0.56 1.   0.  ]
 [0.78 1.   0.  ]
 [0.65 0.   1.  ]
 [0.93 0.   1.  ]
 [0.82 1.   0.  ]]


### Transformer Target Regressor - `TransformerTargetRegressor()` 

- Meta-estimator to regress on a transformed target ( An estimator which takes another estimator as a parameter. Examples include pipeline. Pipeline , model_selection )

- **Useful for applying a non-linear transformation** to the target y in regression problems. This transformation can be given as a Transformer such as the QuantileTransformer or as a function and its inverse such as np.log and np.exp.

- The steps taken during this are :
    1. `regressor.fit(X, func(y))` fits regressor
    2. `inverse_func(regressor.predict(X))` predicts X from y

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor

In [14]:
X = np.arange(4).reshape(-1, 1)
y = np.exp(2 * X).ravel()
print("Original Array : \n", X)
print("Original Array : \n", y)

tf = np.log # The transformer function
tf_inv = np.exp # Inverse of the transformation function
tt = TransformedTargetRegressor(regressor=LinearRegression(), func=tf, inverse_func=tf_inv)


tt.fit(X, y)
tt.predict(X)  # This is equal to np.exp(2 * X) and basically equivalent to reverse fitting X for given y

Original Array : 
 [[0]
 [1]
 [2]
 [3]]
Original Array : 
 [  1.           7.3890561   54.59815003 403.42879349]


array([  1.        ,   7.3890561 ,  54.59815003, 403.42879349])