# Importing the packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
import math

# Handling missing values

In [2]:
#Creating a dataframe
X=pd.DataFrame(
np.array([np.NaN, np.NaN, np.NaN,5,6,7,-5,0,25,999,1,-1,np.NaN,0,np.NaN])\
    .reshape((5,3)))
X.columns = ['f1', 'f2', 'f3']

In [3]:
X

Unnamed: 0,f1,f2,f3
0,,,
1,5.0,6.0,7.0
2,-5.0,0.0,25.0
3,999.0,1.0,-1.0
4,,0.0,


In [4]:
#dropping null valued row
X.dropna(axis=0,thresh=1,inplace=True)
X.reset_index(inplace=True)
X.drop(['index'], axis=1,inplace=True)

In [5]:
X

Unnamed: 0,f1,f2,f3
0,5.0,6.0,7.0
1,-5.0,0.0,25.0
2,999.0,1.0,-1.0
3,,0.0,


In [6]:
# replacing NaN value
X.replace({999.0 : np.NaN}, inplace=True)
indicator = MissingIndicator(missing_values=np.NaN)
indicator= indicator.fit_transform(X)
indicator= pd.DataFrame(indicator, columns=['m1','m3'])

In [7]:
indicator

Unnamed: 0,m1,m3
0,False,False
1,False,False
2,True,False
3,True,True


In [8]:
X

Unnamed: 0,f1,f2,f3
0,5.0,6.0,7.0
1,-5.0,0.0,25.0
2,,1.0,-1.0
3,,0.0,


In [9]:
#filling with mean
imp= SimpleImputer(missing_values=np.NaN, strategy='mean')
imp.fit_transform(X)

array([[ 5.        ,  6.        ,  7.        ],
       [-5.        ,  0.        , 25.        ],
       [ 0.        ,  1.        , -1.        ],
       [ 0.        ,  0.        , 10.33333333]])

In [10]:
X.fillna(X.mean(), inplace=True)

In [11]:
X

Unnamed: 0,f1,f2,f3
0,5.0,6.0,7.0
1,-5.0,0.0,25.0
2,0.0,1.0,-1.0
3,0.0,0.0,10.333333


# polynomial features

In [12]:
poly= PolynomialFeatures(degree=3, interaction_only=True)
polynomials = pd.DataFrame(poly\
                          .fit_transform(X),
                          columns = ['0','1','2','3',
                                    'p1','p2','p3','p4'])\
[['p1','p2','p3','p4']]

In [13]:
polynomials

Unnamed: 0,p1,p2,p3,p4
0,30.0,35.0,42.0,210.0
1,-0.0,-125.0,0.0,-0.0
2,0.0,-0.0,-1.0,-0.0
3,0.0,0.0,0.0,0.0


In [14]:
X=pd.concat([X, indicator, polynomials], axis=1)

In [15]:
X

Unnamed: 0,f1,f2,f3,m1,m3,p1,p2,p3,p4
0,5.0,6.0,7.0,False,False,30.0,35.0,42.0,210.0
1,-5.0,0.0,25.0,False,False,-0.0,-125.0,0.0,-0.0
2,0.0,1.0,-1.0,True,False,0.0,-0.0,-1.0,-0.0
3,0.0,0.0,10.333333,True,True,0.0,0.0,0.0,0.0


# categorical features

In [16]:
X = pd.DataFrame(
    np.array(['M', 'O-', 'medium',
             'M', 'O-', 'high',
              'F', 'O+', 'high',
              'F', 'AB', 'low',
              'F', 'B+', np.NaN])
              .reshape((5,3)))
X.columns = ['gender', 'blood_type', 'edu_level']

In [17]:

X['edu_level'].unique()

array(['medium', 'high', 'low', 'nan'], dtype=object)

In [18]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
X.edu_level = encoder.fit_transform(X.edu_level)

In [19]:
X


Unnamed: 0,gender,blood_type,edu_level
0,M,O-,2
1,M,O-,0
2,F,O+,0
3,F,AB,1
4,F,B+,3


In [29]:
#cat= pd.Categorical(X.edu_level,
                   categories=['missing','low','medium','high'],ordered=True)

In [59]:
#cat

[NaN, NaN, NaN, NaN, NaN]
Categories (4, object): [missing < low < medium < high]

In [22]:
#cat.fillna('missing')

[missing, missing, missing, missing, missing]
Categories (4, object): [missing < low < medium < high]

In [60]:
#labels, unique = pd.factorize(cat,sort=True)
X.edu_level = labels

In [20]:
X.edu_level

0    2
1    0
2    0
3    1
4    3
Name: edu_level, dtype: int32

In [21]:
onehot = OneHotEncoder(dtype=np.int, sparse=True)
nominals = pd.DataFrame(
    onehot.fit_transform(X[['gender', 'blood_type']])\
    .toarray(),
    columns=['F', 'M', 'AB', 'B+','O+', 'O-'])
nominals['edu_level'] = X.edu_level

In [22]:
nominals

Unnamed: 0,F,M,AB,B+,O+,O-,edu_level
0,0,1,0,0,0,1,2
1,0,1,0,0,0,1,0
2,1,0,0,0,1,0,0
3,1,0,1,0,0,0,1
4,1,0,0,1,0,0,3


# Numerical features 

# Discretization

In [23]:
disc = KBinsDiscretizer(n_bins=3, encode='ordinal', 
                        strategy='uniform')
disc.fit_transform(nominals)

array([[0., 2., 0., 0., 0., 2., 2.],
       [0., 2., 0., 0., 0., 2., 0.],
       [2., 0., 0., 0., 2., 0., 0.],
       [2., 0., 2., 0., 0., 0., 1.],
       [2., 0., 0., 2., 0., 0., 2.]])

In [24]:
X=pd.DataFrame(
np.array([np.NaN, np.NaN, np.NaN,5,6,7,-5,0,25,999,1,-1,np.NaN,0,np.NaN])\
    .reshape((5,3)))
X.columns = ['f1', 'f2', 'f3']

In [25]:
X

Unnamed: 0,f1,f2,f3
0,,,
1,5.0,6.0,7.0
2,-5.0,0.0,25.0
3,999.0,1.0,-1.0
4,,0.0,


In [26]:
X.dropna(axis=0,thresh=1,inplace=True)
X.reset_index(inplace=True)
X.drop(['index'], axis=1,inplace=True)

In [27]:
X.replace({ np.NaN : 2 }, inplace=True)

# Binarization

In [28]:
binarizer = Binarizer(threshold=0, copy=True)
binarizer.fit_transform(X.f3.values.reshape(-1, 1))

array([[1.],
       [1.],
       [0.],
       [1.]])

# Custom Transformations

In [29]:
transformer = FunctionTransformer(np.log1p, validate=True)
transformer.fit_transform(X.f2.values.reshape(-1, 1))
X.f2.apply(lambda x : np.log1p(x)) 

0    1.945910
1    0.000000
2    0.693147
3    0.000000
Name: f2, dtype: float64

# Feature Scaling

# standard scaler

In [30]:
scaler = StandardScaler()
scaler.fit_transform(X.f3.values.reshape(-1,1))

array([[-0.12395848],
       [ 1.66104358],
       [-0.91729272],
       [-0.61979238]])

In [31]:
X

Unnamed: 0,f1,f2,f3
0,5.0,6.0,7.0
1,-5.0,0.0,25.0
2,999.0,1.0,-1.0
3,2.0,0.0,2.0


# Minmax Scaler

In [32]:
scaler = MinMaxScaler(feature_range=(-3,3))
scaler.fit_transform(X.f3.values.reshape(-1,1))

array([[-1.15384615],
       [ 3.        ],
       [-3.        ],
       [-2.30769231]])

# Maxabs Scaler

In [33]:
scaler  = MaxAbsScaler ()
scaler.fit_transform(X.f3.values.reshape(-1,1))

array([[ 0.28],
       [ 1.  ],
       [-0.04],
       [ 0.08]])

# Robust scaler

In [34]:
robust = RobustScaler (quantile_range = (0.1,0.9))
robust.fit_transform(X.f3.values.reshape(-1,1))

array([[ 34.72222222],
       [284.72222222],
       [-76.38888889],
       [-34.72222222]])

# Normalization

# Max

In [139]:
norm_max = list(max(list(abs(i) for i in X.iloc[r])) for r in range(len(X)))

In [140]:
norm_max

[7.0, 25.0, 999.0, 2.0]

# L1

In [132]:
norm_l1 = list(sum(list(abs(i) for i in X.iloc[r])) for r in range(len(X)))

In [133]:
norm_l1

[18.0, 30.0, 1001.0, 4.0]

# L2

In [136]:
norm_l2 = list(math.sqrt(sum(list((i**2) for i in X.iloc[r]))) 
 for r in range(len(X)))

In [137]:
norm_l2

[10.488088481701515, 25.495097567963924, 999.0010010004995, 2.8284271247461903]