# Data Preprocessing

In [1]:
import io

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

## Missing values

In [2]:
# Small in-memory CSV used as a toy dataset. The empty fields (column C in the
# second data row, column D in the third) are parsed by pandas as NaN.
csv_data = """
A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
"""

# Wrap the string in a file-like buffer so read_csv can consume it.
csv_buffer = io.StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [3]:
# Count the missing values in each column (axis=0 reduces over the rows).
df.isna().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

In [72]:
# Count the missing values in each row (axis=1 reduces over the columns).
df.isna().sum(axis='columns')

0    0
1    1
2    1
dtype: int64

## Imputing missing values

In [4]:
# Mean imputation with scikit-learn: every NaN is replaced by the mean of the
# non-missing values in the same column. SimpleImputer works on plain arrays,
# so the result loses the column labels (hence the 0..3 headers below).
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the column means and applies them in one step; `imr`
# is left fitted, exactly as after a separate fit() followed by transform().
imputed_data = imr.fit_transform(df.values)
pd.DataFrame(imputed_data)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [6]:
# The same mean imputation done directly in pandas: compute the per-column
# means, then use them as the fill value for the NaNs in each column.
column_means = df.mean()
df.fillna(column_means)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


## Categorical data encoding with pandas

In [7]:
# Toy dataset with one nominal feature (color), one ordinal feature (size),
# one numeric feature (price) and a class label.
# Note: this rebinds `df`, replacing the missing-value example above.
column_names = ['color', 'size', 'price', 'classlabel']
rows = [
    ['green', 'M', 10.1, 'class2'],
    ['red', 'L', 13.5, 'class1'],
    ['blue', 'XL', 15.3, 'class2'],
]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [8]:
# Ordinal feature: sizes have a natural order, so map them to increasing integers.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}

# Only map while the column still holds the raw labels. Re-running this cell
# after the mapping has been applied would otherwise turn every value into NaN,
# because the integer codes are not keys of size_mapping.
if not pd.api.types.is_numeric_dtype(df['size']):
    df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


## Encoding class labels

In [11]:
# Encode the string class labels as integers. In the output below
# 'class1' becomes 0 and 'class2' becomes 1.
# (LabelEncoder is already imported in the notebook's import cell.)
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y

array([1, 0, 1])

## Performing one-hot encoding on nominal features

In [12]:
feature_names = ['color', 'size', 'price']
X = df[feature_names].values
# With the size column already mapped to integers, X looks like:
# array([['green', 1, 10.1],
#        ['red', 2, 13.5],
#        ['blue', 3, 15.3]], dtype=object)

# Overwrite the color column with integer codes. This assigns an arbitrary
# numeric order to a nominal feature (blue=0, green=1, red=2 in the output),
# which is the problem one-hot encoding avoids.
color_le = LabelEncoder()
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes

pd.DataFrame(X, columns=feature_names)


Unnamed: 0,color,size,price
0,1,1,10.1
1,2,2,13.5
2,0,3,15.3


In [79]:
# One-hot encode the color column: one binary indicator column per color.
# (OneHotEncoder is already imported in the notebook's import cell.)
X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
# Reshape the 1-D color column to (n_samples, 1) before passing it to the encoder.
color_column = X[:, 0].reshape(-1, 1)
# toarray() converts the encoder's result to a dense array for display.
color_ohe.fit_transform(color_column).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [18]:
# ColumnTransformer applies a different transformer to each group of columns:
# column 0 (color) is one-hot encoded, columns 1-2 (size, price) are passed
# through unchanged. (ColumnTransformer and OneHotEncoder are already imported
# in the notebook's import cell.)
X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer([
    ('onehot', OneHotEncoder(), [0]),
    ('nothing', 'passthrough', [1, 2]),
])

# Cast to float so the combined result displays as a plain numeric array.
c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [14]:
# Same pipeline shape as the one-hot version, but column 0 (color) is replaced
# by a single integer-coded column via OrdinalEncoder; size and price pass through.
# X is rebuilt here so the cell does not depend on which earlier cell last
# assigned it. (OrdinalEncoder is already imported in the notebook's import cell.)
X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer([
    # Step name matches the transformer actually used (was mislabeled 'onehot').
    ('ordinal', OrdinalEncoder(), [0]),
    ('nothing', 'passthrough', [1, 2]),
])
pd.DataFrame(c_transf.fit_transform(X), columns=['color', 'size', 'price'])

Unnamed: 0,color,size,price
0,1.0,1,10.1
1,2.0,2,13.5
2,0.0,3,15.3


In [94]:
# One-hot encoding straight from pandas: in the output below the string column
# 'color' is expanded into indicator columns while 'price' and 'size' are kept as-is.
dummy_input = df[['price', 'color', 'size']]
pd.get_dummies(dummy_input)

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,False,True,False
1,13.5,2,False,False,True
2,15.3,3,True,False,False
