# Preprocessing steps
1. Imputating missing values (```sklearn.impute```)

In [1]:
import pandas as pd
import numpy as np

a= pd.DataFrame({
        'A' : [1,2, 3, None, None, 23, 45,5,6, 76],
        'B' : [54,None,6,None,1,2,4,5,4,6],
        'C': ['A','B',np.nan, np.nan,'B','A','B','A','B',np.nan],
        'D': ['Purchase','Not Purchase','Purchase','Not Purchase',
              'Purchase','Not Purchase','Purchase','Not Purchase',
              'Purchase','Not Purchase']
})
a

Unnamed: 0,A,B,C,D
0,1.0,54.0,A,Purchase
1,2.0,,B,Not Purchase
2,3.0,6.0,,Purchase
3,,,,Not Purchase
4,,1.0,B,Purchase
5,23.0,2.0,A,Not Purchase
6,45.0,4.0,B,Purchase
7,5.0,5.0,A,Not Purchase
8,6.0,4.0,B,Purchase
9,76.0,6.0,,Not Purchase


Rule: never impute the missing values in target variable, drop the rows with missing variable values

In [2]:
from sklearn.impute import SimpleImputer

#Object creation
si = SimpleImputer()
#fit the imputer
a[['A','B']] = si.fit_transform(a[['A','B']])
si2 = SimpleImputer(strategy ='most_frequent')
a[['C']] = si2.fit_transform(a[['C']])
a

Unnamed: 0,A,B,C,D
0,1.0,54.0,A,Purchase
1,2.0,10.25,B,Not Purchase
2,3.0,6.0,B,Purchase
3,20.125,10.25,B,Not Purchase
4,20.125,1.0,B,Purchase
5,23.0,2.0,A,Not Purchase
6,45.0,4.0,B,Purchase
7,5.0,5.0,A,Not Purchase
8,6.0,4.0,B,Purchase
9,76.0,6.0,B,Not Purchase


2. Encoding categorical variables(`sklearn.preprocessing`)
    - Ordinal encoding(`Ordinal Encoder`)
    - Label encoding(`LabelEncoder`) for target variable only
    - One-hot encoding(`OneHotEncoder`)

In [3]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()
a['C'] = oe.fit_transform(a[['C']])
a

Unnamed: 0,A,B,C,D
0,1.0,54.0,0.0,Purchase
1,2.0,10.25,1.0,Not Purchase
2,3.0,6.0,1.0,Purchase
3,20.125,10.25,1.0,Not Purchase
4,20.125,1.0,1.0,Purchase
5,23.0,2.0,0.0,Not Purchase
6,45.0,4.0,1.0,Purchase
7,5.0,5.0,0.0,Not Purchase
8,6.0,4.0,1.0,Purchase
9,76.0,6.0,1.0,Not Purchase


In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
a['D']=le.fit_transform(a[['D']])
a

  y = column_or_1d(y, warn=True)


Unnamed: 0,A,B,C,D
0,1.0,54.0,0.0,1
1,2.0,10.25,1.0,0
2,3.0,6.0,1.0,1
3,20.125,10.25,1.0,0
4,20.125,1.0,1.0,1
5,23.0,2.0,0.0,0
6,45.0,4.0,1.0,1
7,5.0,5.0,0.0,0
8,6.0,4.0,1.0,1
9,76.0,6.0,1.0,0


In [5]:
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({
    'City': ['Delhi','Mumbai','Hyderabad','Mumbai','Delhi','Lucknow','Banglore'],
    'pop': [12,13,14,15,16,17,18]
})
he = OneHotEncoder(drop='first',sparse_output=False)
enc_city =he.fit_transform(df[['City']])
df = pd.concat([df, pd.DataFrame(enc_city)],axis=1)
df.drop('City',axis=1,inplace=True)
df

Unnamed: 0,pop,0,1,2,3
0,12,1.0,0.0,0.0,0.0
1,13,0.0,0.0,0.0,1.0
2,14,0.0,1.0,0.0,0.0
3,15,0.0,0.0,0.0,1.0
4,16,1.0,0.0,0.0,0.0
5,17,0.0,0.0,1.0,0.0
6,18,0.0,0.0,0.0,0.0


In [6]:
he.inverse_transform([[1,0,0,0],[0,1,0,0]])

array([['Delhi'],
       ['Hyderabad']], dtype=object)

In [7]:
from sklearn.preprocessing import StandardScaler
df = pd.DataFrame({
    'salary': [1230000,340000,450000,398000,240000],
    'age':[23,45,63,56,47]
})
sc = StandardScaler()
df[['salary','age']]=sc.fit_transform(df)
df

Unnamed: 0,salary,age
0,1.961285,-1.757619
1,-0.538062,-0.132929
2,-0.229154,1.196362
3,-0.375183,0.679416
4,-0.818887,0.01477
