# <font color="magenta"><h3 align="center">Topic - Encoding Part 3</h3></font>

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

<h1 style = "color:green"> Ordinal Encoding</h1>

In [2]:
# Ordinal-encode a single categorical column that includes one missing entry.
# With the default settings the missing entry stays NaN in the encoded output.
X = [['male'], ['female'], [np.nan], ['female']]
enc = preprocessing.OrdinalEncoder()
enc.fit(X).transform(X)

array([[ 1.],
       [ 0.],
       [nan],
       [ 0.]])

In [3]:
# Inspect the container type of X: the encoder above was fed a plain nested
# Python list, not an array or DataFrame.
type(X)

list

<h3 style = "color:indigo">Encoding missing value</h3>

In [4]:
# OrdinalEncoder's `encoded_missing_value` parameter assigns a code to missing
# entries directly, so there is no need to build a Pipeline with SimpleImputer.
X = [['male'], ['female'], [np.nan], ['female']]

# Missing entries are mapped to -1 instead of being left as NaN.
enc = preprocessing.OrdinalEncoder(encoded_missing_value=-1)
enc.fit(X).transform(X)

array([[ 1.],
       [ 0.],
       [-1.],
       [ 0.]])

<h3 style = "color:indigo">Encoding missing value using Pipeline</h3>

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Two-stage alternative to `encoded_missing_value`:
#   1. the ordinal encoder runs first and leaves the missing entry as NaN;
#   2. the imputer then replaces that NaN with the constant -1.
ordinal_step = ("encoder", preprocessing.OrdinalEncoder())
impute_step = ("imputer", SimpleImputer(strategy="constant", fill_value=-1))

enc = Pipeline(steps=[ordinal_step, impute_step])
# X is the nested list defined in the previous cell.
enc.fit_transform(X)

array([[ 1.],
       [ 0.],
       [-1.],
       [ 0.]])

<h1 style = "color:green">One Hot Encoder</h1>

<h4 style = "color:indigo">exercise 1</h4>

In [6]:
# Two samples with three categorical features each (gender, location, browser).
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]

# Every feature has two observed categories, so the dense result has
# 2 + 2 + 2 = 6 columns.
enc = preprocessing.OneHotEncoder()
enc.fit(X).transform(X).toarray()

array([[0., 1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1., 0.]])

<h4 style = "color:indigo">exercise 2</h4>

In [7]:
# Declare the full set of categories for each feature up front.
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']

# Note: the sample data below does not contain every declared category for
# the 2nd and 3rd features; the encoder still reserves a column for each one.
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]

enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])
enc.fit(X).transform(X).toarray()

array([[0., 1., 0., 0., 0., 1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 1., 0., 0.]])

In [8]:
# Wrap the dense one-hot matrix in a DataFrame. No column labels are given,
# so pandas falls back to its default integer labels.
encoded_dense = enc.fit_transform(X).toarray()
df = pd.DataFrame(encoded_dense)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


<h4 style = "color:indigo">exercise 3</h4>

In [9]:
# Same explicit category lists as in the previous exercise.
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])

# Note: the training rows do not contain every declared category for the
# 2nd and 3rd features.
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
enc.fit(X)

# Because the categories were declared explicitly, a row holding values that
# never appeared in X ('from Asia', 'uses Chrome') can still be encoded.
unseen_row = [['female', 'from Asia', 'uses Chrome']]
enc.transform(unseen_row).toarray()

array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])

<h3 style = "color:indigo">parameter (handle_unknown='infrequent_if_exist')</h3>

In [11]:
# Note: If there is a possibility that the training data might have missing
# categorical features, it can often be better to specify
# handle_unknown='infrequent_if_exist' instead of setting the categories
# manually as above.
#
# When handle_unknown='infrequent_if_exist' is specified and unknown
# categories are encountered during transform, no error will be raised, but
# the resulting one-hot encoded columns for this feature will be all zeros or
# considered as an infrequent category if enabled.
# (handle_unknown='infrequent_if_exist' is only supported for one-hot
# encoding.)
print()




In [12]:
# Fit on two rows, then encode a row whose location and browser were never
# seen during fit. With handle_unknown='infrequent_if_exist' no error is
# raised; the columns of the unknown features come out as all zeros.
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
enc = preprocessing.OneHotEncoder(handle_unknown='infrequent_if_exist')
enc.fit(X)

unseen_row = [['female', 'from Asia', 'uses Chrome']]
enc.transform(unseen_row).toarray()

array([[1., 0., 0., 0., 0., 0.]])

<h3 style = "color:indigo">parameter: (drop='first')</h3>

In [13]:
# Note: It is also possible to encode each column into n_categories - 1
# columns instead of n_categories columns by using the drop parameter. This
# parameter allows the user to specify a category for each feature to be
# dropped. This is useful to avoid co-linearity in the input matrix in some
# classifiers. Such functionality is useful, for example, when using
# non-regularized regression (LinearRegression), since co-linearity would
# cause the covariance matrix to be non-invertible.
print()




In [14]:
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]

# drop='first' removes the first category of every feature, so each of the
# three binary features is reduced to a single column.
drop_enc = preprocessing.OneHotEncoder(drop='first')
drop_enc.fit(X)
drop_enc.transform(X).toarray()

array([[1., 1., 1.],
       [0., 0., 0.]])

In [15]:
X = [['male', 'from US', 'uses Safari'],
     ['female', 'from Europe', 'uses Firefox']]

# Baseline for comparison with drop='first': nothing is dropped here, so each
# feature keeps one column per category (2 + 2 + 2 = 6 columns).
# The encoder is named `full_enc` because it drops no category; reusing the
# name `drop_enc` for it would be misleading.
full_enc = preprocessing.OneHotEncoder().fit(X)
full_enc.transform(X).toarray()

array([[0., 1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1., 0.]])

<h3 style = "color:indigo">parameter (drop='if_binary')</h3>

We can drop one of the two columns only for features with 2 categories. In this case, we can set the parameter drop='if_binary'.

In [17]:
X = [
    ['male', 'US', 'Safari'],
    ['female', 'Europe', 'Firefox'],
    ['female', 'Asia', 'Chrome'],
]

# drop='if_binary' only touches features with exactly two categories.
# Here that is the gender feature: the column for 'female' is dropped and a
# single 'male' indicator remains. The two three-category features keep all
# of their columns (1 + 3 + 3 = 7 columns in total).
drop_enc = preprocessing.OneHotEncoder(drop='if_binary')
drop_enc.fit(X)
drop_enc.transform(X).toarray()

array([[1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0.]])

In [18]:
# Show the categories learned for each feature by the drop='if_binary'
# encoder fitted in the previous cell (one array per feature).
drop_enc.categories_

[array(['female', 'male'], dtype=object),
 array(['Asia', 'Europe', 'US'], dtype=object),
 array(['Chrome', 'Firefox', 'Safari'], dtype=object)]

<h3 style = "color:indigo">parameter (handle_unknown='ignore')</h3>

In [25]:
from sklearn.preprocessing import OneHotEncoder

# Small training frame; only the 'Country' column is encoded below.
df = pd.DataFrame({
    'Country': ['USA', 'USA', 'IND', 'UK', 'UK', 'UK'],
    'Fruits': ['Apple', 'Strawberry', 'Mango', 'Berries', 'Banana', 'Grape'],
    'Flower': ['Rose', 'Lily', 'Orchid', 'Petunia', 'Lotus', 'Dandelion'],
    'Result': [1, 2, 3, 4, 5, 6]
})

# handle_unknown='ignore' keeps transform() from raising on categories that
# were not seen during fit; sparse_output=False returns a dense array
# directly, so no .toarray() call is needed.
hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
country_train = df[['Country']]
hot.fit_transform(country_train)

array([[0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [27]:
# Test frame identical to the training frame except that the second
# 'Country' value is 'CAN', a category the fitted encoder has never seen.
test_df = pd.DataFrame({
    'Country': ['USA', 'CAN', 'IND', 'UK', 'UK', 'UK'],
    'Fruits': ['Apple', 'Strawberry', 'Mango', 'Berries', 'Banana', 'Grape'],
    'Flower': ['Rose', 'Lily', 'Orchid', 'Petunia', 'Lotus', 'Dandelion'],
    'Result': [1, 2, 3, 4, 5, 6]
})

# Only transform (no re-fit): the encoder fitted on the training frame is reused.
country_test = test_df[['Country']]
hot.transform(country_test)

array([[0., 0., 1.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

Note: The second row has all zeros, indicating that ‘CAN’ is an unknown category.

<h3 style = "color:indigo">Handling missing value</h3>

In [28]:
# Two features, each with one missing entry: None in the browser column and
# np.nan in the gender column.
X = [
    ['male', 'Safari'],
    ['female', None],
    [np.nan, 'Firefox'],
]

# The missing entries are encoded as an additional category of their feature,
# giving 3 + 3 = 6 output columns.
enc = preprocessing.OneHotEncoder(handle_unknown='error')
enc.fit(X)
enc.transform(X).toarray()

array([[0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0.]])

In [22]:
# Show the categories learned per feature; the missing-value markers
# (nan / None) are listed as categories alongside the real values.
enc.categories_

[array(['female', 'male', nan], dtype=object),
 array(['Firefox', 'Safari', None], dtype=object)]

Note: OneHotEncoder supports categorical features with missing values by considering the missing values as an additional category.

In [29]:
# A single feature that contains both np.nan and None.
X = [['Safari'], [None], [np.nan], ['Firefox']]
enc = preprocessing.OneHotEncoder(handle_unknown='error').fit(X)

# A bare `enc.categories_` expression used to sit here, but only the last
# expression of a cell is displayed, so it had no visible effect and was
# removed. The four output columns below already show that None and np.nan
# are encoded as two separate categories next to 'Firefox' and 'Safari'.
enc.transform(X).toarray()

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])

Note: If a feature contains both np.nan and None, they will be considered separate categories.