In [1]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import pickle
import numpy as np

# Toy training frame: three categorical features, three rows.
data = {
    'Size': ['Small', 'Medium', 'Large'],
    'Color': ['Red', 'Blue', 'Green'],
    'Shape': ['Circle', 'Square', 'Triangle'],
}
df = pd.DataFrame(data)

# Build the encoder and learn the categories of every column in one step
# (fit() hands back the fitted encoder, so the calls can be chained).
ordinal_encoder = OrdinalEncoder().fit(df)

# Build one {category: code} lookup per fitted column. `categories_` holds one
# array per column, in the same order as df's columns, and a category's
# position inside its array is its encoded value. Pairing columns with their
# category arrays via zip avoids hard-coding column names and positional
# indices, so the mapping stays correct if columns are added or reordered.
# NOTE: unknown categories are NOT part of this mapping; a fallback value has
# to be supplied at lookup time.
category_mapping = {
    column: {category: code for code, category in enumerate(categories)}
    for column, categories in zip(df.columns, ordinal_encoder.categories_)
}

# Function to handle unknown categories
def encode_with_unknown_handling(row, column, mapping, unknown_value=-1):
    """Return the encoded value for one cell of a row, tolerating unseen categories.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``column`` (for example the row object that
        ``DataFrame.apply(..., axis=1)`` passes in, or a plain dict).
    column : hashable
        Key of the cell inside ``row`` whose category should be encoded.
    mapping : dict
        Lookup from category to its encoded value.
    unknown_value : object, optional
        Value returned when the category is not a key of ``mapping``.
        Defaults to ``-1``.

    Returns
    -------
    object
        ``mapping[row[column]]`` if the category is known, otherwise
        ``unknown_value``.
    """
    return mapping.get(row[column], unknown_value)

# New data to encode; every column contains one category that is absent from
# the data the mappings were built from.
new_data = pd.DataFrame({
    'Size': ['Medium', 'Small', 'ExtraLarge'],  # 'ExtraLarge' is unknown
    'Color': ['Blue', 'Red', 'Purple'],        # 'Purple' is unknown
    'Shape': ['Square', 'Hexagon', 'Circle']   # 'Hexagon' is unknown
})

# Encode column-wise rather than row-by-row: Series.map looks every value up in
# the column's mapping and yields NaN for categories that are not keys, which
# fillna then replaces with the -1 "unknown" sentinel. The NaN placeholder
# makes the intermediate result float, so cast back to an integer dtype.
# Iterating over category_mapping keeps the column list in a single place.
for column, mapping in category_mapping.items():
    new_data[column + '_encoded'] = (
        new_data[column].map(mapping).fillna(-1).astype('int64')
    )

# Display the new transformed data
print(new_data)

         Size   Color    Shape  Size_encoded  Color_encoded  Shape_encoded
0      Medium    Blue   Square             1              0              1
1       Small     Red  Hexagon             2              2             -1
2  ExtraLarge  Purple   Circle            -1             -1              0


In [3]:
# Inspect the categories learned for the first fitted column ('Size'). A
# category's index in this array is the code assigned to it in category_mapping.
ordinal_encoder.categories_[0]

array(['Large', 'Medium', 'Small'], dtype=object)

In [4]:
# The integer codes (0 .. number of categories - 1) that get zipped with the
# 'Size' categories when its mapping is built.
range(len(ordinal_encoder.categories_[0]))

range(0, 3)

In [1]:
import pandas as pd
values = ['b', 'b', 'd', 'a', 'c', 'a']

# factorize hands back a pair: the integer code of every element, and the
# distinct values those codes index into (here in order of first appearance).
codes, uniques = pd.factorize(values)
val = (codes, uniques)
print(val)

(array([0, 0, 1, 2, 3, 2]), array(['b', 'd', 'a', 'c'], dtype=object))


In [2]:
# Same factorization, but with sort=True the distinct values come back sorted
# and the codes are numbered against that sorted order.
sorted_codes, sorted_uniques = pd.factorize(values, sort=True)
val = (sorted_codes, sorted_uniques)
print(val)

(array([1, 1, 3, 0, 2, 0]), array(['a', 'b', 'c', 'd'], dtype=object))
