In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('customer.csv')
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


## Ordinal Encoding

In [5]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:,['review', 'education']], df.iloc[:,-1], test_size=0.2)

In [10]:
X_train.head()

Unnamed: 0,review,education
17,Poor,UG
44,Average,UG
39,Poor,PG
6,Good,School
25,Good,School


In [11]:
# specify order
oe = OrdinalEncoder(
    categories=[['Poor','Average','Good'],['School','UG','PG']]
) 
# You have to mention the categories in the order of lower importance to higher otherwise encoder will automatically assign new categories as per they come into picture.

In [12]:
X_train = oe.fit_transform(X_train)
X_test = oe.transform(X_test) # Only transform because of data leakage problem

X_train[:5]

array([[0., 1.],
       [1., 1.],
       [0., 2.],
       [2., 0.],
       [2., 0.]])

In [17]:
# Attributes
print(oe.categories_)
print(oe.feature_names_in_)
print(oe.n_features_in_)

oe.get_feature_names_out()

[array(['Poor', 'Average', 'Good'], dtype=object), array(['School', 'UG', 'PG'], dtype=object)]
['review' 'education']
2


array(['review', 'education'], dtype=object)

In [20]:
# Inverse Encoding
oe.inverse_transform(np.array([[0,2]])) # The passing array must be in 2D

array([['Poor', 'PG']], dtype=object)

In [22]:
# Handling unknown value
oe = OrdinalEncoder(
    categories=[['Poor','Average','Good'],['School','UG','PG']],
    handle_unknown='use_encoded_value', # The value is provided with unknown_value parameter - This will be applied for all the columns that have been encoded through this single encoding object
    unknown_value=-1 # Encoding value
)
'''
The handle_unknown parameter determines how the encoder should handle categories that were not seen during the training phase (i.e., new categories that appear during transformation). There are two possible values:

- 'error' (default): Raises an error when an unknown category is encountered during transformation.

- 'use_encoded_value': When this option is used, you must also specify an unknown_value parameter. Any unknown categories encountered during transformation will be encoded with this specified value. In your code example, unknown categories will be encoded as -1.
'''

X_train, X_test, y_train, y_test = train_test_split(df.loc[:,['review', 'education']], df.iloc[:,-1], test_size=0.2)
X_train = oe.fit_transform(X_train)

# Create a DataFrame with proper column names to avoid UserWarning
test_data = pd.DataFrame([['Poor', 'college']], columns=['review', 'education'])
oe.transform(test_data)

array([[ 0., -1.]])

In [24]:
# handling infrequent/rare categories
X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 +['snake'] * 3 + ['horse'] * 2], dtype=object).T
np.random.permutation(X)[:5, :]

array([['cat'],
       ['cat'],
       ['horse'],
       ['rabbit'],
       ['rabbit']], dtype=object)

In [27]:
pd.Series(X.ravel()).value_counts()

cat       20
rabbit    10
dog        5
snake      3
horse      2
Name: count, dtype: int64

In [32]:
# By using max_categories parameter
enc = OrdinalEncoder(max_categories=4).fit(X)
# The lowest n categories are considered as rare categories.

In [34]:
enc.infrequent_categories_

[array(['horse', 'snake'], dtype=object)]

In [37]:
enc.transform(np.array([['cat','rabbit','snake','dog','horse']]).reshape(5,1))

array([[0.],
       [2.],
       [3.],
       [1.],
       [3.]])

In [38]:
# By using min_frequency parameter
enc = OrdinalEncoder(min_frequency=4).fit(X) # By using frequency threshold
enc.infrequent_categories_

[array(['horse', 'snake'], dtype=object)]

In [39]:
enc.transform(np.array([['cat','rabbit','snake','dog','horse']]).reshape(5,1))

array([[0.],
       [2.],
       [3.],
       [1.],
       [3.]])

In [40]:
# handling missing data
data = [['Cat'], [np.nan], ['Dog'], ['Fish'], [np.nan]]
encoder = OrdinalEncoder(encoded_missing_value=-1)
encoded_data = encoder.fit_transform(data)

print(encoded_data)

[[ 0.]
 [-1.]
 [ 1.]
 [ 2.]
 [-1.]]


---

## Label Encoding

In [43]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:3], df.iloc[:,-1], test_size=0.2)

In [44]:
from sklearn.preprocessing import LabelEncoder

In [45]:
le = LabelEncoder() # There is no hyperparameters
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [46]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [48]:
le.inverse_transform(np.array([1,1,0]))

array(['Yes', 'Yes', 'No'], dtype=object)

---

## One Hot Encoding

In [49]:
cars = pd.read_csv('cars.csv')
cars.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [53]:
X = cars.iloc[:, [0, 2]]
y = cars.iloc[:, -1]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [54]:
X_train['fuel'].nunique()

4

In [55]:
from sklearn.preprocessing import OneHotEncoder

In [58]:
ohe = OneHotEncoder(sparse_output = False, dtype = np.int32) # All the columns get be encoded seperately by single object
# sparse_output = False - Directly returns the encoded matrix
ohe.fit_transform(X_train)

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], shape=(6502, 36), dtype=int32)

In [59]:
ohe.categories_

[array(['Ambassador', 'Ashok', 'Audi', 'BMW', 'Chevrolet', 'Daewoo',
        'Datsun', 'Fiat', 'Force', 'Ford', 'Honda', 'Hyundai', 'Isuzu',
        'Jaguar', 'Jeep', 'Kia', 'Land', 'Lexus', 'MG', 'Mahindra',
        'Maruti', 'Mercedes-Benz', 'Mitsubishi', 'Nissan', 'Opel',
        'Peugeot', 'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen',
        'Volvo'], dtype=object),
 array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object)]

In [60]:
ohe.feature_names_in_

array(['brand', 'fuel'], dtype=object)

In [61]:
ohe.n_features_in_

2

In [62]:
ohe.get_feature_names_out()

array(['brand_Ambassador', 'brand_Ashok', 'brand_Audi', 'brand_BMW',
       'brand_Chevrolet', 'brand_Daewoo', 'brand_Datsun', 'brand_Fiat',
       'brand_Force', 'brand_Ford', 'brand_Honda', 'brand_Hyundai',
       'brand_Isuzu', 'brand_Jaguar', 'brand_Jeep', 'brand_Kia',
       'brand_Land', 'brand_Lexus', 'brand_MG', 'brand_Mahindra',
       'brand_Maruti', 'brand_Mercedes-Benz', 'brand_Mitsubishi',
       'brand_Nissan', 'brand_Opel', 'brand_Peugeot', 'brand_Renault',
       'brand_Skoda', 'brand_Tata', 'brand_Toyota', 'brand_Volkswagen',
       'brand_Volvo', 'fuel_CNG', 'fuel_Diesel', 'fuel_LPG',
       'fuel_Petrol'], dtype=object)

In [67]:
ohe.inverse_transform(np.array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]).reshape(1,36))

array([['Audi', 'LPG']], dtype=object)

## Dummy Variable Trap
When you have a categorical variable with n categories, you only need n-1 binary columns to represent it completely. This is because:
- If you have n categories and create n binary columns
- `The last column can always be perfectly predicted from the other n-1 columns`
- This creates `perfect multicollinearity`, which can cause problems in statistical models

Why First Column?
- The first column is typically dropped by convention
- It could be any column, but dropping the first one is a common practice
- The dropped category becomes the "reference" or "baseline" category
- All other categories are interpreted relative to this baseline

In [68]:
# Dummy variable trap
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.fit_transform(X_train).shape

(6502, 34)

In [None]:
ohe.drop_idx_ # 0th category from 1st column and 0th category from nd column

array([0, 0], dtype=object)

In [69]:
# handling rare categories
X_train['brand'].value_counts()

brand
Maruti           1953
Hyundai          1127
Mahindra          635
Tata              586
Toyota            391
Honda             369
Ford              320
Chevrolet         185
Renault           183
Volkswagen        154
BMW                96
Skoda              82
Nissan             62
Jaguar             59
Volvo              54
Datsun             48
Mercedes-Benz      43
Fiat               35
Audi               30
Jeep               26
Lexus              22
Mitsubishi         13
Force               6
Land                5
Kia                 4
Daewoo              3
MG                  3
Ambassador          3
Isuzu               2
Ashok               1
Peugeot             1
Opel                1
Name: count, dtype: int64

In [70]:
cars['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [71]:
# using min frequency
ohe = OneHotEncoder(sparse_output=False, min_frequency=100)
ohe.fit_transform(X_train).shape

(6502, 14)

In [72]:
ohe.get_feature_names_out()

array(['brand_Chevrolet', 'brand_Ford', 'brand_Honda', 'brand_Hyundai',
       'brand_Mahindra', 'brand_Maruti', 'brand_Renault', 'brand_Tata',
       'brand_Toyota', 'brand_Volkswagen', 'brand_infrequent_sklearn',
       'fuel_Diesel', 'fuel_Petrol', 'fuel_infrequent_sklearn'],
      dtype=object)

In [73]:
# using max_categories
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', max_categories=15)
ohe.fit_transform(X_train).shape

(6502, 19)

In [74]:
ohe.get_feature_names_out()

array(['brand_BMW', 'brand_Chevrolet', 'brand_Ford', 'brand_Honda',
       'brand_Hyundai', 'brand_Jaguar', 'brand_Mahindra', 'brand_Maruti',
       'brand_Nissan', 'brand_Renault', 'brand_Skoda', 'brand_Tata',
       'brand_Toyota', 'brand_Volkswagen', 'brand_infrequent_sklearn',
       'fuel_CNG', 'fuel_Diesel', 'fuel_LPG', 'fuel_Petrol'], dtype=object)

In [78]:
# Handling unknowk category
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # If the category is not present in trained data then all the digits will be encoded by 0
ohe.fit_transform(X_train)

ohe.transform(pd.DataFrame([['local','Petrol']], columns = ['brand', 'fuel']))

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1.]])

In [79]:
ohe.inverse_transform(np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1.]).reshape(1,36))

array([[None, 'Petrol']], dtype=object)

---

### LabelBinarizer

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Sample target variable for a multi-class classification problem
y = ['cat', 'dog', 'fish', 'dog', 'cat']

# Initialize the LabelBinarizer
lb = LabelBinarizer()

# Fit and transform the target variable
y_binarized = lb.fit_transform(y)

print("Binarized labels:\n", y_binarized)

# Inverse transform to recover original labels
y_original = lb.inverse_transform(y_binarized)

print("Original labels:\n", y_original)


Binarized labels:
 [[1 0 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]
 [1 0 0]]
Original labels:
 ['cat' 'dog' 'fish' 'dog' 'cat']


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Example multi-label data
y = [('red', 'blue'), ('blue', 'green'), ('green',), ('red',)]

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform the data to binary matrix format
Y = mlb.fit_transform(y)

print("Binary matrix:\n", Y)
print("Class labels:", mlb.classes_)

# Inverse transform to recover original labels
y_inv = mlb.inverse_transform(Y)
print("Inverse transformed labels:", y_inv)


Binary matrix:
 [[1 0 1]
 [1 1 0]
 [0 1 0]
 [0 0 1]]
Class labels: ['blue' 'green' 'red']
Inverse transformed labels: [('blue', 'red'), ('blue', 'green'), ('green',), ('red',)]


### 3. Count Encoder/Frequency Encoder

In [None]:
!pip install category_encoders



In [None]:
# dataset generation
import pandas as pd
import numpy as np
import category_encoders as ce

# Simulating a dataset
data = {
    'Age': np.random.randint(20, 60, size=100).astype(float),  # Random ages between 20 and 60
    'State': np.random.choice(['Karnataka', 'Tamil Nadu', 'Maharashtra', 'Delhi', 'Telangana'], size=100),
    'Education': np.random.choice(['High School', 'UG', 'PG'], size=100),
    'Package': np.random.rand(100) * 100  # Random package values for demonstration
}

# Introducing missing values in 'Age' column (5%)
np.random.seed(0)  # For reproducibility
missing_indices = np.random.choice(data['Age'].shape[0], replace=False, size=int(data['Age'].shape[0] * 0.05))
data['Age'][missing_indices] = np.nan

df = pd.DataFrame(data)

df.head()

Unnamed: 0,Age,State,Education,Package
0,54.0,Tamil Nadu,High School,40.612049
1,49.0,Delhi,PG,56.921076
2,,Telangana,High School,34.36055
3,54.0,Delhi,PG,78.887278
4,33.0,Delhi,PG,41.137241


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Package']), df['Package'], test_size=0.2, random_state=42)

In [None]:
X_train.head()

Unnamed: 0,Age,State,Education
55,,Tamil Nadu,High School
88,42.0,Maharashtra,PG
26,,Maharashtra,PG
42,39.0,Delhi,PG
69,33.0,Delhi,High School


In [None]:
X_train['State'].value_counts()

Delhi          18
Tamil Nadu     17
Telangana      17
Maharashtra    14
Karnataka      14
Name: State, dtype: int64

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import sklearn

In [None]:
class CountEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.count_map = {}

    def fit(self, X, y=None):
        if self.columns is None:
            self.columns = X.columns
        for col in self.columns:
            self.count_map[col] = X[col].value_counts().to_dict()
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].map(self.count_map[col]).fillna(0)
        return X

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('age_missing', SimpleImputer(strategy='mean'), ['Age']),
        ('cat_state', CountEncoder(), ['State']),
        ('education_ordinal', OrdinalEncoder(), ['Education'])
    ])

sklearn.set_config(transform_output="pandas")

In [None]:
preprocessor.fit_transform(X_train)

Unnamed: 0,age_missing__Age,cat_state__State,education_ordinal__Education
55,38.133333,16,1.0
88,40.000000,18,0.0
26,38.133333,16,1.0
42,22.000000,15,0.0
69,51.000000,18,1.0
...,...,...,...
60,44.000000,18,1.0
71,40.000000,16,0.0
14,59.000000,15,2.0
92,55.000000,19,2.0


In [None]:
# using category encoders
from category_encoders.count import CountEncoder

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('age_missing', SimpleImputer(strategy='mean'), ['Age']),
        ('cat_state', CountEncoder(normalize=True), ['State']),
        ('education_ordinal', OrdinalEncoder(), ['Education'])
    ])
sklearn.set_config(transform_output="pandas")

In [None]:
preprocessor.fit_transform(X_train)

Unnamed: 0,age_missing__Age,cat_state__State,education_ordinal__Education
55,38.666667,0.2125,0.0
88,42.000000,0.1750,1.0
26,38.666667,0.1750,1.0
42,39.000000,0.2250,1.0
69,33.000000,0.2250,0.0
...,...,...,...
60,45.000000,0.2125,0.0
71,44.000000,0.2125,0.0
14,50.000000,0.1750,2.0
92,31.000000,0.1750,2.0


In [None]:
# frequency encoding

In [None]:
# parameters
import pandas as pd
import numpy as np
import category_encoders as ce

# Simulating a dataset
np.random.seed(42)  # For reproducibility
data = {
    'State': np.random.choice(['Karnataka', 'Tamil Nadu', 'Maharashtra', 'Delhi', 'Telangana', np.NaN], size=100),
    'Education': np.random.choice(['High School', 'UG', 'PG', np.NaN], size=100)
}
df = pd.DataFrame(data)

df.head(25)


Unnamed: 0,State,Education
0,Delhi,PG
1,Telangana,High School
2,Maharashtra,High School
3,Telangana,High School
4,Telangana,PG
5,Tamil Nadu,High School
6,Maharashtra,
7,Maharashtra,High School
8,Maharashtra,
9,Telangana,


In [None]:
df.isnull().sum()

State        0
Education    0
dtype: int64

In [None]:
# Initialize the CountEncoder with various parameters
encoder = ce.CountEncoder(
    cols=['State', 'Education'],  # Specify columns to encode. None would automatically select categorical columns.
    handle_missing='error',  # Treat NaNs as a countable category
    handle_unknown='error',  # Treat unknown categories as NaNs (if seen during transform but not in fit)
)

In [None]:
# Fit and transform the dataset
encoder.fit_transform(df)

#print(encoded_df.head(25))

Unnamed: 0,State,Education
0,25,34
1,17,27
2,11,27
3,17,27
4,17,34
...,...,...
95,25,27
96,25,16
97,17,23
98,11,23


In [None]:
encoder.mapping

{'State': Delhi          25
 Tamil Nadu     19
 Telangana      17
 nan            17
 Maharashtra    11
 Karnataka      11
 Name: State, dtype: int64,
 'Education': PG             34
 High School    27
 nan            23
 UG             16
 Name: Education, dtype: int64}

In [None]:
new_data = pd.DataFrame({'State': ['Bihar'], 'Education': ['UG']})

encoder.transform(new_data)

ValueError: Missing data found in column State at transform time.

In [None]:
np.random.seed(0)  # For reproducibility
data = {
    'Category': np.random.choice(['A', 'B', 'C', 'D', 'E', 'F', np.nan], size=100, p=[0.3, 0.25, 0.15, 0.15, 0.05, 0.05, 0.05]),
    'Value': np.random.rand(100)
}

df = pd.DataFrame(data)

df.sample(10)


Unnamed: 0,Category,Value
91,C,0.209844
29,B,0.290078
2,C,0.735194
50,C,0.149448
44,C,0.806194
78,A,0.704414
33,C,0.298282
65,B,0.855803
75,A,0.223925
45,C,0.703889


In [None]:
df['Category'].value_counts()

A      34
B      22
C      21
D      12
nan     5
F       4
E       2
Name: Category, dtype: int64

In [None]:
encoder = ce.CountEncoder(
    cols=['Category'],
    min_group_size=10,  # Groups with counts less than 5 will be combined
    min_group_name='salman',  # Use default naming for combined minimum groups
)

# Fit and transform the dataset
encoded_df = encoder.fit_transform(df['Category'])

# Display the original and encoded data for comparison
df['Encoded'] = encoded_df
print(df.head(20))

   Category     Value  Encoded
0         B  0.677817       22
1         D  0.270008       12
2         C  0.735194       21
3         B  0.962189       22
4         B  0.248753       22
5         C  0.576157       21
6         B  0.592042       22
7         E  0.572252       11
8       nan  0.223082       11
9         B  0.952749       22
10        D  0.447125       12
11        B  0.846409       22
12        C  0.699479       21
13        F  0.297437       11
14        A  0.813798       34
15        A  0.396506       34
16        A  0.881103       34
17        D  0.581273       12
18        D  0.881735       12
19        E  0.692532       11


In [None]:
encoder.mapping

{'Category': A         34
 B         22
 C         21
 D         12
 salman    11
 Name: Category, dtype: int64}

### Binary Encoder

In [None]:
import pandas as pd
import category_encoders as ce

# Sample dataset
data = {
    'Item': ['Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8'],
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Date', 'Elderberry', 'Fig', 'Grape', 'Honeydew']
}
df = pd.DataFrame(data)

df


Unnamed: 0,Item,Fruit
0,Item1,Apple
1,Item2,Banana
2,Item3,Cherry
3,Item4,Date
4,Item5,Elderberry
5,Item6,Fig
6,Item7,Grape
7,Item8,Honeydew


In [None]:
# Initialize the Binary Encoder
encoder = ce.BinaryEncoder(cols=['Fruit'], return_df=True)

# Fit and transform the data
df_encoded = encoder.fit_transform(df)

# Display the original and encoded data
print(df_encoded)

    Item  Fruit_0  Fruit_1  Fruit_2  Fruit_3
0  Item1        0        0        0        1
1  Item2        0        0        1        0
2  Item3        0        0        1        1
3  Item4        0        1        0        0
4  Item5        0        1        0        1
5  Item6        0        1        1        0
6  Item7        0        1        1        1
7  Item8        1        0        0        0


### Target Encoder

In [None]:
# using category_encoder

import pandas as pd
import category_encoders as ce

# Sample data
data = {
    'Feature': ['A', 'B', 'A', 'B', 'C', 'A', 'B', 'C'],
    'Target': [1, 0, 0, 1, 1, 1, 0, 1]
}
df = pd.DataFrame(data)

# Separating the feature and target columns
X = df.drop('Target', axis=1)
y = df['Target']

# Initialize the TargetEncoder
encoder = ce.TargetEncoder(cols=['Feature'])

# Fit the encoder using the feature data and target variable
encoder.fit(X, y)

# Transform the data
encoded = encoder.transform(X)

# Show the original and encoded data
print(pd.concat([df, encoded], axis=1))


   Feature  Target   Feature
0        A       1  0.631436
1        B       0  0.579948
2        A       0  0.631436
3        B       1  0.579948
4        C       1  0.678194
5        A       1  0.631436
6        B       0  0.579948
7        C       1  0.678194


In [None]:
encoder.mapping

{'Feature': Feature
  1    0.631436
  2    0.579948
  3    0.678194
 -1    0.625000
 -2    0.625000
 dtype: float64}

In [None]:
!pip install --upgrade scikit-learn==1.4.0

In [None]:
# using sklearn
import pandas as pd
from sklearn.preprocessing import TargetEncoder

# Sample data
data = {
    'Feature': ['A', 'B', 'A', 'B', 'C', 'A', 'B', 'C'],
    'Target': [1, 0, 0, 1, 1, 1, 0, 1]
}
df = pd.DataFrame(data)

# Separating the feature and target columns
X = df.drop('Target', axis=1)
y = df['Target']

# Initialize the TargetEncoder
encoder = TargetEncoder(smooth=0.0)

# Fit the encoder using the feature data and target variable
encoder.fit(X, y)

# Transform the data
encoded = encoder.transform(X)

encoded


array([[0.66666667],
       [0.33333333],
       [0.66666667],
       [0.33333333],
       [1.        ],
       [0.66666667],
       [0.33333333],
       [1.        ]])

### Weight of Evidence

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m994.7 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
import pandas as pd
import category_encoders as ce

# Example dataset
data = {
    'Feature': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C'],
    'Target': [1, 0, 0, 1, 1, 0, 1, 0, 1, 0]
}
df = pd.DataFrame(data)

# Define the features and target
X = df[['Feature']]
y = df['Target']

# Initialize and fit the TargetEncoder
encoder = ce.WOEEncoder(cols=['Feature'])
X_encoded = encoder.fit_transform(X, y)

# Display the original and encoded data
df['Feature_Encoded'] = X_encoded
print(df)


  Feature  Target  Feature_Encoded
0       A       1         0.000000
1       B       0        -0.405465
2       A       0         0.000000
3       C       1         0.405465
4       B       1        -0.405465
5       A       0         0.000000
6       C       1         0.405465
7       B       0        -0.405465
8       A       1         0.000000
9       C       0         0.405465
