In [4]:
# One Hot Encoding using OneHotEncoder of sklearn.preprocessing

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Toy employee records used as the running example for the encoding demo.
# Each key becomes one column; the lists hold that column's values row by row.
data = {
    'Employee id': [10, 20, 15, 25, 30],
    'Gender': ['M', 'F', 'F', 'M', 'F'],
    'Remarks': ['Good', 'Nice', 'Good', 'Great', 'Nice'],
}

# Wrap the dictionary in a pandas DataFrame
df = pd.DataFrame(data)

# Show the table under its heading
print(f"Employee Data : \n{df!s}")

Employee Data : 
   Employee id Gender Remarks
0           10      M    Good
1           20      F    Nice
2           15      F    Good
3           25      M   Great
4           30      F    Nice


In [12]:
# Extract categorical columns from the dataframe.
# Columns with the object dtype are treated as the categorical ones here.
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
print(categorical_columns)

# Initialize OneHotEncoder.
# sparse_output=False makes the encoder return a dense array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
print(encoder)
print("----------")

# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.fit_transform(df[categorical_columns])
print(one_hot_encoded)
print("----------")

# Create a dataframe with the one-hot encoded columns.
# get_feature_names_out() supplies the column names for the encoded data; it is
# called once and reused for both the dataframe and the printout.
feature_names = encoder.get_feature_names_out(categorical_columns)
# Reuse df's index: pd.concat(axis=1) below aligns rows by index label, so a
# default RangeIndex here would misalign (and introduce NaNs) whenever df does
# not have the default 0..n-1 index.
one_hot_df = pd.DataFrame(one_hot_encoded, columns=feature_names, index=df.index)
print(feature_names)
print(one_hot_df)
print("----------")

# Concatenate the one-hot encoded columns back to the original dataframe
df_with_one_hot = pd.concat([df, one_hot_df], axis=1)
print(df_with_one_hot)

# Drop the original categorical columns if not needed
# (columns= already identifies the axis, so no separate axis argument is passed)
df_encoded = df_with_one_hot.drop(columns=categorical_columns)

# Display the resulting dataframe
print(f"Encoded Employee Data : \n{df_encoded}")

['Gender', 'Remarks']
OneHotEncoder(sparse_output=False)
----------
[[0. 1. 1. 0. 0.]
 [1. 0. 0. 0. 1.]
 [1. 0. 1. 0. 0.]
 [0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1.]]
----------
['Gender_F' 'Gender_M' 'Remarks_Good' 'Remarks_Great' 'Remarks_Nice']
   Gender_F  Gender_M  Remarks_Good  Remarks_Great  Remarks_Nice
0       0.0       1.0           1.0            0.0           0.0
1       1.0       0.0           0.0            0.0           1.0
2       1.0       0.0           1.0            0.0           0.0
3       0.0       1.0           0.0            1.0           0.0
4       1.0       0.0           0.0            0.0           1.0
----------
   Employee id Gender Remarks  Gender_F  Gender_M  Remarks_Good  \
0           10      M    Good       0.0       1.0           1.0   
1           20      F    Nice       1.0       0.0           0.0   
2           15      F    Good       1.0       0.0           1.0   
3           25      M   Great       0.0       1.0           0.0   
4           30      F   

### Factoring: Use your knowledge of the data to come up with your own encoding
e.g.: A+, A-, B+, B-, O+, O-, AB+, AB-

One-hot encoding requires 8 columns to represent this data, but if we factor the data into [A, B, O, AB] and [+, -], we need only 4 + 2 = 6 columns, which speeds up the calculation.

### Standardizing Numerical Features
Suppose we have several features $F_i$ where, say, one has values in the range [0, 2], a second has the range [3, 3000], and another has the range [5, 89]. Data on such different scales takes a lot of time to deal with.

##### Workaround: 
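##### $X_i \leftarrow \dfrac{X_i - \mu_X}{\sigma_X}$
where $\mu_X$ is the mean and $\sigma_X$ is the standard deviation of the feature $X$.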