## Categorical Embedder
### Example

In [3]:
# Necessary imports
import categorical_embedder as ce

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [4]:
# Read the HR attrition dataset from a CSV file in the working directory
df = pd.read_csv('HR_Attrition_Data.csv')
# Show the (rows, columns) dimensions of the loaded frame as the cell output
df.shape

(54808, 14)

In [5]:
# Preview the first rows of the raw data before any preprocessing
df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [7]:
# Target: the promotion flag we want to predict
y = df['is_promoted']
# Features: everything except the row identifier and the target itself
X = df.drop(columns=['employee_id', 'is_promoted'])

In [8]:
# ce.get_embedding_info identifies the categorical variables and returns a dictionary
# mapping each one to a (number of unique values, embedding size) tuple
embedding_info = ce.get_embedding_info(X)
embedding_info

{'department': (9, 5),
 'region': (34, 17),
 'education': (3, 2),
 'gender': (2, 1),
 'recruitment_channel': (3, 2)}

In [10]:
# Integer-encode the categorical columns so the frame can be fed to the neural network.
# ce.get_label_encoded_data also returns the encoders, which later cells reuse.
X_encoded, encoders = ce.get_label_encoded_data(X)
X_encoded.head(5)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,7,31,2,0,2,1,35,5.0,8,1,0,49
1,4,14,0,1,0,1,30,5.0,4,0,0,60
2,7,10,0,1,2,1,34,3.0,7,0,0,50
3,7,15,0,1,0,2,39,1.0,10,0,0,50
4,8,18,0,1,0,1,45,3.0,2,0,0,73


In [11]:
# Split the encoded data into train and test sets.
# random_state is fixed so the same rows land in the training set on every run;
# without it the split (and therefore the learned embeddings) changes each time
# this cell is executed.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)

# ce.get_embeddings trains a neural network on the training split, extracts the
# learned embeddings and returns them in a dictionary.
# NOTE(review): the network training itself may still be non-deterministic;
# seed the underlying deep-learning backend as well if exact repeatability is needed.
embeddings = ce.get_embeddings(
    X_train,
    y_train,
    categorical_embedding_info=embedding_info,
    is_classification=True,
    epochs=100,
    batch_size=256,
)

HBox(children=(IntProgress(value=0, description='Training', style=ProgressStyle(description_width='initial')),…




In [12]:
# Inspect the raw result: a dict mapping each categorical column name to a
# NumPy array of embedding weights
embeddings

{'department': array([[ 0.44909748,  0.592682  , -0.2689146 , -0.6076638 , -0.47688553],
        [ 0.14439532,  0.23831578, -0.09904855, -0.1884861 , -0.23708323],
        [ 0.02280043,  0.14768346,  0.00430288, -0.05229405, -0.06076226],
        [ 0.08651688,  0.33048603, -0.10082451, -0.24717978, -0.23439746],
        [ 0.06930665,  0.26183563, -0.099448  , -0.22151738, -0.24915719],
        [ 0.3246719 ,  0.13284945, -0.49051526, -0.13767388, -0.35033587],
        [ 0.39557138,  0.6303038 , -0.31711328, -0.6432047 , -0.5024501 ],
        [ 0.105141  ,  0.00382448, -0.16800691,  0.14332129, -0.09635292],
        [ 0.5065225 ,  0.33804703, -0.4578551 , -0.3261275 , -0.34876052]],
       dtype=float32),
 'region': array([[ 0.06167015, -0.09331849, -0.00821102,  0.35873163,  0.27501398,
         -0.18806422,  0.42246535,  0.16405596,  0.10364748, -0.01732335,
          0.08967754, -0.4844684 ,  0.1706062 ,  0.07629129, -0.46060166,
         -0.250795  , -0.20905156],
        [ 0.1522791

In [13]:
# If you don't like the dictionary format, convert the embeddings to DataFrames for easier readability.
# The result is looked up by categorical column name, e.g. dfs['department'].
dfs = ce.get_embeddings_in_dataframe(embeddings=embeddings, encoders=encoders)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [15]:
# Embedding table for 'department': one row per department label,
# one column per embedding dimension
dfs['department']

Unnamed: 0,department_embedding_0,department_embedding_1,department_embedding_2,department_embedding_3,department_embedding_4
Analytics,0.449097,0.592682,-0.268915,-0.607664,-0.476886
Finance,0.144395,0.238316,-0.099049,-0.188486,-0.237083
HR,0.0228,0.147683,0.004303,-0.052294,-0.060762
Legal,0.086517,0.330486,-0.100825,-0.24718,-0.234397
Operations,0.069307,0.261836,-0.099448,-0.221517,-0.249157
Procurement,0.324672,0.132849,-0.490515,-0.137674,-0.350336
R&D,0.395571,0.630304,-0.317113,-0.643205,-0.50245
Sales & Marketing,0.105141,0.003824,-0.168007,0.143321,-0.096353
Technology,0.506522,0.338047,-0.457855,-0.326127,-0.348761


In [16]:
# Embedding table for 'education': one row per education label,
# one column per embedding dimension
dfs['education']

Unnamed: 0,education_embedding_0,education_embedding_1
Bachelor's,-0.380689,0.289853
Below Secondary,-0.308726,0.300239
Master's & above,-0.189884,0.466314


In [20]:
# Add the learned embeddings to the dataset. With drop_categorical_vars=True the
# original categorical columns are removed and only their embedding columns remain.
data = ce.fit_transform(
    X,
    embeddings=embeddings,
    encoders=encoders,
    drop_categorical_vars=True,
)
data.head(5)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_embedding_0,department_embedding_1,department_embedding_2,...,region_embedding_12,region_embedding_13,region_embedding_14,region_embedding_15,region_embedding_16,education_embedding_0,education_embedding_1,gender_embedding_0,recruitment_channel_embedding_0,recruitment_channel_embedding_1
0,1,35,5.0,8,1,0,49,0.105141,0.003824,-0.168007,...,0.178687,0.252653,-0.297972,-0.118256,-0.039458,-0.189884,0.466314,0.382333,-0.397045,0.278604
1,1,30,5.0,4,0,0,60,0.069307,0.261836,-0.099448,...,0.19646,0.210553,-0.130724,-0.215601,-0.147089,-0.380689,0.289853,0.384869,-0.297997,0.373936
2,1,34,3.0,7,0,0,50,0.105141,0.003824,-0.168007,...,0.179156,0.165248,-0.212073,-0.186645,-0.157056,-0.380689,0.289853,0.384869,-0.397045,0.278604
3,2,39,1.0,10,0,0,50,0.105141,0.003824,-0.168007,...,0.137184,0.316768,-0.195833,-0.188082,-0.012853,-0.380689,0.289853,0.384869,-0.297997,0.373936
4,1,45,3.0,2,0,0,73,0.506522,0.338047,-0.457855,...,0.193554,0.230044,-0.097342,-0.129108,-0.080017,-0.380689,0.289853,0.384869,-0.297997,0.373936
