# Node generation

Since id_ultimo_jefe has many categories that cannot be reduced, let's create node embeddings to catch their interactions

## Preparing environment

In [1]:
import pandas as pd
import networkx as nx
from node2vec import Node2Vec as n2v
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Importing data

In [2]:
train_df = pd.read_csv(paths.data_interim_dir('train_clean.csv'))
test_df = pd.read_csv(paths.data_interim_dir('test_clean.csv'))

In [3]:
target = train_df['abandono_6meses']
combined_df = pd.concat([train_df.drop('abandono_6meses', axis=1), test_df])

In [4]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4172 entries, 0 to 2019
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_colaborador        4172 non-null   int64  
 1   id_ultimo_jefe        4172 non-null   object 
 2   seniority             4172 non-null   int64  
 3   modalidad_trabajo     4172 non-null   object 
 4   distancia_oficina     4172 non-null   float64
 5   dias_baja_salud       4172 non-null   int64  
 6   genero                4172 non-null   object 
 7   canal_reclutamiento   4172 non-null   object 
 8   permanencia_promedio  4172 non-null   int64  
 9   salario               4172 non-null   int64  
 10  performance_score     4172 non-null   float64
 11  psi_score             4172 non-null   int64  
 12  estado_civil          4172 non-null   object 
 13  age                   4172 non-null   int64  
 14  join_year             4172 non-null   int64  
 15  join_month            4172

## Creating edges

In [5]:
edges = combined_df[['id_colaborador', 'id_ultimo_jefe']].values.tolist()

## Creating graph

In [6]:
G = nx.Graph()
G.add_edges_from(edges)

## Generating Node Embeddings

In [7]:
# Precompute probabilities and generate walks

node2vec = n2v(G, dimensions=64, walk_length=30, num_walks=200, workers=4)

# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)

Computing transition probabilities:   0%|          | 0/4346 [00:00<?, ?it/s]

Computing transition probabilities: 100%|██████████| 4346/4346 [00:00<00:00, 11106.02it/s]


## Storing Node Embeddings

In [8]:
# Create a dataframe to store the embeddings
node_embeddings = pd.DataFrame([model.wv[str(node)] for node in G.nodes()], index=G.nodes())

In [10]:
node_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
100247,-1.208497,0.584044,0.501762,0.191002,0.463499,0.160902,0.258567,-0.188635,-0.253489,0.843109,...,0.571494,1.078342,-0.404903,0.359761,-0.473558,0.211984,0.098701,0.260059,-0.099771,-0.405889
102074.0,-2.033880,1.567113,1.249674,1.158858,0.401372,0.091219,0.595954,-0.033343,-1.268325,1.341976,...,1.049352,2.176144,-1.050692,0.820663,-0.889509,0.120322,0.591797,0.866077,-0.550841,-0.723592
103355,-1.216124,-0.070381,-0.685654,0.683935,0.233835,-0.035867,0.105389,-0.726332,-0.588331,0.980879,...,1.067517,0.465736,-0.521211,0.092901,-0.626370,0.294451,0.008587,0.444508,0.114222,-0.207144
102115.0,-1.482928,-0.269587,-0.958590,1.110772,-0.004303,-0.213561,0.386331,-0.506779,-1.198685,1.104772,...,1.540121,0.674904,-0.641393,0.215567,-0.930798,0.426226,0.359258,0.682229,-0.038215,-0.297015
100669,-1.520101,-0.261058,0.251060,-0.739334,0.659555,0.184285,0.150205,-0.017884,0.199660,0.621867,...,0.042893,0.295017,-0.568597,0.260654,0.057325,0.414595,-0.603350,0.133116,-0.135542,0.120284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103755,-0.904602,0.357356,0.333971,-0.041260,1.490299,-0.226121,0.930183,-0.044985,0.092763,0.718671,...,0.679968,0.472354,-0.080846,0.534388,-0.255796,0.549578,-0.207278,0.229855,0.101068,0.102280
103976,-1.075785,-0.142823,0.753274,-0.511229,0.660110,-0.402771,0.636521,-0.321436,0.899026,1.239976,...,0.447152,0.012073,-0.175676,0.807267,0.602364,0.248607,0.362763,0.806506,0.573435,-0.561659
104115,-0.726761,0.214652,-0.073644,-0.397174,1.023877,-0.266387,0.180812,-0.122875,0.591294,0.775500,...,-0.279576,0.878455,-0.068295,0.597112,-0.302268,0.916951,0.026212,0.236854,0.366351,0.036911
103920,-1.669657,0.049895,0.387587,0.038337,1.211918,0.537539,0.400646,-0.379161,0.234294,0.964517,...,0.392032,0.893673,-0.090157,0.644120,-0.386454,0.281980,-0.392755,0.400653,0.721805,0.112954


## Merge node embeddings with original data

In [15]:
id_colaborador_embeddings = node_embeddings.loc[combined_df['id_colaborador']].reset_index(drop=True)
id_ultimo_jefe_embeddings = node_embeddings.loc[combined_df['id_ultimo_jefe'].astype(str)].reset_index(drop=True)

In [20]:
# Rename columns to distinguish between the embeddings
id_colaborador_embeddings.columns = [f'id_colaborador_emb_{i}' for i in range(id_colaborador_embeddings.shape[1])]
id_ultimo_jefe_embeddings.columns = [f'id_ultimo_jefe_emb_{i}' for i in range(id_ultimo_jefe_embeddings.shape[1])]

In [21]:
# Merge embeddings with the combined dataframe
combined_df = pd.concat([combined_df.reset_index(drop=True), id_colaborador_embeddings, id_ultimo_jefe_embeddings], axis=1)

## Split back into Train and Test DataFrames

In [36]:
train_new = combined_df.loc[:len(train_df)-1,:].reset_index(drop=True)
test_new = combined_df.loc[len(train_df):, :].reset_index(drop=True)

In [37]:
train_new

Unnamed: 0,id_colaborador,id_ultimo_jefe,seniority,modalidad_trabajo,distancia_oficina,dias_baja_salud,genero,canal_reclutamiento,permanencia_promedio,salario,...,id_ultimo_jefe_emb_54,id_ultimo_jefe_emb_55,id_ultimo_jefe_emb_56,id_ultimo_jefe_emb_57,id_ultimo_jefe_emb_58,id_ultimo_jefe_emb_59,id_ultimo_jefe_emb_60,id_ultimo_jefe_emb_61,id_ultimo_jefe_emb_62,id_ultimo_jefe_emb_63
0,100247,102074.0,1,Híbrida,1.760,1,Mujer,Ferias & Networking,1,140011,...,1.049352,2.176144,-1.050692,0.820663,-0.889509,0.120322,0.591797,0.866077,-0.550841,-0.723592
1,103355,102115.0,1,Híbrida,0.760,2,Hombre,Ferias & Networking,2,182774,...,1.540121,0.674904,-0.641393,0.215567,-0.930798,0.426226,0.359258,0.682229,-0.038215,-0.297015
2,100669,102060.0,1,Híbrida,4.950,3,Mujer,Referidos,11,682106,...,-0.756675,0.012549,-1.788070,0.192864,0.763654,0.777990,-1.448484,0.587417,-1.082647,0.643463
3,103760,102062.0,1,Híbrida,13.030,2,Hombre,Linkedin,2,270232,...,3.548049,-0.192672,-1.055250,-1.110643,-2.039503,1.191401,-0.997697,1.442482,0.172444,-0.671334
4,100965,102062.0,1,Híbrida,13.045,2,Hombre,Linkedin,2,266804,...,3.548049,-0.192672,-1.055250,-1.110643,-2.039503,1.191401,-0.997697,1.442482,0.172444,-0.671334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2147,103567,102171.0,1,Presencial,1.965,8,Mujer,Portal Web,3,281159,...,0.381682,0.591231,-0.933768,0.650452,1.322455,-0.257942,0.402153,0.723248,1.031927,-0.752369
2148,104098,102172.0,1,Presencial,0.805,12,Mujer,Linkedin,13,460337,...,0.412334,1.279112,-0.327395,1.579410,-0.737667,1.092553,-0.864464,-0.387034,0.865117,-0.321175
2149,103987,102155.0,1,Presencial,1.625,1,Hombre,Portal Web,11,633879,...,-0.942364,0.847157,0.519598,0.373489,1.571187,1.772639,1.044827,1.900588,1.748371,-1.636623
2150,103810,102141.0,1,Presencial,5.665,2,Hombre,Portal Web,6,793977,...,-0.342117,0.452211,0.861997,-0.267220,-0.845212,1.053854,0.883732,-0.497578,-1.611750,1.191192


In [38]:
test_new

Unnamed: 0,id_colaborador,id_ultimo_jefe,seniority,modalidad_trabajo,distancia_oficina,dias_baja_salud,genero,canal_reclutamiento,permanencia_promedio,salario,...,id_ultimo_jefe_emb_54,id_ultimo_jefe_emb_55,id_ultimo_jefe_emb_56,id_ultimo_jefe_emb_57,id_ultimo_jefe_emb_58,id_ultimo_jefe_emb_59,id_ultimo_jefe_emb_60,id_ultimo_jefe_emb_61,id_ultimo_jefe_emb_62,id_ultimo_jefe_emb_63
0,100486,102115.0,1,Híbrida,0.795,2,Hombre,Ferias & Networking,2,181654,...,1.540121,0.674904,-0.641393,0.215567,-0.930798,0.426226,0.359258,0.682229,-0.038215,-0.297015
1,103752,102074.0,1,Híbrida,1.715,1,Mujer,Ferias & Networking,1,140986,...,1.049352,2.176144,-1.050692,0.820663,-0.889509,0.120322,0.591797,0.866077,-0.550841,-0.723592
2,103937,102150.0,1,Híbrida,2.375,1,Hombre,Portal Web,6,406690,...,-0.711243,-0.829307,-2.017607,2.575541,0.802143,3.165653,0.455581,-0.539101,0.760924,-0.944239
3,101744,102172.0,1,Híbrida,2.185,3,Hombre,Linkedin,3,249107,...,0.412334,1.279112,-0.327395,1.579410,-0.737667,1.092553,-0.864464,-0.387034,0.865117,-0.321175
4,101037,102060.0,1,Híbrida,4.660,3,Hombre,Referidos,10,698318,...,-0.756675,0.012549,-1.788070,0.192864,0.763654,0.777990,-1.448484,0.587417,-1.082647,0.643463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015,103755,102161.0,1,Presencial,2.995,2,Mujer,Portal Web,16,700814,...,0.381334,-1.369624,-0.362420,1.153134,0.824333,0.737606,0.017917,0.879374,0.295898,2.562610
2016,103976,102171.0,1,Presencial,2.775,0,Mujer,Portal Web,5,677071,...,0.381682,0.591231,-0.933768,0.650452,1.322455,-0.257942,0.402153,0.723248,1.031927,-0.752369
2017,104115,102144.0,1,Presencial,3.990,1,Hombre,Portal Web,7,876285,...,-0.920747,1.786916,-0.121925,1.052400,-0.271034,1.982778,0.189099,0.420293,0.674963,0.056658
2018,103920,102152.0,1,Presencial,7.100,1,Mujer,Linkedin,8,719102,...,0.581979,1.415647,0.175420,1.656245,-1.585642,-2.217968,-1.998916,2.154059,3.729241,1.530018


## Saving new DataFrames

In [39]:
train_new.to_csv(paths.data_interim_dir('train_node.csv'), index=False, sep=',')
test_new.to_csv(paths.data_interim_dir('test_node.csv'), index=False, sep=',')