# Modeling full data

Since our full data has many features due to the node embeddings, we need robust models such as XGBoost and a neural network. The metric chosen for this evaluation is the F1-score because both classes have the same weight.

## Preparing environment

In [1]:
import pandas as pd
import numpy as np
import sys
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Subset
import optuna
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Importing data

In [3]:
# Read the train and test CSVs; their locations are resolved by
# paths.data_processed_dir.
train_path = paths.data_processed_dir('reduced_train.csv')
train_df = pd.read_csv(train_path)

test_path = paths.data_processed_dir('reduced_test.csv')
test_df = pd.read_csv(test_path)

In [4]:
# Keep the test-set employee ids so they can be attached to the submission file.

id_col = test_df.loc[:, 'id_employee']

In [5]:
# Target vector: the 'resign' column of the training frame.
y = train_df.loc[:, 'resign']

In [6]:
# Build the feature matrices: identifier columns are removed from both
# frames, and the training frame additionally loses the target column.

identifier_cols = ['id_employee', 'id_last_boss']

X = train_df.drop(columns=[*identifier_cols, 'resign'])
X_test = test_df.drop(columns=identifier_cols)

In [7]:
# Columns treated as categorical features by the models.
categorical = ['gender', 'gender_boss', 'seniority', 'recruitment_channel', 'join_year', 'marital_estatus', 'join_month_boss', 'join_year_boss', 'performance']

# Cast train and test to the SAME categorical dtype, column by column.
# Calling .astype('category') on each frame separately derives the category
# set (and therefore the integer codes) from the values present in that frame
# only, so one label could receive different codes in train and test, or be
# missing from one side. The model is built with enable_categorical=True and
# needs a consistent encoding between fitting and prediction, so the category
# set is taken from the union of both frames.
for col in categorical:
    shared_categories = (
        pd.concat([X[col], X_test[col]], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    shared_dtype = pd.CategoricalDtype(categories=shared_categories)
    X[col] = X[col].astype(shared_dtype)
    X_test[col] = X_test[col].astype(shared_dtype)

In [23]:
# Display the column index of the training feature matrix.
X.columns

Unnamed: 0,gender,Abs(performance_score)/performance_score,gender_boss,gender_Mujer*gender_boss_Mujer,seniority_2*Abs(psi_score_boss),gender_boss_Mujer*Abs(join_age_boss),performance_score,gender_Mujer*recruitment_channel_PortalWeb,gender_Mujer*marital_estatus_Soltero,seniority,...,embedding_boss_55,embedding_boss_56,embedding_boss_57,embedding_boss_58,embedding_boss_59,embedding_boss_60,embedding_boss_61,embedding_boss_62,embedding_boss_63,embedding_boss_64
0,Mujer,1.0,Hombre,0.0,0.0,0.000000,99,0.0,1.0,1,...,-1.025180,-0.510359,0.867795,-0.236739,-0.020616,2.016547,-1.538088,0.673502,-3.966547,0.690487
1,Hombre,1.0,Mujer,0.0,0.0,0.442183,99,0.0,0.0,1,...,-0.111634,0.099191,-1.411987,0.763236,-0.620354,3.045463,0.109220,2.197151,-2.345624,-2.300716
2,Mujer,1.0,Hombre,0.0,0.0,0.000000,96,0.0,0.0,1,...,0.991722,0.476858,0.012612,-0.322904,-2.159082,0.858201,-0.037781,2.198319,-3.286139,0.638560
3,Hombre,1.0,Mujer,0.0,0.0,1.683178,96,0.0,0.0,1,...,0.079957,-0.502130,0.991737,-0.679418,-2.373596,2.779397,0.748496,-0.247417,-1.656327,-0.252358
4,Hombre,1.0,Mujer,0.0,0.0,1.683178,95,0.0,0.0,1,...,0.079957,-0.502130,0.991737,-0.679418,-2.373596,2.779397,0.748496,-0.247417,-1.656327,-0.252358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2147,Mujer,1.0,Hombre,0.0,0.0,0.000000,80,1.0,0.0,1,...,-0.109445,-0.196705,-0.777604,-1.874209,-3.026741,3.315532,0.186572,1.629825,-2.550516,-1.192857
2148,Mujer,1.0,Hombre,0.0,0.0,0.000000,71,0.0,0.0,1,...,0.344714,-0.182652,1.170694,0.491200,-1.659328,0.995259,-0.232451,0.376152,-3.619655,-1.564334
2149,Hombre,1.0,Mujer,0.0,0.0,1.771820,77,0.0,0.0,1,...,-0.314309,1.200129,-0.637064,-0.786244,-3.396498,1.790005,-1.436900,0.385597,-2.636923,-0.108442
2150,Hombre,1.0,Mujer,0.0,0.0,0.708110,77,0.0,0.0,1,...,-0.708151,0.358218,-0.413883,-1.762365,-1.915706,2.726416,0.027674,2.701519,-2.674003,-0.791803


## Preparing cross validation

In [8]:
# Shared splitter: 5 stratified folds, shuffled with a fixed seed so every
# evaluation below sees the same partitions.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## Evaluating XGBoost Model

In [9]:
# Baseline XGBoost classifier with default hyper-parameters. It uses the
# DART booster with histogram tree construction, native handling of pandas
# categorical columns, and training on the CUDA device.

xgb_base = XGBClassifier(
    objective='binary:logistic',
    booster='dart',
    tree_method='hist',
    eval_metric='logloss',
    enable_categorical=True,
    device='cuda',
    random_state=42,
)

In [10]:
# Fit the baseline model on the full training set (the fitted estimator is
# the cell's displayed value).
xgb_base.fit(X=X, y=y)

In [11]:
# Baseline performance: F1 of the untuned model on each fold of the shared splitter.

cv_results = cross_val_score(
    estimator=xgb_base,
    X=X,
    y=y,
    scoring='f1',
    cv=cv,
)
cv_results

array([0.71204188, 0.71392405, 0.71938776, 0.74093264, 0.70559611])

The initial F1 results are good. Next, let's tune the model's hyperparameters with a cross-validated randomized search.

In [12]:
# Candidate values for the randomized hyper-parameter search.

_sampling_fractions = np.linspace(0.5, 1.0, 6)       # 0.5, 0.6, ..., 1.0
_regularization_strengths = np.logspace(-3, 1, 10)   # 1e-3 ... 10, log-spaced

param_dist = dict(
    n_estimators=np.arange(100, 1100, 100),          # 100, 200, ..., 1000
    max_depth=np.arange(3, 11),                      # 3 ... 10
    learning_rate=np.logspace(-3, 0, 10),            # 1e-3 ... 1, log-spaced
    subsample=_sampling_fractions,
    colsample_bytree=_sampling_fractions,
    reg_alpha=_regularization_strengths,
    reg_lambda=_regularization_strengths,
    min_child_weight=np.arange(1, 6),                # 1 ... 5
)

In [13]:
# Randomized search over param_dist, starting from the baseline estimator
# and scored with F1 on the shared CV splitter.

f1_scorer = make_scorer(f1_score)

random_search = RandomizedSearchCV(
    xgb_base,
    param_dist,
    n_iter=50,          # number of parameter combinations to sample
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,          # use all available cores
    random_state=42,
    verbose=2,
)

In [14]:
# Run the randomized search on the full training set.

random_search.fit(X=X, y=y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [15]:
# Report the winning configuration and its mean cross-validated F1.
best_params_line = f"Best parameters found: {random_search.best_params_}"
best_score_line = f"Best cross-validation F1 score: {random_search.best_score_}"
print(best_params_line, best_score_line, sep="\n")

Best parameters found: {'subsample': 0.9, 'reg_lambda': 3.593813663804626, 'reg_alpha': 0.1668100537200059, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.046415888336127774, 'colsample_bytree': 0.5}
Best cross-validation F1 score: 0.7305970467640016


In [16]:
# Keep a handle on the best hyper-parameters found by the randomized search
# so they can be passed to the final model.
xgb_params = random_search.best_params_

In [19]:
# Final model: the same fixed settings as the baseline plus the tuned
# hyper-parameters from the search.
# NOTE(review): unlike xgb_base, no device='cuda' is passed here, so this
# model uses XGBoost's default device — confirm that this is intended.
fixed_settings = dict(
    objective='binary:logistic',
    booster='dart',
    tree_method='hist',
    eval_metric='logloss',
    enable_categorical=True,
    random_state=42,
)
best_xgboost = XGBClassifier(**fixed_settings, **xgb_params)

In [20]:
# Train the tuned model on the complete training set.

best_xgboost.fit(X=X, y=y)

In [21]:
# Predict class labels for the test feature matrix.

y_pred_xgb = best_xgboost.predict(X=X_test)

In [22]:
# Saving predicted values
#
# Pair each test employee id with its prediction BY POSITION. The ids are
# converted to a plain array first, so the pairing does not depend on the
# index labels of id_col. Concatenating along axis=1 would align on index
# labels and silently introduce NaN rows if id_col ever carried a non-default
# index. A length mismatch now raises instead of producing a misaligned file.

sub_xgb = pd.DataFrame({
    'ID': id_col.to_numpy(),
    'abandono_6meses': y_pred_xgb,
})
sub_xgb.to_csv('../results/sub_xgb_rfe_manual.csv', index=False, sep=',')

Kaggle Score: 0.6106