# Modeling full data

Since our full data has many features due to node embeddings, we need to use robust models as XGBoost, Support Vector Machine and a Neural Network. The metric chosen metric for this evaluation is F1-Score because both classes have the same weight

## Preparing environment

In [57]:
import pandas as pd
import numpy as np
import sys
from sklearn.svm import SVC
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from kerastuner import HyperModel
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Importing data

In [3]:
train_df = pd.read_csv(paths.data_processed_dir('train_cluster_full.csv'))
test_df = pd.read_csv(paths.data_processed_dir('test_cluster_full.csv'))

In [6]:
# Saving id_colaborador for submission

id_col = test_df['id_colaborador']

In [8]:
y = train_df['abandono_6meses']

In [9]:
# Dropping unnecessary columns

X = train_df.drop(columns=['id_colaborador', 'id_ultimo_jefe', 'abandono_6meses'])
X_test = test_df.drop(columns=['id_colaborador', 'id_ultimo_jefe'])

## Preparing cross validation

In [22]:
# Folding data

kf = KFold(n_splits=5, shuffle=True, random_state=42)

## Evaluating XGBoost Model

In [24]:
# Creating base model

xgb_base = XGBClassifier(random_state=42, eval_metric='mlogloss')

In [25]:
# Making cross-validation in xgboost

cv_results = cross_val_score(xgb_base, X, y, cv=kf, scoring='f1')
cv_results

array([0.62085308, 0.61917808, 0.6377551 , 0.64516129, 0.69437653])

Initial f1 results are not that good, let's optimize the model with cross-validation

In [26]:
# Defining param grid

param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'min_child_weight': [1, 2, 3, 4, 5]
}

In [28]:
# Defining random search

random_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    scoring=make_scorer(f1_score),
    n_iter=100,  # Number of different combinations to try
    cv=kf,
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

In [29]:
# Optimizing model

random_search.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [30]:
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation F1 score: {random_search.best_score_}")

Best parameters found: {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 4, 'max_depth': 5, 'learning_rate': 0.2, 'gamma': 0.2, 'colsample_bytree': 1.0}
Best cross-validation F1 score: 0.6688500699163168


## Evaluating SVC model

In [31]:
# Creating base model

svc_base = SVC(random_state=42)

In [32]:
# Making cross-validation in SVC

cv_results = cross_val_score(svc_base, X, y, cv=kf, scoring='f1')
cv_results

array([0.06222222, 0.27509294, 0.30107527, 0.27419355, 0.27436823])

Well... SVC was even worse than xgboost. Let's check if it can be better with hyperparameters optimization

In [33]:
# Defining param grid

param_dist = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

In [35]:
# Defining random search

random_search = RandomizedSearchCV(
    estimator=svc_base,
    param_distributions=param_dist,
    scoring=make_scorer(f1_score),
    n_iter=100,  # Number of different combinations to try
    cv=kf,
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

In [36]:
# Optimizing model

random_search.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [37]:
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation F1 score: {random_search.best_score_}")

Best parameters found: {'kernel': 'rbf', 'gamma': 0.001, 'C': 1000}
Best cross-validation F1 score: 0.6395125384135041


## Evaluating neural network

Defining optimization function

In [39]:
class MyHyperModel(HyperModel):
    def build(self, hp):
        model = Sequential()
        model.add(Dense(hp.Int('units1', min_value=32, max_value=256, step=32), activation='relu', input_dim=X.shape[1]))
        model.add(Dropout(hp.Float('dropout1', min_value=0.0, max_value=0.5, step=0.1)))
        model.add(Dense(hp.Int('units2', min_value=32, max_value=256, step=32), activation='relu'))
        model.add(Dropout(hp.Float('dropout2', min_value=0.0, max_value=0.5, step=0.1)))
        model.add(Dense(1, activation='sigmoid'))
        
        model.compile(optimizer=Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        
        return model

In [40]:
# Setting F1 Score as a default metric

class F1ScoreCallback(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        val_predict = (self.model.predict(self.validation_data[0]) > 0.5).astype(int)
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        print(f' — val_f1: {_val_f1}')
        logs['val_f1'] = _val_f1

# Include the custom callback in the tuner search
f1_callback = F1ScoreCallback(validation_data=(X, y))

In [42]:
# Setup Keras Tuner to find the best hyperparameters:

from kerastuner.tuners import RandomSearch

# Define the tuner
tuner = RandomSearch(
    MyHyperModel(),
    objective='val_accuracy',
    max_trials=20,
    executions_per_trial=3,
    directory='..\keras_tuner',
    project_name='binary_classification'
)

In [43]:
# Search for best hyperparameters
tuner.search(X, y, epochs=50, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=5), f1_callback])

Trial 20 Complete [00h 00m 11s]
val_accuracy: 0.5684454838434855

Best val_accuracy So Far: 0.6187161803245544
Total elapsed time: 00h 05m 34s


In [45]:
# Retrieve the best model

best_model = tuner.get_best_models(num_models=1)[0]



Cross validation

In [46]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def create_model():
    return best_model

# Create the KerasClassifier
model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=16, verbose=0)

# Define a custom F1 score metric for cross-validation
f1_scorer = make_scorer(f1_score)

# Perform cross-validation with F1 score
scores = cross_val_score(model, X, y, cv=5, scoring=f1_scorer)

print(f"Cross-validation F1 scores: {scores}")
print(f"Mean cross-validation F1 score: {scores.mean()}")

  model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=16, verbose=0)


Cross-validation F1 scores: [0.52367688 0.79187817 0.7751938  0.91866029 0.94035088]
Mean cross-validation F1 score: 0.7899520031071215


The neural network had better results than XGBoost and SVC. Let's evaluate them on test data

## Evaluating best models

### XGBoost

In [77]:
best_xgboost = XGBClassifier(random_state=42, eval_metric='mlogloss',
                             subsample=0.7, n_estimators=500,
                             min_child_weight=4, max_depth=5, learning_rate=0.2,
                             gamma=0.2, colsample_bytree=1.0)

In [78]:
# Fitting the model 

best_xgboost.fit(X, y)

In [79]:
# Predicting values

y_pred_xgb = best_xgboost.predict(X_test)

In [80]:
# Saving predicted values

sub_xgb = pd.DataFrame(y_pred_xgb, columns=['abandono_6meses'])
sub_xgb = pd.concat([id_col, sub_xgb], axis=1)
sub_xgb.rename(columns={'id_colaborador': 'ID'}, inplace=True)
sub_xgb.to_csv('../results/sub_xgb_full.csv', index=False, sep=',')

Kaggle Score:

![image.png](attachment:image.png)

### SVC

In [70]:
best_svc = SVC(random_state=42, kernel='rbf', gamma=0.001, C=1000)

In [71]:
# Fitting the model

best_svc.fit(X, y)

In [72]:
# Predicting values

y_pred_svc = best_svc.predict(X_test)

In [73]:
# Saving predicted values

sub_svc = pd.DataFrame(y_pred_svc, columns=['abandono_6meses'])
sub_svc = pd.concat([id_col, sub_svc], axis=1)
sub_svc.rename(columns={'id_colaborador': 'ID'}, inplace=True)
sub_svc.to_csv('../results/sub_svc_full.csv', index=False, sep=',')

Kaggle Score:

![image.png](attachment:image.png)

### Neural Network

In [74]:
best_model.fit(X, y, epochs=50, batch_size=16, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=5), f1_callback])

Epoch 1/50
 — val_f1: 0.9396984924623115
Epoch 2/50
 — val_f1: 0.9488325881768505
Epoch 3/50
 — val_f1: 0.9460255152109912
Epoch 4/50
 — val_f1: 0.9436758893280631
Epoch 5/50
 — val_f1: 0.9140177175612297
Epoch 6/50
 — val_f1: 0.9375
Epoch 7/50
 — val_f1: 0.9303238469087339
Epoch 8/50
 — val_f1: 0.9449677739216659


<keras.callbacks.History at 0x196caea8130>

In [75]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.5).astype(int)



In [76]:
# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_colaborador': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full.csv', index=False, sep=',')

Kaggle Score:

![image.png](attachment:image.png)