# Modeling reduced data

Since our reduced data has many features due to node embeddings, we need to use robust models as XGBoost, Support Vector Machine and a Neural Network. The metric chosen metric for this evaluation is F1-Score because both classes have the same weight

## Preparing environment

In [1]:
import pandas as pd
import numpy as np
import sys
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from kerastuner import HyperModel
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

  from kerastuner import HyperModel


## Importing data

In [2]:
train_df = pd.read_csv(paths.data_processed_dir('train_full_red_processed.csv'))
test_df = pd.read_csv(paths.data_processed_dir('test_full_red_processed.csv'))

In [3]:
# Saving id_colaborador for submission

id_col = test_df['id_employee_employee']

In [4]:
y = train_df['resign']

In [5]:
# Dropping unnecessary columns

X = train_df.drop(columns=['id_employee_employee', 'id_last_boss_employee', 'id_last_boss_boss', 'resign'])
X_test = test_df.drop(columns=['id_employee_employee', 'id_last_boss_employee', 'id_last_boss_boss'])

## Preparing cross validation

In [6]:
# Folding data

kf = KFold(n_splits=5, shuffle=True, random_state=42)

## Evaluating XGBoost Model

In [7]:
# Creating base model

xgb_base = XGBClassifier(random_state=42, eval_metric='mlogloss')

In [8]:
# Making cross-validation in xgboost

cv_results = cross_val_score(xgb_base, X, y, cv=kf, scoring='f1')
cv_results

array([0.71078431, 0.68888889, 0.72062663, 0.68632708, 0.725     ])

Initial f1 results are good, let's optimize the model with cross-validation

In [9]:
# Defining param grid

param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'min_child_weight': [1, 2, 3, 4, 5]
}

In [10]:
# Defining random search

random_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    scoring=make_scorer(f1_score),
    n_iter=100,  # Number of different combinations to try
    cv=kf,
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

In [11]:
# Optimizing model

random_search.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [12]:
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation F1 score: {random_search.best_score_}")

Best parameters found: {'subsample': 0.6, 'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 1.0}
Best cross-validation F1 score: 0.7355424033866459


## Evaluating neural network

Defining optimization function

In [13]:
class MyHyperModel(HyperModel):
    def build(self, hp):
        model = Sequential()
        model.add(Dense(hp.Int('units1', min_value=32, max_value=256, step=32), activation='relu', input_dim=X.shape[1]))
        model.add(Dropout(hp.Float('dropout1', min_value=0.0, max_value=0.5, step=0.1)))
        model.add(Dense(hp.Int('units2', min_value=32, max_value=256, step=32), activation='relu'))
        model.add(Dropout(hp.Float('dropout2', min_value=0.0, max_value=0.5, step=0.1)))
        model.add(Dense(1, activation='sigmoid'))
        
        model.compile(optimizer=Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        
        return model

In [14]:
# Setting F1 Score as a default metric

class F1ScoreCallback(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        val_predict = (self.model.predict(self.validation_data[0]) > 0.5).astype(int)
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        print(f' — val_f1: {_val_f1}')
        logs['val_f1'] = _val_f1

# Include the custom callback in the tuner search
f1_callback = F1ScoreCallback(validation_data=(X, y))

In [21]:
# Setup Keras Tuner to find the best hyperparameters:

from kerastuner.tuners import RandomSearch

# Define the tuner
tuner = RandomSearch(
    MyHyperModel(),
    objective='val_accuracy',
    max_trials=20,
    executions_per_trial=3,
    directory='..\keras_tuner_3',
    project_name='binary_classification'
)

In [22]:
# Search for best hyperparameters
tuner.search(X, y, epochs=50, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=5), f1_callback])

Trial 20 Complete [00h 00m 19s]
val_accuracy: 0.6674400568008423

Best val_accuracy So Far: 0.7092034022013346
Total elapsed time: 00h 07m 19s


In [23]:
# Retrieve the best model

best_model = tuner.get_best_models(num_models=1)[0]

In [24]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [25]:
print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units1')} and the optimal number of units in the second densely-connected layer is {best_hps.get('units2')}.
The best dropout rates are {best_hps.get('dropout1')} and {best_hps.get('dropout2')}.
The best learning rate for the optimizer is {best_hps.get('learning_rate')}.
""")


The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 224 and the optimal number of units in the second densely-connected layer is 128.
The best dropout rates are 0.0 and 0.30000000000000004.
The best learning rate for the optimizer is 0.0005765399284932614.



Cross validation

In [26]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def create_model():
    return best_model

# Create the KerasClassifier
model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=16, verbose=0)

# Define a custom F1 score metric for cross-validation
f1_scorer = make_scorer(f1_score)

# Perform cross-validation with F1 score
scores = cross_val_score(model, X, y, cv=kf, scoring=f1_scorer)

print(f"Cross-validation F1 scores: {scores}")
print(f"Mean cross-validation F1 score: {scores.mean()}")

  model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=16, verbose=0)


Cross-validation F1 scores: [0.69976359 0.75543478 0.8241206  0.86021505 0.88361045]
Mean cross-validation F1 score: 0.804628896814848


The neural network had better results than XGBoost. Let's evaluate them on test data

## Evaluating best models

### XGBoost

In [27]:
best_xgboost = XGBClassifier(random_state=42, eval_metric='mlogloss',
                             subsample=0.6, n_estimators=100,
                             min_child_weight=5, max_depth=3, learning_rate=0.05,
                             gamma=0.2, colsample_bytree=1.0)

In [28]:
# Fitting the model 

best_xgboost.fit(X, y)

In [29]:
# Predicting values

y_pred_xgb = best_xgboost.predict(X_test)

In [30]:
# Saving predicted values

sub_xgb = pd.DataFrame(y_pred_xgb, columns=['abandono_6meses'])
sub_xgb = pd.concat([id_col, sub_xgb], axis=1)
sub_xgb.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_xgb.to_csv('../results/sub_xgb_full_red.csv', index=False, sep=',')

Kaggle Score:

![image.png](attachment:image.png)

### Neural Network

In [31]:
best_model.fit(X, y, epochs=50, batch_size=16, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=5), f1_callback])

Epoch 1/50
 — val_f1: 0.9438648782911079
Epoch 2/50
 — val_f1: 0.9391824526420738
Epoch 3/50
 — val_f1: 0.9457441513190642
Epoch 4/50
 — val_f1: 0.9493734335839599
Epoch 5/50
 — val_f1: 0.9445544554455446
Epoch 6/50
 — val_f1: 0.9502487562189055
Epoch 7/50
 — val_f1: 0.9503261414952334
Epoch 8/50
 — val_f1: 0.9475766567754699
Epoch 9/50
 — val_f1: 0.9475728155339807


<keras.callbacks.History at 0x197e2e549d0>

In [32]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.5).astype(int)



In [33]:
# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red.csv', index=False, sep=',')

Kaggle Score:

![image.png](attachment:image.png)

In [34]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.55).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_55.csv', index=False, sep=',')



In [35]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.60).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_60.csv', index=False, sep=',')



In [36]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.65).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_65.csv', index=False, sep=',')



In [37]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.70).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_70.csv', index=False, sep=',')



In [38]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.75).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_75.csv', index=False, sep=',')



In [39]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.80).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_80.csv', index=False, sep=',')



In [40]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.85).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_85.csv', index=False, sep=',')



In [41]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.90).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_90.csv', index=False, sep=',')



In [42]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.45).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_45.csv', index=False, sep=',')



In [43]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.40).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_40.csv', index=False, sep=',')



In [44]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.35).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_35.csv', index=False, sep=',')



In [45]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.30).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_30.csv', index=False, sep=',')



In [46]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.25).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_25.csv', index=False, sep=',')



In [47]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.20).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_20.csv', index=False, sep=',')



In [48]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.15).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_15.csv', index=False, sep=',')



In [49]:
# Predicting values

y_pred_nn = (best_model.predict(X_test) > 0.10).astype(int)

# Saving predicted values

sub_nn = pd.DataFrame(y_pred_nn, columns=['abandono_6meses'])
sub_nn = pd.concat([id_col, sub_nn], axis=1)
sub_nn.rename(columns={'id_employee_employee': 'ID'}, inplace=True)
sub_nn.to_csv('../results/sub_nn_full_red_10.csv', index=False, sep=',')

