# Modeling full data

Since our full data has many features due to the node embeddings, we need to use robust models such as XGBoost, a Support Vector Machine and a Neural Network. The metric chosen for this evaluation is the F1-score because both classes have the same weight.

## Preparing environment

In [11]:
import pandas as pd
import numpy as np
import sys
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Subset
import optuna
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Importing data

In [3]:
# Read the processed train and test splits from the processed-data directory
train_df, test_df = (
    pd.read_csv(paths.data_processed_dir(file_name))
    for file_name in ('train_processed.csv', 'test_processed.csv')
)

In [4]:
# Saving id_employee for submission

id_col = test_df['id_employee']

In [5]:
# Target variable: the 'resign' column of the training data
y = train_df['resign']

In [6]:
# Dropping unnecessary columns: the identifiers from both splits, and the
# target from the training features

X = train_df.drop(['id_employee', 'id_last_boss', 'resign'], axis='columns')
X_test = test_df.drop(['id_employee', 'id_last_boss'], axis='columns')

# Evaluating and optimizing neural network

In [7]:
# Features become float32 tensors; the labels become int64, the dtype
# nn.CrossEntropyLoss expects for class-index targets
X_train_tensor = torch.as_tensor(X.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.as_tensor(X_test.to_numpy(), dtype=torch.float32)
y_tensor = torch.as_tensor(y.to_numpy(), dtype=torch.int64)

In [8]:
# Defining Neural Network

class Net(nn.Module):
    """Fully connected classifier with a configurable stack of hidden layers.

    Each hidden layer is ``Linear -> activation_fn -> Dropout``; a final
    ``Linear`` maps the last hidden width to ``num_classes`` raw scores.

    Parameters
    ----------
    input_size : int
        Number of input features.
    num_classes : int
        Number of output scores.
    num_layers : int
        Number of hidden layers.
    hidden_size : int
        Width of every hidden layer.
    dropout_rate : float
        Probability passed to each ``nn.Dropout``.
    activation_fn : nn.Module
        Activation module instance; the same instance is reused after every
        hidden ``Linear``.
    """

    def __init__(self, input_size, num_classes, num_layers, hidden_size, dropout_rate, activation_fn):
        super().__init__()

        # Input width of each hidden layer: raw features first, then hidden_size
        widths = [input_size] + [hidden_size] * num_layers

        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [nn.Linear(fan_in, fan_out), activation_fn, nn.Dropout(dropout_rate)]

        # Output head on top of the last hidden layer (or the raw input when there is none)
        modules.append(nn.Linear(widths[-1], num_classes))
        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        return self.network(x)

## Defining function to optimize with Optuna

In [12]:
# Defining objective function

def objective(trial):
    """Optuna objective: mean cross-validated F1-score of a ``Net`` configuration.

    Samples the architecture and optimiser hyperparameters from ``trial``,
    trains a fresh network on each of 5 shuffled K-fold splits of
    ``X_train_tensor`` / ``y_tensor`` and scores it on the held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean over the folds of the best validation F1-score reached in each
        fold. The best epoch is selected on the validation fold itself, so the
        value is an optimistic estimate of generalisation performance.
    """
    max_epochs = 50
    patience = 10  # epochs without F1 improvement tolerated before stopping a fold

    # Hyperparameters to tune.
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
    num_layers = trial.suggest_int('num_layers', 1, 3)
    hidden_size = trial.suggest_int('hidden_size', 32, 256, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    activation_name = trial.suggest_categorical('activation', ['ReLU', 'LeakyReLU', 'ELU'])

    # Choose activation class; anything other than ReLU / LeakyReLU means ELU
    if activation_name == 'ReLU':
        activation_cls = nn.ReLU
    elif activation_name == 'LeakyReLU':
        activation_cls = nn.LeakyReLU
    else:
        activation_cls = nn.ELU

    # NOTE: plain (non-stratified) K-fold; the labels passed to split() do not
    # influence how the folds are drawn.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []

    for train_index, val_index in kfold.split(X_train_tensor, y_tensor):
        train_dataset = TensorDataset(X_train_tensor[train_index], y_tensor[train_index])
        val_dataset = TensorDataset(X_train_tensor[val_index], y_tensor[val_index])

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Fresh model (and activation module) per fold so no state carries over
        model = Net(input_size=X_train_tensor.shape[1],
                    num_classes=2,
                    num_layers=num_layers,
                    hidden_size=hidden_size,
                    dropout_rate=dropout_rate,
                    activation_fn=activation_cls())

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

        # Early stopping state
        best_val_f1 = 0
        trigger_times = 0

        for _ in range(max_epochs):
            model.train()
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                loss = criterion(model(X_batch), y_batch)
                loss.backward()
                optimizer.step()

            # Validation
            model.eval()
            all_preds = []
            all_labels = []
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    # Predicted class is the index of the largest score
                    predicted = model(X_batch).argmax(dim=1)
                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(y_batch.cpu().numpy())

            val_f1 = f1_score(all_labels, all_preds, average='binary')

            # Early stopping check
            if val_f1 > best_val_f1:
                best_val_f1 = val_f1
                trigger_times = 0
            else:
                trigger_times += 1
                if trigger_times >= patience:
                    break  # Early stopping

        f1_scores.append(best_val_f1)

    # Return the average F1-score across folds
    return np.mean(f1_scores)

In [13]:
# Optimizing hyperparameters:

# Seed the torch RNG (weight initialisation, dropout, batch shuffling) and the
# Optuna sampler so a re-run of this cell can repeat the same search.
# TPESampler is Optuna's default single-objective sampler; only the seed is added.
torch.manual_seed(42)
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2024-10-27 22:03:08,497] A new study created in memory with name: no-name-e6dbe675-a897-4cfa-95f9-e9e83d114dd9
[I 2024-10-27 22:03:26,129] Trial 0 finished with value: 0.740130659655074 and parameters: {'num_layers': 1, 'hidden_size': 38, 'dropout_rate': 0.12453051999907157, 'weight_decay': 1.3858937404360746e-05, 'learning_rate': 0.08941043367029315, 'batch_size': 64, 'activation': 'LeakyReLU'}. Best is trial 0 with value: 0.740130659655074.
[I 2024-10-27 22:03:36,105] Trial 1 finished with value: 0.7347869153080807 and parameters: {'num_layers': 3, 'hidden_size': 88, 'dropout_rate': 0.24315879283583125, 'weight_decay': 2.8796386944145745e-05, 'learning_rate': 0.002818712122603081, 'batch_size': 64, 'activation': 'ReLU'}. Best is trial 0 with value: 0.740130659655074.
[I 2024-10-27 22:03:44,057] Trial 2 finished with value: 0.7492799464777041 and parameters: {'num_layers': 1, 'hidden_size': 86, 'dropout_rate': 0.452977629854759, 'weight_decay': 0.007220364515880996, 'learning_rate'

In [14]:
# One line for the header, then one indented "name: value" line per tuned hyperparameter
print(
    "Best hyperparameters:",
    *(f"  {name}: {setting}" for name, setting in study.best_params.items()),
    sep="\n",
)

Best hyperparameters:
  num_layers: 1
  hidden_size: 169
  dropout_rate: 0.2691771974330077
  weight_decay: 9.887535613980591e-05
  learning_rate: 0.0031214454579186267
  batch_size: 64
  activation: ELU


In [15]:
best_params = study.best_params

# Fit on every labelled row: nothing is held out for the final model
train_dataset = TensorDataset(X_train_tensor, y_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=best_params['batch_size'])

# Map the tuned activation name to its module; any other name falls back to ELU
activation_fn = {'ReLU': nn.ReLU, 'LeakyReLU': nn.LeakyReLU}.get(best_params['activation'], nn.ELU)()

# Initialize the model with the best hyperparameters
model = Net(
    X_train_tensor.shape[1],
    2,
    best_params['num_layers'],
    best_params['hidden_size'],
    best_params['dropout_rate'],
    activation_fn,
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=best_params['learning_rate'],
    weight_decay=best_params['weight_decay'],
)

# Train the model for a fixed 50 epochs (adjust as needed)
for epoch in range(50):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(X_batch), y_batch)
        loss.backward()
        optimizer.step()

In [16]:
# Batch the test features (no labels, so each batch is just a feature tensor)
test_loader = DataLoader(X_test_tensor, batch_size=best_params['batch_size'])

# Evaluation mode, so the dropout layers are inactive during inference
model.eval()
nn_pred = []

with torch.no_grad():
    for X_batch in test_loader:
        # The predicted class (0 or 1) is the index of the largest output score
        predicted = model(X_batch).argmax(dim=1)
        nn_pred.extend(predicted.cpu().numpy())

In [17]:
# Saving predicted values: employee ids (column 'ID') next to the predicted class

sub_nn = pd.concat(
    [id_col.rename('ID'), pd.DataFrame({'abandono_6meses': nn_pred})],
    axis=1,
)
sub_nn.to_csv('../results/sub_nn_rfe_manual.csv', index=False, sep=',')

Kaggle Score: 0.5989