# Modeling full data

Since our data is composed by multiple boss-employees relationships, we can model it as multiple Directed Acyclic Graphs. Graph Neural Networks are specialized on detecting and learning hidden patterns and characteristics from each graph and their nodes, in order to make more accurate predictions than traditional Machine Learning or Deep Learning models on graphs, since they assume that all the data is independent.

## Preparing environment

In [1]:
import pandas as pd
import numpy as np
import sys
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
import optuna
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

In [2]:
import warnings

# Ignore all future warnings
warnings.filterwarnings('ignore', category=FutureWarning)

## Importing data

In [3]:
train_df = pd.read_csv(paths.data_processed_dir('train_red_processed.csv'))
test_df = pd.read_csv(paths.data_processed_dir('test_red_processed.csv'))

In [4]:
# Saving test id_employee for submission

id_col = test_df['id_employee']

## Modeling the graph

In [5]:
# Combining both dataframes to model the graph
combined_df = pd.concat([train_df, test_df], ignore_index=True)

In [6]:
# Modeling the graph
# Create a mapping of employees to indices (node IDs)
employee_ids = combined_df['id_employee'].unique()
employee_id_map = {id_: idx for idx, id_ in enumerate(employee_ids)}

# Map employees and bosses to indices for graph construction
combined_df['employee_idx'] = combined_df['id_employee'].map(employee_id_map)
combined_df['boss_idx'] = combined_df['id_last_boss'].map(employee_id_map)

# Split train and test datasets again, but with the processed features
train_mask = combined_df['resign'].notna()
test_mask = ~train_mask

train_labels = combined_df.loc[train_mask, 'resign'].astype(int).values
test_data = combined_df.loc[test_mask].reset_index(drop=True)

# Node features
node_features = torch.tensor(combined_df.drop(columns=['id_employee', 'id_last_boss', 'employee_idx', 'boss_idx', 'resign']).to_numpy(), dtype=torch.float)

# Create edge index
edge_index = torch.tensor([combined_df['employee_idx'].to_numpy(), combined_df['boss_idx'].to_numpy()], dtype=torch.long)

  edge_index = torch.tensor([combined_df['employee_idx'].to_numpy(), combined_df['boss_idx'].to_numpy()], dtype=torch.long)


In [7]:
# Computing class weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=train_labels)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Modeling using GATConv

GATConv is a framework that can handle varying importance of node connections, useful for hierarchical structure.

In [8]:
# Creating the model

class GAT(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=1, dropout=0.6):
        super(GAT, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        self.conv2 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, dropout=dropout)
        
    def forward(self, x, edge_index):
        # First GAT Layer
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        
        # Second GAT Layer
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

## Defining the objective function to optimize

In [9]:
# Creating function to optimize the model

def objective(trial):
    # Hyperparameter search space
    hidden_dim = trial.suggest_int('hidden_dim', 16, 128)
    heads = trial.suggest_int('heads', 1, 8)
    dropout = trial.suggest_float('dropout', 0.2, 0.7)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-3)
    
    # Initialize cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    
    for train_idx, val_idx in skf.split(combined_df[train_mask].index, train_labels):
        train_x = combined_df.iloc[train_idx]
        val_x = combined_df.iloc[val_idx]
        train_y = torch.tensor(train_labels[train_idx], dtype=torch.long)
        val_y = torch.tensor(train_labels[val_idx], dtype=torch.long)
    
    # Model
    model = GAT(input_dim=node_features.shape[1], hidden_dim=hidden_dim, output_dim=2, heads=heads, dropout=dropout)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    
    # Training loop
    patience = 10
    best_val_f1 = 0
    no_improve_counter = 0
    
    for epoch in range(100):
        model.train()
        optimizer.zero_grad()
        out = model(node_features, edge_index)
        loss = F.nll_loss(out[train_idx], train_y, weight=class_weights_tensor)
        loss.backward()
        optimizer.step()
        
        # Validation
        model.eval()
        out_val = model(node_features, edge_index)
        val_pred = out_val[val_idx].argmax(dim=1)
        val_f1 = f1_score(val_y.cpu(), val_pred.cpu(), average='macro')
        
        # Early stopping logic
        if val_f1 > best_val_f1 + 0.01:
            best_val_f1 = val_f1
            no_improve_counter = 0
        else:
            no_improve_counter += 1
        
        if no_improve_counter >= patience:
            break
    
    f1_scores.append(best_val_f1)

    mean_f1 = sum(f1_scores) / len(f1_scores)
    return mean_f1

## Optimizing hyperparameters

In [10]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print hyperparameters and best F1 score
print('Best hyperparameters: ', study.best_params)
print(f"Best F1 score: {study.best_value}")

[I 2024-09-08 20:44:39,175] A new study created in memory with name: no-name-160917d7-70ca-45fb-a56f-db4589322988
[I 2024-09-08 20:44:42,261] Trial 0 finished with value: 0.6951493405635922 and parameters: {'hidden_dim': 54, 'heads': 8, 'dropout': 0.6939289139602758, 'learning_rate': 0.0028841204630924307, 'weight_decay': 0.0003078305489260507}. Best is trial 0 with value: 0.6951493405635922.
[I 2024-09-08 20:44:44,364] Trial 1 finished with value: 0.6688013027882309 and parameters: {'hidden_dim': 61, 'heads': 8, 'dropout': 0.33241731158045107, 'learning_rate': 0.008022026281975403, 'weight_decay': 2.9684249017019193e-05}. Best is trial 0 with value: 0.6951493405635922.
[I 2024-09-08 20:44:45,192] Trial 2 finished with value: 0.6472326808251702 and parameters: {'hidden_dim': 17, 'heads': 6, 'dropout': 0.3961642852617505, 'learning_rate': 0.006047805094970257, 'weight_decay': 0.0005035185786571049}. Best is trial 0 with value: 0.6951493405635922.
[I 2024-09-08 20:44:50,535] Trial 3 fini

Best hyperparameters:  {'hidden_dim': 54, 'heads': 8, 'dropout': 0.6939289139602758, 'learning_rate': 0.0028841204630924307, 'weight_decay': 0.0003078305489260507}
Best F1 score: 0.6951493405635922


## Training model with optimized hyperparameters

In [11]:
# Final training using best hyperparameters
best_params = study.best_params


model = GAT(input_dim=node_features.shape[1], hidden_dim=best_params['hidden_dim'], 
            output_dim=2, heads=best_params['heads'], dropout=best_params['dropout'])
optimizer = torch.optim.Adam(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])

# Train on the full train data
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(node_features, edge_index)
    loss = F.nll_loss(out[train_mask], torch.tensor(train_labels, dtype=torch.long), weight=class_weights_tensor)
    loss.backward()
    optimizer.step()

In [12]:
# Make predictions on the test set
model.eval()
test_out = model(node_features, edge_index)
test_pred = test_out[test_mask].argmax(dim=1)

# Save or process predictions
test_df['resign'] = test_pred.cpu().numpy()

In [13]:
test_df['resign'].value_counts()

resign
0    1699
1     321
Name: count, dtype: int64

In [14]:
test_df[['id_employee', 'resign']].rename({'id_employee':'ID', 'resign':'abandono_6meses'}, axis=1).to_csv('sub_GAT_red.csv', index=False)