# Ensemble RoBERTa and Gradient Boosting

## Load data

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import os
import re
from skmultilearn.model_selection import iterative_train_test_split

# Load data
embeddings_df = pd.read_csv('test_embeddings.csv', index_col='id')
labels_df = pd.read_csv('test_binary_predictions.csv', index_col='id')
data_df = embeddings_df.join(labels_df)

In [2]:
data_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,764,765,766,767,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
680,0.899451,-0.208868,-0.258338,0.616397,0.481268,-0.963264,0.121004,-0.098333,-0.837639,0.109306,...,1.392062,-0.169403,0.048053,-0.253487,0.0,0.0,1.0,0.0,0.0,0.0
156,1.056375,-0.066681,-0.390495,0.637470,0.460121,-0.925095,0.166863,-0.185645,-0.803199,0.129026,...,1.540491,-0.201815,-0.224661,-0.083501,0.0,0.0,1.0,0.0,0.0,0.0
177,-0.309735,0.278226,-0.732789,0.280921,0.320329,-0.098903,-0.275068,-0.134264,-0.184539,-0.239287,...,0.951008,0.941368,0.702085,-1.465269,0.0,0.0,0.0,0.0,0.0,1.0
1143,0.438672,0.516860,0.331353,0.699284,-0.869384,-1.597421,-0.469358,-0.729871,-0.878476,-0.368760,...,1.285161,-0.974465,-0.697173,-0.442264,0.0,1.0,0.0,0.0,0.0,0.0
241,1.437573,-0.363553,-0.149948,0.275516,0.364479,-1.436586,-0.151523,-0.004006,-1.172178,-0.019456,...,1.454338,-0.389838,0.076883,0.139695,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,-0.345128,0.111756,-0.678271,0.291190,0.264875,-0.298870,-0.113554,-0.079041,-0.350187,-0.239035,...,0.901038,0.924326,0.607427,-1.470072,0.0,0.0,0.0,0.0,0.0,1.0
25,0.204115,-1.023382,-0.383540,0.148752,0.228421,-0.928503,0.539898,-1.130774,0.058532,0.245917,...,-0.609860,-0.420694,-0.470939,-0.715793,1.0,0.0,0.0,0.0,0.0,0.0
1983,0.739164,-0.163803,-0.015124,0.803291,1.099536,0.266697,0.162955,-0.286600,-0.523420,0.523757,...,0.019834,-1.513483,-0.080587,-0.880310,0.0,0.0,0.0,1.0,0.0,0.0
1068,-0.274032,-0.057643,-0.719626,0.451183,0.724459,-0.561719,-0.104400,-0.246095,-0.341149,0.002154,...,0.869031,0.731986,0.607069,-1.539341,0.0,0.0,0.0,0.0,0.0,1.0


## Train LightGBM models

In [3]:
# Folder to save the models
model_dir = "lightgbm_models"
os.makedirs(model_dir, exist_ok=True)

# Prepare features and labels
X = data_df.iloc[:, :-6].values  # Assuming the last 6 columns are the labels
y = data_df.iloc[:, -6:].values  # Assuming the last 6 columns are the labels

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'max_depth': 6,
    'min_data_in_leaf': 20,
    'min_gain_to_split': 0.1
}

predictions_dict = {}

# Loop through each label
for label_index, label_name in tqdm(enumerate(data_df.columns[-6:]), desc="Training models", total=6):
    print(f"\nTraining model for label: {label_name}")
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X, label=y[:, label_index])
    
    # Train the LightGBM model
    model = lgb.train(params, train_data, num_boost_round=100)
    
    # Get predictions
    preds = model.predict(X)

    predictions_dict[label_name] = preds
    
    # Check if y_val contains more than one class
    if len(set(y[:, label_index])) > 1:
        # Calculate AUC score
        auc = roc_auc_score(y[:, label_index], preds)
        print(f"AUC for {label_name}: {auc}")
    else:
        print(f"Skipping AUC calculation for {label_name} due to only one class present in y_val.")
    
    safe_label_name = re.sub(r'[\/:*?"<>|]', '_', label_name)

    # Save the model
    model_filename = os.path.join(model_dir, f'model_{safe_label_name}.txt')
    model.save_model(model_filename)

Training models:  17%|█▋        | 1/6 [00:00<00:00,  7.42it/s]


Training model for label: Chief Officer
[LightGBM] [Info] Number of positive: 28, number of negative: 418
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017852 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114594
[LightGBM] [Info] Number of data points in the train set: 446, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062780 -> initscore=-2.703277
[LightGBM] [Info] Start training from score -2.703277
AUC for Chief Officer: 1.0

Training model for label: Director
[LightGBM] [Info] Number of positive: 97, number of negative: 349
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020529 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114594
[LightGBM] [Info] Number of data points in the train set: 446, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.

Training models:  50%|█████     | 3/6 [00:00<00:00,  5.06it/s]

AUC for Director: 1.0

Training model for label: Individual Contributor/Staff
[LightGBM] [Info] Number of positive: 219, number of negative: 227
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114594
[LightGBM] [Info] Number of data points in the train set: 446, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.491031 -> initscore=-0.035878
[LightGBM] [Info] Start training from score -0.035878
AUC for Individual Contributor/Staff: 1.0

Training model for label: Manager


Training models:  67%|██████▋   | 4/6 [00:00<00:00,  6.05it/s]

[LightGBM] [Info] Number of positive: 25, number of negative: 421
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114594
[LightGBM] [Info] Number of data points in the train set: 446, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.056054 -> initscore=-2.823757
[LightGBM] [Info] Start training from score -2.823757
AUC for Manager: 1.0

Training model for label: Owner
[LightGBM] [Info] Number of positive: 0, number of negative: 446
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114594
[LightGBM] [Info] Number of data points in the train set: 446, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] St

Training models: 100%|██████████| 6/6 [00:00<00:00,  6.29it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114594
[LightGBM] [Info] Number of data points in the train set: 446, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.179372 -> initscore=-1.520607
[LightGBM] [Info] Start training from score -1.520607
AUC for Vice President: 1.0





In [4]:
# Save lightgbm predictions to a DataFrame

lightgbm_preds_df = pd.DataFrame(predictions_dict)
lightgbm_preds_df

Unnamed: 0,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
0,0.000422,0.001505,0.996632,0.000377,1.000000e-15,0.001203
1,0.000422,0.001505,0.996632,0.000377,1.000000e-15,0.001203
2,0.000422,0.001505,0.003253,0.000377,1.000000e-15,0.994712
3,0.000422,0.994487,0.003253,0.000377,1.000000e-15,0.001203
4,0.000422,0.001532,0.996632,0.000377,1.000000e-15,0.001203
...,...,...,...,...,...,...
441,0.000422,0.001505,0.003253,0.000377,1.000000e-15,0.994712
442,0.994156,0.001505,0.003253,0.000462,1.000000e-15,0.001203
443,0.000422,0.001505,0.003253,0.994126,1.000000e-15,0.001203
444,0.000422,0.001831,0.003253,0.000377,1.000000e-15,0.994712


In [5]:
roberta_probs_df = pd.read_csv('test_predictions.csv', index_col='id')
roberta_probs_df

Unnamed: 0_level_0,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
680,0.040123,0.051085,0.952900,0.063829,0.044800,0.041578
156,0.039122,0.049021,0.947415,0.060235,0.042879,0.045037
177,0.075464,0.073186,0.085484,0.085020,0.080040,0.902297
1143,0.074483,0.918322,0.100298,0.069065,0.071296,0.076047
241,0.038620,0.054086,0.918931,0.060147,0.032665,0.040798
...,...,...,...,...,...,...
525,0.078300,0.081174,0.071168,0.081243,0.068423,0.898852
25,0.511038,0.053993,0.233327,0.212737,0.038421,0.154518
1983,0.160222,0.195489,0.120989,0.863343,0.113927,0.182420
1068,0.080485,0.056734,0.083758,0.074143,0.054537,0.852796


In [6]:
lightgbm_preds_df.index = roberta_probs_df.index

In [7]:
# Save lightgbm predictions to a file

lightgbm_preds_df.to_csv('lightgbm_predictions.csv')

## Combine Predictions

### Average ROC AUC scores

0.963732215132055 1 0

0.963746769901653 0.9 0.1

0.963746769901653 0.8 0.2

0.963746769901653 0.7 0.3

0.9637564730813849 0.5 0.5

0.9638729112381691 0.1 0.9

0.9645666885890067 0.01 0.99

0.9647163724883697 0.001 0.999 - The best values

0.8462990203745937 0 1

In [8]:
# Define weights
weight_roberta = .001
weight_lightgbm = .999

# Normalize weights
total_weight = weight_roberta + weight_lightgbm
weight_roberta /= total_weight
weight_lightgbm /= total_weight

In [9]:
# Define a small value to identify near-zero predictions
small_value = 1e-15

# Create a mask for LightGBM predictions that are essentially zero
lightgbm_mask = lightgbm_preds_df < small_value

In [10]:
# Apply mask to LightGBM predictions
# Replace LightGBM predictions with NaN where they are essentially zero
lightgbm_preds_df_adjusted = lightgbm_preds_df.where(~lightgbm_mask, other=None)

# Combine predictions by weighted average
# For cells with NaN in LightGBM predictions, only use RoBERTa predictions
combined_preds_df = (roberta_probs_df * weight_roberta + 
                      lightgbm_preds_df_adjusted * weight_lightgbm).fillna(roberta_probs_df)
combined_preds_df

Unnamed: 0_level_0,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
680,0.000462,0.001555,0.996588,0.000441,0.000045,0.001243
156,0.000461,0.001553,0.996583,0.000437,0.000043,0.001246
177,0.000497,0.001577,0.003335,0.000462,0.000080,0.994619
1143,0.000496,0.994411,0.003350,0.000446,0.000071,0.001277
241,0.000461,0.001584,0.996554,0.000437,0.000033,0.001242
...,...,...,...,...,...,...
525,0.000500,0.001585,0.003320,0.000458,0.000068,0.994616
25,0.993672,0.001557,0.003483,0.000674,0.000038,0.001356
1983,0.000582,0.001699,0.003370,0.993995,0.000114,0.001384
1068,0.000502,0.001886,0.003333,0.000451,0.000055,0.994570


In [11]:
# Save combined predictions to a file

combined_preds_df.to_csv('weighted_combined_predictions.csv')

In [12]:
combined_preds_df

Unnamed: 0_level_0,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
680,0.000462,0.001555,0.996588,0.000441,0.000045,0.001243
156,0.000461,0.001553,0.996583,0.000437,0.000043,0.001246
177,0.000497,0.001577,0.003335,0.000462,0.000080,0.994619
1143,0.000496,0.994411,0.003350,0.000446,0.000071,0.001277
241,0.000461,0.001584,0.996554,0.000437,0.000033,0.001242
...,...,...,...,...,...,...
525,0.000500,0.001585,0.003320,0.000458,0.000068,0.994616
25,0.993672,0.001557,0.003483,0.000674,0.000038,0.001356
1983,0.000582,0.001699,0.003370,0.993995,0.000114,0.001384
1068,0.000502,0.001886,0.003333,0.000451,0.000055,0.994570


In [13]:
import ast

# Load the test dataset with true labels

title_test = pd.read_csv('test_dataset.csv')

title_test['Labels'] = title_test['Labels'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

title_test

Unnamed: 0,Id,Title,Labels
0,0,Vice President / Director of Systems Engineering,"[0, 0, 0, 0, 0, 1]"
1,3,CTO/Executive Director of Technology Services,"[1, 1, 0, 0, 0, 0]"
2,6,"Chief Information Officer, Platform Services","[1, 0, 0, 0, 0, 0]"
3,8,Chief Information Systems Officer,"[1, 0, 0, 0, 0, 0]"
4,10,"Vice President, Chief Information Security Off...","[1, 0, 0, 0, 0, 0]"
...,...,...,...
441,2004,"Paraplanning, Operations Manager","[0, 0, 0, 1, 0, 0]"
442,2006,Group Finance Reporting Manager,"[0, 0, 0, 1, 0, 0]"
443,2012,Indirect Tax Technology Manager,"[0, 0, 0, 1, 0, 0]"
444,2016,Manager Manufacturing Engineering,"[0, 0, 0, 1, 0, 0]"


In [14]:
# Define the class labels
class_labels = ['Chief Officer', 'Director', 'Individual Contributor/Staff', 'Manager', 'Owner', 'Vice President']

# Convert 'Labels' back into separate columns
label_df = pd.DataFrame(title_test['Labels'].tolist(), columns=class_labels)

# Concatenate the original dataset with the new label columns
test_labels = pd.concat([title_test.drop(columns=['Labels']), label_df], axis=1)

# Drop unnecessary columns and set the index
test_labels.drop(columns=['Title'], inplace=True)
test_labels.set_index('Id', inplace=True)

# Display the updated dataset
test_labels


Unnamed: 0_level_0,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,0,0,0,1
3,1,1,0,0,0,0
6,1,0,0,0,0,0
8,1,0,0,0,0,0
10,1,0,0,0,0,0
...,...,...,...,...,...,...
2004,0,0,0,1,0,0
2006,0,0,0,1,0,0
2012,0,0,0,1,0,0
2016,0,0,0,1,0,0


In [15]:
combined_preds_df.sort_values(by='id', inplace=True)
combined_preds_df

Unnamed: 0_level_0,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.000490,0.001607,0.003317,0.000445,0.000067,0.994602
3,0.993796,0.992925,0.003302,0.000444,0.000064,0.001318
6,0.993939,0.001670,0.003358,0.000466,0.000068,0.001431
8,0.993909,0.001633,0.003367,0.000465,0.000063,0.001422
10,0.000594,0.001575,0.003304,0.000434,0.000057,0.994574
...,...,...,...,...,...,...
2004,0.000529,0.001610,0.003525,0.993960,0.000072,0.001305
2006,0.000526,0.001875,0.003343,0.993952,0.000094,0.001359
2012,0.000528,0.007036,0.003383,0.993976,0.000095,0.001350
2016,0.000476,0.007140,0.003361,0.993804,0.000051,0.001312


In [16]:
# Check the possible threshold values more than 4 labels with value more than 0.5

filtered_rows = combined_preds_df[(combined_preds_df > 0.5).sum(axis=1) > 4]
filtered_rows

# As you can see from the table under, there are no rows with more than 4 labels with a value greater than 0.5.

Unnamed: 0_level_0,Chief Officer,Director,Individual Contributor/Staff,Manager,Owner,Vice President
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


## ROC AUC Score

In [17]:
from sklearn.metrics import roc_auc_score

roc_auc_scores = {}
for role in test_labels.columns:
    y_true = test_labels[role]
    y_scores = combined_preds_df[role]
    roc_auc = roc_auc_score(y_true, y_scores)
    roc_auc_scores[role] = roc_auc

roc_auc_scores

{'Chief Officer': 0.9925893315723824,
 'Director': 0.9826507146391872,
 'Individual Contributor/Staff': 0.9917747611865259,
 'Manager': 0.917079207920792,
 'Owner': 0.952808988764045,
 'Vice President': 0.9513952308472856}

In [18]:
# Final evaluation score

avg_roc_auc = sum(roc_auc_scores.values()) / len(roc_auc_scores)
avg_roc_auc

0.9647163724883697