# HR Churn - Eindwerk Data Science 2023-24 - Raf Ledeganck

## Imports

<i>(Development on Tensorflow 2.11.0 Docker Container)</i>

In [1]:
!pip install scikit-learn
#!pip install scikit-image
!pip install pandas
!pip install seaborn
!pip install scikit-optimize
!pip install imbalanced-learn
!pip install scikeras
!pip install feature_engine

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 2.8 MB/s eta 0:00:01
Collecting joblib>=1.1.1
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 2.7 MB/s eta 0:00:01
[?25hCollecting scipy>=1.5.0
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 687 kB/s eta 0:00:01    |████████                        | 8.6 MB 3.9 MB/s eta 0:00:07     |█████████████▊                  | 14.8 MB 718 kB/s eta 0:00:28
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: joblib, scipy, threadpoolctl, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.3.2 scipy-1.10.1 threadpoolctl-3.5.0
You should consider upgrading via the '/usr/bin/python3 -m pip ins

Installing collected packages: scikeras
Successfully installed scikeras-0.12.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Collecting feature_engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[K     |████████████████████████████████| 328 kB 3.0 MB/s eta 0:00:01
Collecting statsmodels>=0.11.1
  Downloading statsmodels-0.14.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[K     |████████████████████████████████| 10.9 MB 3.5 MB/s eta 0:00:01    |██████████▍                     | 3.5 MB 697 kB/s eta 0:00:11     |████████████▋                   | 4.3 MB 697 kB/s eta 0:00:10
Collecting patsy>=0.5.4
  Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
[K     |████████████████████████████████| 233 kB 3.3 MB/s eta 0:00:01
Installing collected packages: patsy, statsmodels, feature-engine
Successfully installed feature-engine-1.6.2 patsy-0.5.6 statsmodels-0.14.1
You should consider upgrading vi

In [2]:
%matplotlib inline
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Sklearn
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
#from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import SVC # Support Vector Classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [4]:
# Scipy
from scipy import stats

In [5]:
# Tensorflow, Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import SGD, Adam, Adagrad
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import BinaryCrossentropy, BinaryFocalCrossentropy
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

In [6]:
from scikeras.wrappers import KerasClassifier

In [7]:
from feature_engine.selection import DropConstantFeatures

In [8]:
#import numpy as np
#import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
#import sklearn
#from sklearn.compose import ColumnTransformer
#from sklearn.preprocessing import OneHotEncoder, StandardScaler
#from sklearn.decomposition import PCA
#from sklearn.model_selection import GridSearchCV
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [9]:
from IPython.display import display_html 

## Functions

In [10]:
def fill_NaN_via_dupl(df, nan_cols, dup_colset):
    """
    1) Checks whether a row with a NaN value has duplicates by comparing it with similar rows.
    2) If all duplicate rows share the same value for the column with NaN, then the NaN is replaced with this value.

    Parameters
    ----------
    df: DataFrame
        Dataframe in which NaN values are to be replaced.
    nan_cols: list
        List of columns in which we want to replace the NaN
        Example: ['EMPLOYEE_CLASS', 'EMPLOYEE_LEVEL']
    dup_colset: list
        List of columns used to derive a fill value for the NaN
        If a row has a NaN value in a specified column, the function checks for similar rows by comparing the values in this
        column set.  If all columns in this set have an identical value to the row with NaN, then the rows are considered
        'similar'.
        Example: ['EMPLOYEE_TYPE', 'DEPARTMENT', 'JOB', 'COMPANY', 'SITE', 'PERMANENT', 'EMPLOYEE_CLASS', 'EMPLOYEE_LEVEL']

    Return
    ------
    df_stats: DataFrame
        Statistics on the performance of the function
        - 'col': column name
        - 'nan': no. of lines with NaN
        - 'fill': no. of NaN replaced with value from similar lines
        - 'mult': no. of NaN lines not filled because similar lines have multiple values and look-up is inconclusive
        - 'nodup': no. of NaN lines not filled because there are no similar lines to derive a replacement value from

    """

    col_lst, nan_lst, fill_lst, mult_lst, nodup_lst = list(), list(), list(), list(), list()
    
    # Function assumes NAN_COLS is a subset of DUP_COLSET.  If this is not the case, it is enforced here.
    dup_colset = dup_colset + nan_cols
    dup_colset = np.unique(dup_colset).tolist()

    # Look-up takes 10 columns as input.  If less than 10 columns were given, add empty ones.
    df_subset = df[dup_colset].copy()
    i = len(df_subset.columns)
    if i > 11:
        raise Exception("Max. 10 columns can be used to find similar rows.")
    while i < 11:
        i += 1
        col_name = 'EMPTY' + str(i)
        df_subset[col_name] = "0"
        dup_colset.append(col_name)
        
    for col in nan_cols:
        count_nan = 0
        count_fill = 0
        count_mult = 0
        count_nodup = 0
        dup_cols = dup_colset.copy()
        dup_cols.remove(col)

        # Build a Series which indicates for each line in DF whether it has a duplicate row 
        # considering all columns except COL
        ser_dup = df_subset.duplicated(subset=dup_cols, keep=False)

        # Build a list with the index of all rows in DF that have NaN for COL
        lst_na = df_subset.loc[pd.isna(df[col]), :].index

        for i in lst_na:
            count_nan += 1

            # For each row where COL == NaN, check if it has a duplicate
            if ser_dup.iloc[i] == True:

                # Row has duplicate.
                # SEARCH_LINE is the row for which we aim to remove the NaN
                search_line = df_subset.iloc[i, :]

                # Build dataframe with all duplicate lines for SEARCH_LINE ignoring COL
                df_dup = df_subset.loc[(df_subset[dup_cols[0]] == search_line[dup_cols[0]]) &   
                                       (df_subset[dup_cols[1]] == search_line[dup_cols[1]]) &
                                       (df_subset[dup_cols[2]] == search_line[dup_cols[2]]) &
                                       (df_subset[dup_cols[3]] == search_line[dup_cols[3]]) &
                                       (df_subset[dup_cols[4]] == search_line[dup_cols[4]]) &
                                       (df_subset[dup_cols[5]] == search_line[dup_cols[5]]) &
                                       (df_subset[dup_cols[6]] == search_line[dup_cols[6]]) &
                                       (df_subset[dup_cols[7]] == search_line[dup_cols[7]]) &
                                       (df_subset[dup_cols[8]] == search_line[dup_cols[8]]) &
                                       (df_subset[dup_cols[9]] == search_line[dup_cols[9]])
                                        ]

                # Check if all lines in DF_DUP have same value in COL, if so we assume we can replace NaN with this value
                if len(df_dup[col].value_counts()) == 1:
                    # All duplicate lines have the same value for the NaN field so we replace NaN with this value
                    df.iloc[i, df.columns.tolist().index(col)] = \
                                      df_dup[df_dup[col].notna()].iloc[0, df_subset.columns.tolist().index(col)]
                    count_fill += 1
                else:
                    # Multiple values for the NaN field, result is inconclusive, NaN is left
                    count_mult += 1     

            else:
                # No duplicate row for SEARCH_LINE, NaN is left
                count_nodup += 1

        col_lst.append(col)
        nan_lst.append(count_nan)
        fill_lst.append(count_fill)
        mult_lst.append(count_mult)
        nodup_lst.append(count_nodup)
    
    stats = {'col' : col_lst,
             'nan' : nan_lst,
             'fill' : fill_lst,
             'mult' : mult_lst,
             'nodup' : nodup_lst
            }
    return pd.DataFrame.from_dict(stats)


In [11]:
def agg_feat_imp(df_feat_imp):

    df_abs = df_feat_imp.abs().copy()

    # Calculate total correlation per feature rather than by OneHotEncoder split
    lst_row= ['MANAGER', 'DEPARTMENT', 'DPT_CHANGE_FLAG', 'COMPANY', 'SITE', 'FULL_TIME', 'EMPLOYEE_LEVEL', 'HANDICAP', 'CITIZENSHIP']
    for row in lst_row:
        prefix = 'cat__' + row

        # Add row with total correlation
        df_abs.loc[row] = df_abs.filter(regex=prefix, axis='index').sum().values[0]

        # Remove rows with correlations for individual OneHotEncoder splits
        df_abs.drop(list(df_abs[df_abs.index.str.startswith(prefix)].index), axis='index', inplace=True)

    # Remove prefixes from row names
    dict_rownames = {}
    for row in df_abs.index:
        if row.find('__') > 0:
            dict_rownames[row] = row[ (row.find('__')+2) : ]

    df_abs.rename(index=dict_rownames, inplace=True)

    return(df_abs)

In [12]:
def plot_feat_imp(df_feat_imp, export=False, file_name='plot.jpg', show=True):

    # Sort from high to low importance
    df_feat_imp.sort_values('Importance', ascending=False, inplace=True)

    # Generate plot
    fig, ax = plt.subplots(figsize=(12,6),dpi=100)
    sns.barplot(df_feat_imp, x=df_feat_imp.index, y=df_feat_imp['Importance'])
    plt.xticks(rotation=90)

    # Export plot
    if export:
        plt.savefig(file_name, bbox_inches='tight')

    # Print plot
    if show:
        plt.show()
    
    plt.close(fig);

## Data

In [13]:
df_train_raw = pd.read_csv("Data/Employee_Churn_train.csv", sep=';')
#df_train_raw = pd.read_csv('Employee_Churn_train.csv', sep=';')
df_train_raw.head()

Unnamed: 0,Employee ID,NAME,FIRST_NAME,GENDER,MANAGER,EMPLOYEE_TYPE,DEPARTMENT,DPT_CHANGE_FLAG,JOB,STATUS,...,EVOLUTION_BONUS_LANGUAGE,EVOLUTION_BONUS_MISC,EVOLUTION_BONUS_OBJECTIVE,EVOLUTION_BONUS_SHARING,EVOLUTION_BONUS_TECHNICAL,EVOLUTION_BONUS_TOTAL,EVOLUTION_BONUS_UNEXPECTED,EVOLUTION_BONUS_WELCOME,EVOLUTION_BONUS_YIELD,Target_Churn
0,33675,Reverdy,ELEANA,F,212.0,EMP,1135,R,CLIENT AVISOR,A,...,,,0.668724,,,0.371666,,,0.300012,0
1,35535,Ravet,CHRISTIAN,M,11780.0,EMP,1332,R,CLIENT AVISOR,A,...,,,0.91545,,,0.644679,,,0.624565,0
2,35826,Mandon,ADEL,M,36149.0,EMP,2055,R,CLIENT AVISOR,A,...,,,1.174547,,,1.016536,,,,0
3,35352,Ch,MOHAMMED,F,2309.0,EMP,1802,R,CLIENT AVISOR,A,...,,,1.852381,,,1.852381,,,,1
4,35997,Grand,JASSIM,M,22035.0,EMP,2055,R,CLIENT AVISOR,A,...,,,0.0,,,0.001837,,,,0


### Data Exploration

In [14]:
df_train_raw.describe()

Unnamed: 0,Employee ID,MANAGER,DEPARTMENT,COMPANY,AGE,CONTRACT_TENURE,EMPLOYEE_TENURE,SUM_BONUS_UNEXPECTED_3Mago,SUM_BONUS_WELCOME_3Mago,SUM_BONUS_CHALLENGE_3Mago,...,EVOLUTION_BONUS_LANGUAGE,EVOLUTION_BONUS_MISC,EVOLUTION_BONUS_OBJECTIVE,EVOLUTION_BONUS_SHARING,EVOLUTION_BONUS_TECHNICAL,EVOLUTION_BONUS_TOTAL,EVOLUTION_BONUS_UNEXPECTED,EVOLUTION_BONUS_WELCOME,EVOLUTION_BONUS_YIELD,Target_Churn
count,4741.0,4710.0,4741.0,4741.0,4741.0,4741.0,4741.0,3901.0,3901.0,3901.0,...,160.0,0.0,1891.0,20.0,1.0,3411.0,22.0,11.0,1935.0,4741.0
mean,34434.665893,13691.126115,1575.31196,103.156929,30.519933,1.045349,3.669479,14.743655,2.81979,216.859282,...,1.036458,,3.561783,0.128553,0.0,2.717623,1.505207,0.0,1.442746,0.136891
std,16220.648895,12053.696041,300.026668,2.346073,7.363317,0.234768,2.777664,207.441647,53.03357,745.333554,...,0.443003,,29.126128,0.278075,,21.613516,1.346645,0.0,1.844412,0.343768
min,3.0,80.0,1025.0,100.0,18.0,1.0,1.0,0.0,0.0,0.0,...,0.0,,0.0,0.0,0.0,0.0,0.238095,0.0,0.0,0.0
25%,22309.0,2635.0,1266.0,102.0,25.0,1.0,1.0,0.0,0.0,0.0,...,1.0,,0.58857,0.0,0.0,0.770377,0.997499,0.0,0.729704,0.0
50%,38751.0,10284.0,1567.0,102.0,29.0,1.0,3.0,0.0,0.0,0.0,...,1.0,,1.0,0.0,0.0,1.055409,1.018913,0.0,1.022007,0.0
75%,47514.0,23807.0,1884.0,105.0,34.0,1.0,5.0,0.0,0.0,0.0,...,1.0,,1.744527,0.0,0.0,1.649027,1.3,0.0,1.561656,0.0
max,57145.0,56574.0,2055.0,108.0,61.0,4.0,13.0,4200.0,1000.0,8937.0,...,3.0,,1074.187817,0.844444,0.0,1074.187817,6.0,0.0,36.660723,1.0


In [15]:
df_train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4741 entries, 0 to 4740
Data columns (total 55 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Employee ID                 4741 non-null   int64  
 1   NAME                        4741 non-null   object 
 2   FIRST_NAME                  4741 non-null   object 
 3   GENDER                      4741 non-null   object 
 4   MANAGER                     4710 non-null   float64
 5   EMPLOYEE_TYPE               4741 non-null   object 
 6   DEPARTMENT                  4741 non-null   int64  
 7   DPT_CHANGE_FLAG             4741 non-null   object 
 8   JOB                         4741 non-null   object 
 9   STATUS                      4741 non-null   object 
 10  COMPANY                     4741 non-null   int64  
 11  SITE                        4741 non-null   object 
 12  PERMANENT                   4741 non-null   object 
 13  EMPLOYEE_CLASS              4621 

#### Distinct values per (non-numeric) column

In [16]:
for col in df_train_raw.columns:
    if not df_train_raw[col].dtype.kind in 'iuf' and not col=='NAME' and not col=='FIRST_NAME':
        print('Distinct values in', col, ':', df_train_raw[col].unique())

Distinct values in GENDER : ['F' 'M']
Distinct values in EMPLOYEE_TYPE : ['EMP']
Distinct values in DPT_CHANGE_FLAG : ['R' 'E' 'C']
Distinct values in JOB : ['CLIENT AVISOR']
Distinct values in STATUS : ['A']
Distinct values in SITE : ['LYON7' 'BORDEAUX' 'PARIS3' 'LYON2' 'LYON6' 'NANTES3' 'STRASBOURG' 'NICE'
 'PARIS' 'PARIS 2' 'LYON1' 'PARIS P 1' 'LYON8' 'NANTES4' 'NANTES2'
 'NANTES1' 'BORDEAUX 2']
Distinct values in PERMANENT : ['R' 'E']
Distinct values in EMPLOYEE_CLASS : ['INT' nan 'IMP']
Distinct values in FULL_TIME : ['F' 'E' 'P']
Distinct values in EMPLOYEE_LEVEL : ['C' 'A' 'B' 'T' nan]
Distinct values in HANDICAP : [nan 'MOTL' 'MOTD' 'VISU']
Distinct values in CITIZENSHIP : ['FRA' 'SWE' 'BEL' 'DEU' 'PRT' 'IRL' 'SRB' 'TUN' 'LUX' 'NDL' 'MAR' 'ESP'
 'GBR' 'CZE' nan]


### Data Transformation

In [17]:
df_train = df_train_raw.copy()

# Data cleaning

## Remove NaN

In [18]:
# Check for NaN
df_train.isna().sum()

Employee ID                      0
NAME                             0
FIRST_NAME                       0
GENDER                           0
MANAGER                         31
EMPLOYEE_TYPE                    0
DEPARTMENT                       0
DPT_CHANGE_FLAG                  0
JOB                              0
STATUS                           0
COMPANY                          0
SITE                             0
PERMANENT                        0
EMPLOYEE_CLASS                 120
FULL_TIME                        0
EMPLOYEE_LEVEL                   1
HANDICAP                      4729
CITIZENSHIP                      2
AGE                              0
CONTRACT_TENURE                  0
EMPLOYEE_TENURE                  0
SUM_BONUS_UNEXPECTED_3Mago     840
SUM_BONUS_WELCOME_3Mago        840
SUM_BONUS_CHALLENGE_3Mago      840
SUM_BONUS_MISC_3Mago           840
SUM_BONUS_EXC_3Mago            840
SUM_BONUS_LANGUAGE_3Mago       842
SUM_BONUS_SHARING_3Mago        840
SUM_BONUS_OBJECTIVE_

In [19]:
# Naming convention: replace NaN with -1 in columns that are not meant to be numeric
df_train['MANAGER'].fillna(-1, inplace=True)
df_train['DEPARTMENT'].fillna(-1, inplace=True)
df_train['COMPANY'].fillna(-1, inplace=True)

In [20]:
# Type conversions
df_train['Employee ID'] = df_train['Employee ID'].astype('str')
df_train['MANAGER'] = df_train['MANAGER'].astype('int').astype('str')
df_train['DEPARTMENT'] = df_train['DEPARTMENT'].astype('int').astype('str')
df_train['COMPANY'] = df_train['COMPANY'].astype('int').astype('str')

<i>
<b>Data exploration</b><br>
Check occurrence of values for a feature to check how evenly distributed they are.
</i>

In [21]:
# Check occurrence of each value in columns with NaN
nan_cols = ['EMPLOYEE_CLASS', 'EMPLOYEE_LEVEL', 'CITIZENSHIP', 'MANAGER']

for col in nan_cols:
    print("Value occurrences in", col, ":")
    print(df_train_raw[col].value_counts())
    print("\n")


Value occurrences in EMPLOYEE_CLASS :
EMPLOYEE_CLASS
INT    4618
IMP       3
Name: count, dtype: int64


Value occurrences in EMPLOYEE_LEVEL :
EMPLOYEE_LEVEL
C    2034
A    1642
B    1059
T       5
Name: count, dtype: int64


Value occurrences in CITIZENSHIP :
CITIZENSHIP
FRA    4620
BEL      59
DEU      22
IRL      14
SWE       5
LUX       5
ESP       5
PRT       2
NDL       2
SRB       1
TUN       1
MAR       1
GBR       1
CZE       1
Name: count, dtype: int64


Value occurrences in MANAGER :
MANAGER
7376.0     71
32204.0    44
2139.0     40
27370.0    31
31874.0    31
           ..
2581.0      1
28997.0     1
34310.0     1
20857.0     1
1537.0      1
Name: count, Length: 384, dtype: int64




# Build dataframe with only master data
df_masterdata = df_train_raw.copy()

# Remove master data columns not relevant when searching for duplicates
df_masterdata.drop(['Employee ID', 'NAME', 'FIRST_NAME', 'MANAGER', 'DEPARTMENT', 'Target_Churn'], axis='columns', inplace=True)

# Remove 'bonus' columns, also not relevant when searching for duplicates
for col in df_masterdata.columns:
    if 'BONUS' in col:
        df_masterdata.drop(col, axis='columns', inplace=True)

for col in nan_cols:
    df_corr = df_masterdata.copy()

    # Drop lines with NaN
    df_corr.dropna(subset=col, inplace=True)
    
    # Dummies
    df_corr = pd.get_dummies(df_corr, columns=df_corr.columns,  drop_first=True)

    # Correlation matrix
    df_corr = df_corr.corr()

    for corr in df_corr.columns.values.tolist():
        if col in corr:
            idx = df_corr.columns.values.tolist().index(corr)
            print(col, corr, "index=", idx)
            corr_col = df_corr.iloc[idx, :]
#            print(corr_col)
            print(df_corr.nlargest(5, corr).iloc[:, idx])
            print(df_corr.nsmallest(5, corr).iloc[:, idx])


# Heatmap indicates that age, contract tenure and employee tenure have low correlations
for col in df_masterdata.columns:
    if 'AGE' in col or 'TENURE' in col:
        df_masterdata.drop(col, axis='columns', inplace=True)

In [22]:
df_backup = df_train.copy()
#df_train = df_backup.copy()

In [23]:
print("Value counts before clean-up:")
print(df_train['MANAGER'].value_counts(dropna=False), "\n")

dup_colset = ['DEPARTMENT', 'COMPANY', 'SITE']
nan_cols = ['MANAGER']

df_stats = fill_NaN_via_dupl(df_train, nan_cols, dup_colset)

print(df_stats, "\n")

print("Value counts after clean-up:")
print(df_train['MANAGER'].value_counts(dropna=False), "\n")

Value counts before clean-up:
MANAGER
7376     71
32204    44
2139     40
31874    31
-1       31
         ..
2581      1
28997     1
34310     1
20857     1
1537      1
Name: count, Length: 385, dtype: int64 

       col  nan  fill  mult  nodup
0  MANAGER    0     0     0      0 

Value counts after clean-up:
MANAGER
7376     71
32204    44
2139     40
31874    31
-1       31
         ..
2581      1
28997     1
34310     1
20857     1
1537      1
Name: count, Length: 385, dtype: int64 



In [24]:
print("Value counts before clean-up:")
print(df_train['EMPLOYEE_CLASS'].value_counts(dropna=False), "\n")
print(df_train['EMPLOYEE_LEVEL'].value_counts(dropna=False), "\n")

dup_colset = ['EMPLOYEE_TYPE', 'DEPARTMENT', 'JOB', 'COMPANY', 'SITE', 'PERMANENT', 'EMPLOYEE_CLASS', 'EMPLOYEE_LEVEL']
nan_cols = ['EMPLOYEE_CLASS', 'EMPLOYEE_LEVEL']

df_stats = fill_NaN_via_dupl(df_train, nan_cols, dup_colset)

print(df_stats, "\n")

print("Value counts after clean-up:")
print(df_train['EMPLOYEE_CLASS'].value_counts(dropna=False), "\n")
print(df_train['EMPLOYEE_LEVEL'].value_counts(dropna=False))

Value counts before clean-up:
EMPLOYEE_CLASS
INT    4618
NaN     120
IMP       3
Name: count, dtype: int64 

EMPLOYEE_LEVEL
C      2034
A      1642
B      1059
T         5
NaN       1
Name: count, dtype: int64 

              col  nan  fill  mult  nodup
0  EMPLOYEE_CLASS  120   115     1      4
1  EMPLOYEE_LEVEL    1     0     1      0 

Value counts after clean-up:
EMPLOYEE_CLASS
INT    4733
NaN       5
IMP       3
Name: count, dtype: int64 

EMPLOYEE_LEVEL
C      2034
A      1642
B      1059
T         5
NaN       1
Name: count, dtype: int64


# List of columns we want to use to derive a fill value for the NaN
dup_colset = ['DEPARTMENT', 'COMPANY', 'SITE', 'MANAGER']
# List of columns in which we want to replace the NaN
nan_cols = ['MANAGER']

df_subset = df_train[dup_colset].copy()
i = len(df_subset.columns)
while i < 10:
    i += 1
    col_name = 'EMPTY' + str(i)
    print(col_name)
    df_subset[col_name] = "0"
    
for col in nan_cols:
    count_fill = 0
    count_left = 0
    count_false = 0
    count_all = 0
    print("Value counts before clean-up:")
    print(df_train[col].value_counts(dropna=False))
    dup_cols = dup_colset.copy()
    dup_cols.remove(col)
    
    # Build a Series which indicates for each line in DF_TRAIN whether it has a duplicate row 
    # considering all columns except COL
    ser_dup = df_subset.duplicated(subset=dup_cols, keep=False)

    # Build a list with the index of all rows in DF_TRAIN that have NaN for COL
    lst_na = df_subset.loc[pd.isna(df_train[col]), :].index

    for i in lst_na:
        count_all += 1
    
        # For each row where COL == NaN, check if it has a duplicate
        if ser_dup.iloc[i] == True:
            
            # Row has duplicate.
            # SEARCH_LINE is the row for which we aim to remove the NaN
            search_line = df_subset.iloc[i, :]

            # Build dataframe with all duplicate lines for SEARCH_LINE ignoring COL
            df_dup = df_subset.loc[(df_subset[dup_cols[0]] == search_line[dup_cols[0]]) &   
                                   (df_subset[dup_cols[1]] == search_line[dup_cols[1]]) &
                                   (df_subset[dup_cols[2]] == search_line[dup_cols[2]]) &
                                   (df_subset[dup_cols[3]] == search_line[dup_cols[3]]) &
                                   (df_subset[dup_cols[4]] == search_line[dup_cols[4]]) &
                                   (df_subset[dup_cols[5]] == search_line[dup_cols[5]]) &
                                   (df_subset[dup_cols[6]] == search_line[dup_cols[6]])
                                    ]

            # Check if all lines in DF_DUP have same value in COL, if so we assume we can replace NaN with this value
            if len(df_dup[col].value_counts()) == 1:
                # All duplicate lines have the same value for the NaN field so we replace NaN with this value
#                 df_train.iloc[i, df_subset.columns.tolist().index(col)] = \
#                                   df_dup[df_dup[col].notna()].iloc[0, df_train.columns.tolist().index(col)]
                count_fill += 1
            else:
                # Multiple values for the NaN field, result is inconclusive, NaN is left
                count_left += 1     

        else:
            # No duplicate row for SEARCH_LINE, NaN is left
            count_false += 1

    print("\nValue counts after clean-up:")
    print(df_train[col].value_counts(dropna=False))
    print("Total NaN:", count_all, ", NaN replaced:", count_fill, \
          ", Inconclusive (NaN left):", count_left, ", No duplicate row (NaN left):", count_false, ".")
    print("\n")


In [25]:
# Check for NaN
df_train.isna().sum()

Employee ID                      0
NAME                             0
FIRST_NAME                       0
GENDER                           0
MANAGER                          0
EMPLOYEE_TYPE                    0
DEPARTMENT                       0
DPT_CHANGE_FLAG                  0
JOB                              0
STATUS                           0
COMPANY                          0
SITE                             0
PERMANENT                        0
EMPLOYEE_CLASS                   5
FULL_TIME                        0
EMPLOYEE_LEVEL                   1
HANDICAP                      4729
CITIZENSHIP                      2
AGE                              0
CONTRACT_TENURE                  0
EMPLOYEE_TENURE                  0
SUM_BONUS_UNEXPECTED_3Mago     840
SUM_BONUS_WELCOME_3Mago        840
SUM_BONUS_CHALLENGE_3Mago      840
SUM_BONUS_MISC_3Mago           840
SUM_BONUS_EXC_3Mago            840
SUM_BONUS_LANGUAGE_3Mago       842
SUM_BONUS_SHARING_3Mago        840
SUM_BONUS_OBJECTIVE_

In [26]:
# Handle NaN in master data
start_point = len(df_train)
df_train['HANDICAP'].fillna('NONE', inplace=True)
df_train.dropna(subset=['EMPLOYEE_CLASS', 'EMPLOYEE_LEVEL', 'CITIZENSHIP'], inplace=True)
print(round((1 - (len(df_train) / start_point)) * 100, 2), "% of lines dropped.")
#df_train.isna().sum()

0.17 % of lines dropped.


At this stage there are no more NaN among the master data.  Now the bonuses...

In [27]:
# NaN for a bonus corresponds with no bonus, i.e. = 0.
for col in df_train.columns:
    if 'BONUS' in col:
        df_train[col].fillna(value=0, inplace=True)

## Data redundancy

In [None]:
# Find redundant features
DCF = DropConstantFeatures(tol=0.99)   # Feature is considered constant if 99% of values are identical
DCF.fit(df_train)

In [None]:
DCF.features_to_drop_

***

# VANAF HIER VERDER WERKEN

***

## Check correlations

In [None]:
# Heatmap indicates that age, contract tenure and employee tenure have low correlations
for col in df_masterdata.columns:
    if 'AGE' in col or 'TENURE' in col:
        df_masterdata.drop(col, axis='columns', inplace=True)

In [None]:
#    # Correlation heatmap
#    fig, axes = plt.subplots(figsize=(20,3))
#    sns.heatmap(df_corr.corr(), annot=True)
    
#plt.show();

## Check which columns are relevant

### Data transformation

In [None]:
amt_feat = []
cat_feat = []
num_feat = []

for col in df_train.columns:
    if 'BONUS' in col:
        amt_feat.append(col)
    if not df_train[col].dtype.kind in 'iuf':
        df_train[col] = df_train[col].astype('category')
        cat_feat.append(col)
    if col=='AGE' or col=='CONTRACT_TENURE' or col=='EMPLOYEE_TENURE':
        num_feat.append(col)

#col_trans = ColumnTransformer(transformers=[
#    ('categories', OneHotEncoder(dtype='int', handle_unknown='error'), cat_feat),
##    ('scaler', StandardScaler(), amt_feat)
#    ])

#col_trans.fit_transform(df_train)
df_train.info()



In [None]:
X = df_train.drop(['Target_Churn', 'Employee ID', 'NAME', 'FIRST_NAME'], axis='columns')
y = df_train['Target_Churn']

cat_feat_2 = cat_feat.copy()
cat_feat_2.remove('Employee ID')
cat_feat_2.remove('NAME')
cat_feat_2.remove('FIRST_NAME')

scaler = StandardScaler()

col_trans = ColumnTransformer(
    [
        ('num', StandardScaler(), num_feat),
        ('cat', OneHotEncoder(drop='if_binary', sparse=False, dtype=np.intc), cat_feat_2),
    ], remainder='passthrough'   # By default, all un-transformed columns are dropped.
)

scaled_X = col_trans.fit_transform(X)

#scaled_X = scaler.fit_transform(X)

In [None]:
X.head()

In [None]:
#col_trans.get_feature_names_out()

### Principal Components

In [None]:
pca = PCA()   # PCA on all features
principal_components = pca.fit_transform(scaled_X)

In [None]:
explained_variance = []

for n in range(1,11):
    pca = PCA(n_components=n)
    pca.fit(scaled_X)
    
    explained_variance.append(np.sum(pca.explained_variance_ratio_))

plt.plot(range(1,11), explained_variance)
plt.xlabel("Number of Components")
plt.ylabel("Variance Explained");

In [None]:
#pca.n_components_
#pca.components_.shape

In [None]:
# Relatie tussen PCA en features:
pca_comp = pd.DataFrame(pca.components_, index=['PC1','PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10'])
pca_comp.columns = col_trans.get_feature_names_out()   # Replace generated column headers with actual feature names
pca_comp.head()

#plt.figure(figsize=(20,3),dpi=100)
#sns.heatmap(pca_comp,annot=True)

In [None]:
# Explained variance mee in rekening brengen bij correlatie van de features
display(pca.explained_variance_ratio_)

pca_w = pca_comp.mul((pca.explained_variance_ratio_), axis='index').copy()

## Normaliseren
#pca_w = pca_w / pca_w.sum().abs().sum()   # Totaal van alle PCA correlaties op 1 brengen


In [None]:
pca_wabs = pca_w.abs().copy()

# Calculate total correlation per feature rather than by OneHotEncoder split
lst_cum = ['MANAGER', 'DEPARTMENT', 'DPT_CHANGE_FLAG', 'COMPANY', 'SITE', 'FULL_TIME', 'EMPLOYEE_LEVEL', 'HANDICAP', 'CITIZENSHIP']
for col in lst_cum:
    prefix = 'cat__' + col

    # Add column with total correlation
    pca_wabs[col] = pca_wabs.filter(regex=prefix, axis='columns').sum(axis='columns')

    # Remove columns with correlations for individual OneHotEncoder splits
    fltr = filter(lambda x: x.startswith(prefix), pca_wabs.columns)
    pca_wabs.drop(list(fltr), axis='columns', inplace=True)

# Remove prefixes from column names
dict_colnames = {}
for i, col in enumerate(pca_wabs.columns):
    if col.find('__') > 0:
        dict_colnames[col] = col[ (col.find('__')+2) : ]

pca_wabs.rename(columns=dict_colnames, inplace=True)

display(pca_wabs.head())


In [None]:
# DF opbouwen met gecumuleerde feature importances over de 10 PC's
pca_feat_imp = pd.DataFrame(data={'Importance' : pca_w.sum()}, index=col_trans.get_feature_names_out())

# Features aggregeren (undo OneHotEncoding)
pca_feat_imp = agg_feat_imp(pca_feat_imp)

# Features plotten volgens afnemend belang
plot_feat_imp(pca_feat_imp, export=True, file_name='Output/FeatureImportance_PrincipalComponents.jpg')

In [None]:
# Features sorteren volgens afnemend gecumuleerd belang over de 10 PC
pca_feat_imp = pd.DataFrame(data={'Importance' : pca_wabs.sum()}, index=pca_wabs.columns)
pca_feat_imp.sort_values('Importance', ascending=False, inplace=True)

plt.figure(figsize=(12,6),dpi=100)
sns.barplot(pca_feat_imp, x=pca_feat_imp.index, y=pca_feat_imp['Importance'])
plt.xticks(rotation=90);

### Random Forest

In [None]:
switch_rfc = False

if switch_rfc:
    n_estimators=[100, 128, 15, 200, 250, 300, 350]
    max_features= [2, 4, 6, 8, 10]
    bootstrap = [True]
    oob_score = [True]   # oob_score heeft geen impact op performantie, geeft enkel idee van accuraatheid van model
    # Best params na 1e run: {'bootstrap': True, 'max_features': 8, 'n_estimators': 128, 'oob_score': True}

    param_grid = {'n_estimators':n_estimators,
                'max_features':max_features,
                'bootstrap':bootstrap,
                'oob_score':oob_score}  # oob_score is enkel zinvol indien bootstrap=True anders zal de fit failen

    rfc = RandomForestClassifier()
    grid = GridSearchCV(rfc,param_grid)
    grid.fit(scaled_X, y)

    display(grid.best_params_)   # parameters beste estimator

In [None]:
# aanmaken beste model met oob_score:
rfc = RandomForestClassifier(max_features=8, n_estimators=300, oob_score=True)
rfc.fit(scaled_X, y)
rfc.oob_score_   # oob_score is tegenovergestelde van OOB error !

In [None]:
# DF opbouwen met feature importances
rfc_feat_imp = pd.DataFrame(data={'Importance' : rfc.feature_importances_}, index=col_trans.get_feature_names_out())

# Features aggregeren (undo OneHotEncoding)
rfc_feat_imp = agg_feat_imp(rfc_feat_imp)

# Features plotten volgens afnemend belang
plot_feat_imp(rfc_feat_imp, export=True, file_name='Output/FeatureImportance_RandomForest.jpg')

### Gradient boosting

In [None]:
switch_gb = False
if switch_gb:
    n_est = [80, 100, 150, 200, 300]
    max_f = [6, 8, 10]
    max_d = [4, 5, 6]
    l_rate = [0.02, 0.05, 0.1]
    # Best params na 1e run: {'learning_rate': 0.05, 'max_depth': 5, 'max_features': 8, 'n_estimators': 100}
    # Best params na 2e run: {'learning_rate': 0.05, 'max_depth': 4, 'max_features': 6, 'n_estimators': 200}

    param_grid = {'n_estimators': n_est,
                'max_features': max_f,
                'max_depth': max_d,
                'learning_rate': l_rate}

    gbc = GradientBoostingClassifier()
    grid = GridSearchCV(gbc, param_grid)
    grid.fit(scaled_X, y)

    display(grid.best_params_)   # parameters beste estimator

In [None]:
gbc = GradientBoostingClassifier(learning_rate= 0.05, max_depth= 5, max_features= 8, n_estimators= 200)
gbc.fit(scaled_X, y)

In [None]:
# DF opbouwen met feature importances
gbc_feat_imp = pd.DataFrame(data=gbc.feature_importances_, index=col_trans.get_feature_names_out(), columns=['Importance'])

# Features aggregeren (undo OneHotEncoding)
gbc_feat_imp = agg_feat_imp(gbc_feat_imp)

# Features plotten volgens afnemend belang
plot_feat_imp(gbc_feat_imp, export=True, file_name='Output/FeatureImportance_GradientBoosting.jpg')

### Check correlation between bonuses

In [None]:
# Create separate dataframes with bonuses
df_3m_bonus = pd.DataFrame()
df_sumbonus = pd.DataFrame()
df_evobonus = pd.DataFrame()

for col in amt_feat:
    if 'SUM_BONUS' in col:
        if '3Mago' in col:
            df_3m_bonus[col] = df_train[col]
        else:
            df_sumbonus[col] = df_train[col]
    if 'EVOLUTION' in col:
        df_evobonus[col] = df_train[col]


In [None]:
# Check for bonus columns with only values = 0 & remove them
switch_del_col_0 = False

if switch_del_col_0:
    print(len(df_3m_bonus.columns))
    zero_cols = df_3m_bonus.columns[(df_3m_bonus == 0).all()]
    df_3m_bonus.drop(labels=zero_cols, axis=1, inplace=True)
    print(len(df_3m_bonus.columns))

    print(len(df_sumbonus.columns))
    zero_cols = df_sumbonus.columns[(df_sumbonus == 0).all()]
    df_sumbonus.drop(labels=zero_cols, axis=1, inplace=True)
    print(len(df_sumbonus.columns))

    print(len(df_evobonus.columns))
    zero_cols = df_evobonus.columns[(df_evobonus == 0).all()]
    df_evobonus.drop(labels=zero_cols, axis=1, inplace=True)
    print(len(df_evobonus.columns))

In [None]:
# Bonus correlation matrices
df_sumcorr = df_sumbonus.corr()
df_3m_corr = df_3m_bonus.corr()
df_evocorr = df_evobonus.corr()

# Trim column headers
df_sumcorr.rename(columns=lambda x: x.removeprefix('SUM_BONUS_'), inplace=True)
df_3m_corr.rename(columns=lambda x: x.removeprefix('SUM_BONUS_'), inplace=True)
df_3m_corr.rename(columns=lambda x: x.removesuffix('_3Mago'), inplace=True)
df_evocorr.rename(columns=lambda x: x.removeprefix('EVOLUTION_BONUS_'), inplace=True)

# Trim index
df_sumcorr.rename(index=lambda x: x.removeprefix('SUM_BONUS_'), inplace=True)
df_3m_corr.rename(index=lambda x: x.removeprefix('SUM_BONUS_'), inplace=True)
df_3m_corr.rename(index=lambda x: x.removesuffix('_3Mago'), inplace=True)
df_evocorr.rename(index=lambda x: x.removeprefix('EVOLUTION_BONUS_'), inplace=True)

# Sort DF rows identically
df_3m_corr = df_3m_corr.reindex(labels=df_sumcorr.columns.tolist())
df_evocorr = df_evocorr.reindex(labels=df_sumcorr.columns.tolist())

# Sort DF columns identically
df_3m_corr = df_3m_corr[df_sumcorr.columns.tolist()]
df_evocorr = df_evocorr[df_sumcorr.columns.tolist()]

# Print correlation matrices in a single row
from IPython.display import display_html 
df1_styler = df_sumcorr.style.set_table_attributes("style='display:inline', margin-right:20px;'").set_caption('Correlations Sum Bonus')
df2_styler = df_3m_corr.style.set_table_attributes("style='display:inline', margin-right:20px;'").set_caption('Correlations 3m Bonus')
df3_styler = df_evocorr.style.set_table_attributes("style='display:inline'").set_caption('Correlations Evo Bonus')
display_html(df1_styler._repr_html_()+df2_styler._repr_html_()+df3_styler._repr_html_(), raw=True)

# Print correlation heatmaps in single row
fig, axes = plt.subplots(figsize=(20,3), dpi=100, nrows=1, ncols=3)
sns.heatmap(df_sumcorr, ax=axes[0])   # annot=True, 
sns.heatmap(df_3m_corr, ax=axes[1])   # annot=True, 
sns.heatmap(df_evocorr, ax=axes[2])   # annot=True, 
axes[0].set_title('Correlations Sum Bonus')
axes[1].set_title('Correlations Sum Bonus 3m')
axes[2].set_title('Correlations Evo Bonus')
plt.subplots_adjust(wspace=0.8)
plt.show();

In [None]:
# Trim column headers
df_sumbonus.rename(columns=lambda x: x.removeprefix('SUM_BONUS_'), inplace=True)
df_3m_bonus.rename(columns=lambda x: x.removeprefix('SUM_BONUS_'), inplace=True)
df_3m_bonus.rename(columns=lambda x: x.removesuffix('_3Mago'), inplace=True)
#df_evocorr.rename(columns=lambda x: x.removeprefix('EVOLUTION_BONUS_'), inplace=True)

df_sumbonus.corrwith(df_3m_bonus, axis='index')

In [None]:
df_train_condensed = df_train.copy()
# Ignore names
df_train_condensed.drop(['NAME', 'FIRST_NAME'], axis=1, inplace=True)
# Drop columns with only a single value
df_train_condensed.drop(['EMPLOYEE_TYPE', 'JOB', 'STATUS'], axis=1, inplace=True)
# Ignore bonus details
for col in df_train_condensed.columns:
    if ( 'SUM_BONUS' in col or 'EVOLUTION_BONUS' in col ) and not 'TOTAL' in col:
        del df_train_condensed[col]
df_train_condensed.head()
#sns.pairplot(df_train_condensed,diag_kind='hist')

### Train | Test Split en Scaling

In [None]:
X = df_train.drop('Target_Churn',axis=1)
X = X.drop(['Employee ID', 'NAME', 'FIRST_NAME'], axis='columns')
y = df_train['Target_Churn']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

In [None]:
scaler = StandardScaler()

In [None]:
X_train.info()

In [None]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

## Maken van het Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log_model = LogisticRegression()

In [None]:
log_model.fit(scaled_X_train,y_train)

In [None]:
log_model.coef_

In [None]:
df.head()

### Model Performantie voor Classificatie

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report, ConfusionMatrixDisplay

In [None]:
y_pred = log_model.predict(scaled_X_test)

In [None]:
# herinner de accuracy paradox: we willen niet alleen op deze metric vertrouwen!
accuracy_score(y_test,y_pred)

In [None]:
# Idem: confusion matrix, maar mooi geplot - Ruwe waarden
cm = confusion_matrix(y_test, y_pred, labels=log_model.classes_)

disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=log_model.classes_)
disp.plot()

In [None]:
# Confusion matrix, genormaliseerd met normalize = true: 
# Dit is normalisatie over rijen, dus over de echte labels.  Normalisatie betekent dat de waarden in de rij
# zullen worden genormaliseerd om samen een som van 1 te bekomen (100%)

# Elke rij in de confusion matrix zal de proportie predicties voor de respectieve echte klasse weergeven.
# Dit geeft, voor elke echte klasse (0 / 1), welke proportie van de samples voor elke klasse werden voorspeld
# Concreet: de eerste rij geeft weer dat, van alle echte klasse 0 samples, 89% correct werd voorspeld, en 11% 
# foutief aan klasse 1 werden toegekend.
# De tweede rij geeft aan dat 5% van de echte klasse 1 samples foutief als klasse 0 werden voorspeld, en 95% juist.

cm = confusion_matrix(y_test, y_pred, labels=log_model.classes_, normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=log_model.classes_)
disp.plot()

In [None]:
# Confusion matrix, genormaliseerd met normalize = pred: 
# Dit is een normalisatie op kolom, dus normalisatie zodat de som van de predicties per klasse 1 geven (dus over kolommen) 
# Concreet voor elke kolom: de waarden representeren de proportie van die voorspelde klasse die tot elke werkelijke klasse behoren.
# Maw: het leert ons, voor een gegeven voorspelde klasse, welk percentage van de voorspellingen correct waren, en welk % misclassificaties
# van andere klasses waren.

cm = confusion_matrix(y_test, y_pred, labels=log_model.classes_, normalize='pred')

disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=log_model.classes_)
disp.plot()

In [None]:
# Confusion matrix, genormaliseerd met normalize = all: 
# Dit is een normalisatie op totaal van de samples.  Elke waarde in de cm zal worden gedeeld door het totaal aantal
# samples om zo de proportie van het totaal weer te geven, en dus de proportie True Negatives etc...

cm = confusion_matrix(y_test, y_pred, labels=log_model.classes_, normalize='all')

disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=log_model.classes_)
disp.plot()

In [None]:
# Precision, Recall en F1-score:
# vergeet het print statement niet hier...
print(classification_report(y_test,y_pred))

### Als je enkel precision en recall wil: 

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
precision_score(y_test, y_pred)

In [None]:
recall_score(y_test, y_pred)

### Als je een voorspelling voor een punt wil

In [None]:
log_model.predict_proba(scaled_X_test)[0]

In [None]:
# werkelijke waarde
y_test[0]

## Evaluating Curves and AUC

**Make sure to watch the video on this!**

In [None]:
from sklearn.metrics import precision_recall_curve,PrecisionRecallDisplay,RocCurveDisplay

In [None]:
# tradeoff tussen precision en recall visualiseren:
fig, ax = plt.subplots(figsize=(12,8), dpi=200)
y_prob = log_model.predict_proba(scaled_X_test)
y_prob = y_prob[:,1] # enkel kans klasse 1
precision, recall, _ = precision_recall_curve(y_test, y_prob)
disp = PrecisionRecallDisplay(precision=precision, recall=recall)
disp.plot(ax=ax)

In [None]:
# ROC curve
fig, ax = plt.subplots(figsize=(12,8), dpi=200)
RocCurveDisplay.from_estimator(log_model, scaled_X_test, y_test, ax=ax)

------
------