In [1]:
import seaborn as sns
color = sns.color_palette()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read the train and test datasets
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [3]:
#preview the first few rows of the dataset
df_train.head().T

Unnamed: 0,0,1,2,3,4
EmployeeNo,YAK/S/00001,YAK/S/00002,YAK/S/00003,YAK/S/00004,YAK/S/00006
Division,Commercial Sales and Marketing,Customer Support and Field Operations,Commercial Sales and Marketing,Commercial Sales and Marketing,Information and Strategy
Qualification,"MSc, MBA and PhD",First Degree or HND,First Degree or HND,First Degree or HND,First Degree or HND
Gender,Female,Male,Male,Male,Male
Channel_of_Recruitment,Direct Internal process,Agency and others,Direct Internal process,Agency and others,Direct Internal process
Trainings_Attended,2,2,2,3,3
Year_of_birth,1986,1991,1987,1982,1990
Last_performance_score,12.5,12.5,7.5,2.5,7.5
Year_of_recruitment,2011,2015,2012,2009,2012
Targets_met,1,0,0,0,0


In [4]:
# descriptive statistics of the numeric features
df_train.describe()

Unnamed: 0,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,Promoted_or_Not
count,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0
mean,2.25368,1986.209334,7.698959,2013.139695,0.352996,0.023152,55.366465,0.084595
std,0.609443,7.646047,3.744135,4.261451,0.477908,0.150388,13.362741,0.278282
min,2.0,1950.0,0.0,1982.0,0.0,0.0,31.0,0.0
25%,2.0,1982.0,5.0,2012.0,0.0,0.0,43.0,0.0
50%,2.0,1988.0,7.5,2014.0,0.0,0.0,52.0,0.0
75%,2.0,1992.0,10.0,2016.0,1.0,0.0,68.0,0.0
max,11.0,2001.0,12.5,2018.0,1.0,1.0,91.0,1.0


In [5]:
df_train.isnull().sum() 

EmployeeNo                                0
Division                                  0
Qualification                          1679
Gender                                    0
Channel_of_Recruitment                    0
Trainings_Attended                        0
Year_of_birth                             0
Last_performance_score                    0
Year_of_recruitment                       0
Targets_met                               0
Previous_Award                            0
Training_score_average                    0
State_Of_Origin                           0
Foreign_schooled                          0
Marital_Status                            0
Past_Disciplinary_Action                  0
Previous_IntraDepartmental_Movement       0
No_of_previous_employers                  0
Promoted_or_Not                           0
dtype: int64

In [6]:
df_train['Age'] = 2019 - df_train['Year_of_birth']
df_test['Age'] = 2019 - df_test['Year_of_birth']
df_train['Working_Years'] = 2019 - df_train['Year_of_recruitment']
df_test['Working_Years'] = 2019 - df_test['Year_of_recruitment']

In [7]:
df_train['Qualification'].fillna(df_train['Qualification'].mode()[0], inplace=True)
df_test['Qualification'].fillna(df_test['Qualification'].mode()[0], inplace=True)

In [8]:
num_cols = ['Trainings_Attended', 'Year_of_birth', 'Last_performance_score', 'Year_of_recruitment',
           'Previous_Award', 'Training_score_average', 'Targets_met','Promoted_or_Not',
            'Age', 'Working_Years']

In [9]:
## regroup features
division = {'Commercial Sales and Marketing' :'CSS', 'Customer Support and Field Operations':'CSFO',
             'Information and Strategy':'IS', 'Information Technology and Solution Support':'ITSS',
             'Sourcing and Purchasing':'SP', 'Business Finance Operations':'BFO','People/HR Management':'HR', 
            'Research and Innovation':'RI', 'Regulatory and Legal services':'RLS'}
qualification = {'MSc, MBA and PhD':'MSc_MBA_PhD', 'First Degree or HND':'firstDegree_HND', 'Non-University Education':'primary_secondary'}
channel_of_recruitment = {'Direct Internal process':'direct_process', 'Agency and others':'agent_others', 'Referral and Special candidates':'referral_special'}
no_of_previous_employers = {'0':'0','1':'<=4', '2':'<=4', '3':'<=4', '4':'<=4','5':'>=5', 'More than 5':'>=5'}
Geopolitcal_zone = {'ANAMBRA':'South_East', 'KATSINA':'North_West', 'NIGER':'North_Central', 'AKWA IBOM':'South_South', 'ENUGU':'South_East',
        'OYO':'South_West',
       'LAGOS':'South_West', 'ONDO':'South_West', 'KWARA':'Noth_Central', 'EDO':'South_South', 'BAUCHI':'North_East', 
        'TARABA':'North_East', 'RIVERS':'South_South',
       'KADUNA':'North_West', 'PLATEAU':'North_Central', 'EKITI':'South_West', 'BORNO':'North_East', 'IMO':'South_East',
        'CROSS RIVER':'South_South',
       'KANO':'North_West', 'FCT':'North_Central', 'OGUN':'South_West', 'GOMBE':'North_East', 'DELTA':'South_South',
        'BENUE':'North_Central', 'ZAMFARA':'North_West',
       'OSUN':'South_West', 'ABIA':'South_East', 'KEBBI':'North_West', 'ADAMAWA':'North_East', 'NASSARAWA':'North_Central',
        'SOKOTO':'North_East', 'KOGI':'North_Central',
       'JIGAWA':'North_West', 'BAYELSA':'South_South', 'YOBE':'North_East', 'EBONYI':'South_East'}
datasets=[df_train, df_test]
for dataset in datasets:
    dataset['Division']=dataset['Division'].replace(division)
    dataset['Qualification'] = dataset['Qualification'].replace(qualification)
    dataset['Channel_of_Recruitment'] = dataset['Channel_of_Recruitment'].replace(channel_of_recruitment)
    dataset['No_of_previous_employers'] = dataset['No_of_previous_employers'].replace(no_of_previous_employers)
    dataset['Geopolitical_zone'] = dataset['State_Of_Origin'].replace(Geopolitcal_zone)

In [10]:
df_train.head()

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,...,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not,Age,Working_Years,Geopolitical_zone
0,YAK/S/00001,CSS,MSc_MBA_PhD,Female,direct_process,2,1986,12.5,2011,1,...,ANAMBRA,No,Married,No,No,0,0,33,8,South_East
1,YAK/S/00002,CSFO,firstDegree_HND,Male,agent_others,2,1991,12.5,2015,0,...,ANAMBRA,Yes,Married,No,No,0,0,28,4,South_East
2,YAK/S/00003,CSS,firstDegree_HND,Male,direct_process,2,1987,7.5,2012,0,...,KATSINA,Yes,Married,No,No,0,0,32,7,North_West
3,YAK/S/00004,CSS,firstDegree_HND,Male,agent_others,3,1982,2.5,2009,0,...,NIGER,Yes,Single,No,No,<=4,0,37,10,North_Central
4,YAK/S/00006,IS,firstDegree_HND,Male,direct_process,3,1990,7.5,2012,0,...,AKWA IBOM,Yes,Married,No,No,<=4,0,29,7,South_South


In [11]:
train_cols = ['Division', 'Qualification', 'Gender',
               'Channel_of_Recruitment', 'Trainings_Attended',
               'Last_performance_score', 'Targets_met',
               'Previous_Award', 'Training_score_average',
               'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action',
               'Previous_IntraDepartmental_Movement', 'No_of_previous_employers',
               'Promoted_or_Not', 'Age', 'Working_Years', 'Geopolitical_zone']
test_cols = ['Division', 'Qualification', 'Gender',
               'Channel_of_Recruitment', 'Trainings_Attended',
               'Last_performance_score', 'Targets_met',
               'Previous_Award', 'Training_score_average',
               'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action',
               'Previous_IntraDepartmental_Movement', 'No_of_previous_employers',
                'Age', 'Working_Years', 'Geopolitical_zone']
train_data = df_train[train_cols]
test_data = df_test[test_cols]

In [12]:
train_data.head()

Unnamed: 0,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Last_performance_score,Targets_met,Previous_Award,Training_score_average,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not,Age,Working_Years,Geopolitical_zone
0,CSS,MSc_MBA_PhD,Female,direct_process,2,12.5,1,0,41,No,Married,No,No,0,0,33,8,South_East
1,CSFO,firstDegree_HND,Male,agent_others,2,12.5,0,0,52,Yes,Married,No,No,0,0,28,4,South_East
2,CSS,firstDegree_HND,Male,direct_process,2,7.5,0,0,42,Yes,Married,No,No,0,0,32,7,North_West
3,CSS,firstDegree_HND,Male,agent_others,3,2.5,0,0,42,Yes,Single,No,No,<=4,0,37,10,North_Central
4,IS,firstDegree_HND,Male,direct_process,3,7.5,0,0,77,Yes,Married,No,No,<=4,0,29,7,South_South


In [13]:
## dummify the categorical features
cols = ['Division', 'Qualification', 'Gender', 'Channel_of_Recruitment',
       'Previous_Award', 'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action',
       'Previous_IntraDepartmental_Movement', 'No_of_previous_employers', 'Geopolitical_zone']
train = pd.get_dummies(train_data, columns =cols, prefix_sep='__')  # the seperator between the prefix (column name) and cell value
test = pd.get_dummies(test_data, columns =cols, prefix_sep='__')  # the seperator between the prefix (column name) and cell value

In [14]:
features = ['Trainings_Attended', 'Last_performance_score', 'Targets_met',
       'Training_score_average', 'Age', 'Working_Years',
       'Division__BFO', 'Division__CSFO', 'Division__CSS', 'Division__HR',
       'Division__IS', 'Division__ITSS', 'Division__RI', 'Division__RLS',
       'Division__SP', 'Qualification__MSc_MBA_PhD',
       'Qualification__firstDegree_HND', 'Qualification__primary_secondary',
       'Gender__Female', 'Gender__Male',
       'Channel_of_Recruitment__agent_others',
       'Channel_of_Recruitment__direct_process',
       'Channel_of_Recruitment__referral_special', 'Previous_Award__0',
       'Previous_Award__1', 'Foreign_schooled__No', 'Foreign_schooled__Yes',
       'Marital_Status__Married', 'Marital_Status__Not_Sure',
       'Marital_Status__Single', 'Past_Disciplinary_Action__No',
       'Past_Disciplinary_Action__Yes',
       'Previous_IntraDepartmental_Movement__No',
       'Previous_IntraDepartmental_Movement__Yes',
       'No_of_previous_employers__0', 'No_of_previous_employers__<=4',
       'No_of_previous_employers__>=5', 'Geopolitical_zone__North_Central',
       'Geopolitical_zone__North_East', 'Geopolitical_zone__North_West',
       'Geopolitical_zone__Noth_Central', 'Geopolitical_zone__South_East',
       'Geopolitical_zone__South_South', 'Geopolitical_zone__South_West']

In [15]:
predicted_class_name = ['Promoted_or_Not']
X = train[features].values
y = train[predicted_class_name].values

In [16]:
from sklearn.model_selection import train_test_split
import sklearn.metrics as sklm
import xgboost as xgb

In [17]:
split_test_size = 0.30
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=split_test_size, random_state=42)

In [18]:
colu = ['Trainings_Attended', 'Last_performance_score', 'Targets_met',
       'Training_score_average', 'Age', 'Working_Years',
       'Division__BFO', 'Division__CSFO', 'Division__CSS', 'Division__HR',
       'Division__IS', 'Division__ITSS', 'Division__RI', 'Division__RLS',
       'Division__SP', 'Qualification__MSc_MBA_PhD',
       'Qualification__firstDegree_HND', 'Qualification__primary_secondary',
       'Gender__Female', 'Gender__Male',
       'Channel_of_Recruitment__agent_others',
       'Channel_of_Recruitment__direct_process',
       'Channel_of_Recruitment__referral_special', 'Previous_Award__0',
       'Previous_Award__1', 'Foreign_schooled__No', 'Foreign_schooled__Yes',
       'Marital_Status__Married', 'Marital_Status__Not_Sure',
       'Marital_Status__Single', 'Past_Disciplinary_Action__No',
       'Past_Disciplinary_Action__Yes',
       'Previous_IntraDepartmental_Movement__No',
       'Previous_IntraDepartmental_Movement__Yes',
       'No_of_previous_employers__0', 'No_of_previous_employers__<=4',
       'No_of_previous_employers__>=5', 'Geopolitical_zone__North_Central',
       'Geopolitical_zone__North_East', 'Geopolitical_zone__North_West',
       'Geopolitical_zone__Noth_Central', 'Geopolitical_zone__South_East',
       'Geopolitical_zone__South_South', 'Geopolitical_zone__South_West']
test1=test[colu]

In [19]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)
test1=ss.transform(test1)

In [20]:
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])

In [21]:
xgb=xgb.XGBClassifier()
xgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [22]:
probabilities = xgb.predict_proba(X_test)
print_metrics(y_test, probabilities, 0.5)  

                 Confusion matrix
                 Score positive    Score negative
Actual positive     10509                10
Actual negative       725               250

Accuracy        0.94
AUC             0.90
Macro precision 0.95
Macro recall    0.63
 
           Positive      Negative
Num case    10519           975
Precision    0.94          0.96
Recall       1.00          0.26
F1           0.97          0.40


In [23]:
solution=xgb.predict(test1)
my_submission=pd.DataFrame({'EmployeeNo':df_test.EmployeeNo,'Promoted_or_Not': solution})
my_submission.to_csv('XgboostClassifierDSN_staff_algorithmic_promotion_prediction.csv', index=False)