In [1]:
# import neccessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Get training data into pandas dataframe
train_data = pd.read_csv('./intercampusai2019/train.csv')

In [3]:
# Test data into pandas Dataframe
test_data = pd.read_csv('./intercampusai2019/test.csv')

In [4]:
train_data.columns

Index(['EmployeeNo', 'Division', 'Qualification', 'Gender',
       'Channel_of_Recruitment', 'Trainings_Attended', 'Year_of_birth',
       'Last_performance_score', 'Year_of_recruitment', 'Targets_met',
       'Previous_Award', 'Training_score_average', 'State_Of_Origin',
       'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action',
       'Previous_IntraDepartmental_Movement', 'No_of_previous_employers',
       'Promoted_or_Not'],
      dtype='object')

In [5]:
train_data.head()

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not
0,YAK/S/00001,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,2,1986,12.5,2011,1,0,41,ANAMBRA,No,Married,No,No,0,0
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,ANAMBRA,Yes,Married,No,No,0,0
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,KATSINA,Yes,Married,No,No,0,0
3,YAK/S/00004,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,1982,2.5,2009,0,0,42,NIGER,Yes,Single,No,No,1,0
4,YAK/S/00006,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,1990,7.5,2012,0,0,77,AKWA IBOM,Yes,Married,No,No,1,0


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38312 entries, 0 to 38311
Data columns (total 19 columns):
EmployeeNo                             38312 non-null object
Division                               38312 non-null object
Qualification                          36633 non-null object
Gender                                 38312 non-null object
Channel_of_Recruitment                 38312 non-null object
Trainings_Attended                     38312 non-null int64
Year_of_birth                          38312 non-null int64
Last_performance_score                 38312 non-null float64
Year_of_recruitment                    38312 non-null int64
Targets_met                            38312 non-null int64
Previous_Award                         38312 non-null int64
Training_score_average                 38312 non-null int64
State_Of_Origin                        38312 non-null object
Foreign_schooled                       38312 non-null object
Marital_Status                         383

In [7]:
# Create colunms for employee age and work years
train_data['Employee_age'] = 2019 - train_data['Year_of_birth']
train_data['Work_years'] = 2019 - train_data['Year_of_recruitment']

test_data['Employee_age'] = 2019 - test_data['Year_of_birth']
test_data['Work_years'] = 2019 - test_data['Year_of_recruitment']

In [8]:
# As can be seen on train_data.info() Qualification seems to have nan values
train_data['Qualification'].unique()
train_data['Qualification'].fillna('non', inplace = True)

test_data['Qualification'].unique()
test_data['Qualification'].fillna('non', inplace = True)

In [9]:
# Select numerical columns
numerical_cols = ['Trainings_Attended', 'Last_performance_score', 'Targets_met',
                  'Previous_Award', 'Training_score_average', 'Employee_age', 'Work_years']

In [10]:
# Select categorical columns
cat_cols = ['Division', 'Qualification', 'Gender', 'Foreign_schooled',
            'Marital_Status', 'Past_Disciplinary_Action',
            'Previous_IntraDepartmental_Movement', 'Channel_of_Recruitment',
            'No_of_previous_employers']

In [11]:
# Using Label Encoder for the categorical variables
encoder = LabelEncoder()
train_data_encoded = train_data[cat_cols].apply(encoder.fit_transform)
test_data_encoded = test_data[cat_cols].apply(encoder.fit_transform)

In [12]:
train_data_encoded.head()

Unnamed: 0,Division,Qualification,Gender,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,Channel_of_Recruitment,No_of_previous_employers
0,1,1,0,0,0,0,0,1,0
1,2,0,1,1,0,0,0,0,0
2,1,0,1,1,0,0,0,1,0
3,1,0,1,1,2,0,0,0,1
4,4,0,1,1,0,0,0,1,1


In [13]:
train_data1 = train_data[numerical_cols].join(train_data_encoded)
test_data1 = test_data[numerical_cols].join(test_data_encoded)

In [14]:
train_data1.head()

Unnamed: 0,Trainings_Attended,Last_performance_score,Targets_met,Previous_Award,Training_score_average,Employee_age,Work_years,Division,Qualification,Gender,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,Channel_of_Recruitment,No_of_previous_employers
0,2,12.5,1,0,41,33,8,1,1,0,0,0,0,0,1,0
1,2,12.5,0,0,52,28,4,2,0,1,1,0,0,0,0,0
2,2,7.5,0,0,42,32,7,1,0,1,1,0,0,0,1,0
3,3,2.5,0,0,42,37,10,1,0,1,1,2,0,0,0,1
4,3,7.5,0,0,77,29,7,4,0,1,1,0,0,0,1,1


In [15]:
y = train_data.Promoted_or_Not
X = train_data1

In [16]:
# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                      random_state=0)  

In [17]:
dtrain = lgb.Dataset(X_train, label=y_train)

In [18]:
param = {}
param['learning_rate']= 0.001
param['boosting_type']='gbdt'
param['objective']='binary'
param['metric']='mean_absolute_error'
param['sub_feature']=0.9
param['num_leaves']= 25
param['min_data']=30
param['max_depth']=0

In [19]:
basemodel= lgb.train(param, dtrain, 1000)

In [20]:
pred = basemodel.predict(X_valid)

#convert into binary values

for i in range(0,1000):
    if (pred[i] >= 0.5):
        pred[i] = 1
    else:
        pred[i] =0
len(pred)       

7663

In [21]:
# Evaluate the model
score = mean_absolute_error(y_valid, pred)
print('MAE:', score)
score1 = roc_auc_score(y_valid, pred)
score1


MAE: 0.11690634903490375


0.8393866586375296

In [22]:
preds_test = basemodel.predict(test_data1).astype(int)

In [23]:
# Saving output to submission.csv
output = pd.DataFrame({'EmployeeNo': test_data.EmployeeNo,
                       'Promoted_or_Not': preds_test})
output.to_csv('submission.csv', index=False)