# Training Dataset

## Importing Libraries

In [15]:
import pandas as pd
import seaborn as sns
import numpy as np
import random as rnd
import matplotlib.pyplot as plt
import xgboost as xgb
import warnings

from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
warnings.filterwarnings('ignore')

!pip install xgboost




## Importing the dataset with pandas

In [2]:

# Location of the source workbook and the sheet that is read from it.
DATA_URL = 'http://data.iabac.org/exam/p2/data/INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls'
SHEET_NAME = 'INX_Future_Inc_Employee_Perform'

# Read the sheet into a DataFrame (default integer index) and preview the first rows.
df = pd.read_excel(DATA_URL, sheet_name=SHEET_NAME, index_col=None)
df.head()

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3


# Training a model

## Encoding Categorical Data

In [5]:
# One-hot encode the categorical columns of `df` and assemble the modelling frames.
#
# Produces:
#   df_final  - numeric columns + dummy-encoded categorical columns (target included)
#   df_final1 - df_final without the 'PerformanceRating' target column
#   X, Y      - feature frame (copy of df_final1) and single-column target frame

# Hand-written list of numeric feature names.
# NOTE(review): `num` is not referenced by the rest of this cell; the numeric columns
# actually used are derived from the dtypes below.
num = [
    'Age', 'DistanceFromHome', 'EmpEducationLevel', 'EmpEnvironmentSatisfaction',
    'EmpHourlyRate', 'EmpJobInvolvement', 'EmpJobLevel', 'EmpJobSatisfaction',
    'NumCompaniesWorked', 'EmpLastSalaryHikePercent',
    'EmpRelationshipSatisfaction', 'TotalWorkExperienceInYears',
    'TrainingTimesLastYear', 'EmpWorkLifeBalance',
    'ExperienceYearsAtThisCompany', 'ExperienceYearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
]

# Columns stored with dtype `object` are treated as categorical.
# `DataFrame.iteritems()` was removed in pandas 2.0; `items()` yields the same
# (column name, Series) pairs on both old and new pandas.
categorical = [col for col, value in df.items() if value.dtype == 'object']

# Every remaining column is treated as numerical (this includes the target).
numerical = df.columns.difference(categorical)

# Categorical part: drop the employee identifier, then one-hot encode.
df_cat = df[categorical].drop(['EmpNumber'], axis=1)
df_cat = pd.get_dummies(df_cat)

# Numerical part, taken as-is.
df_num = df[numerical]

# Join both parts column-wise.
df_final = pd.concat([df_num, df_cat], axis=1)

# Feature matrix = everything except the target column.
df_final1 = df_final.drop(['PerformanceRating'], axis=1)

X = df_final1.iloc[:, :]
Y = df_final.loc[:, ['PerformanceRating']]

df_final.head()


Unnamed: 0,Age,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,EmpJobLevel,EmpJobSatisfaction,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,...,EmpJobRole_Senior Manager R&D,EmpJobRole_Technical Architect,EmpJobRole_Technical Lead,BusinessTravelFrequency_Non-Travel,BusinessTravelFrequency_Travel_Frequently,BusinessTravelFrequency_Travel_Rarely,OverTime_No,OverTime_Yes,Attrition_No,Attrition_Yes
0,32,10,3,4,55,3,2,4,12,4,...,0,0,0,0,0,1,1,0,1,0
1,47,14,4,4,42,3,2,1,12,4,...,0,0,0,0,0,1,1,0,1,0
2,40,5,4,4,48,2,3,1,21,3,...,0,0,0,0,1,0,0,1,1,0
3,41,10,4,2,73,2,5,4,15,2,...,0,0,0,0,0,1,1,0,1,0
4,60,16,4,1,84,3,2,1,14,4,...,0,0,0,0,0,1,1,0,1,0


### Model by Extreme Gradient Boosting (XGBoost) Classifier

#### Step 1: Storing Values

In [6]:
# Positional selection on the raw frame: columns 1..26 become the feature array,
# column 27 becomes the target array.
# NOTE(review): relies on the column order of the loaded sheet — confirm that
# position 27 is PerformanceRating.
FEATURE_POSITIONS = slice(1, 27)
TARGET_POSITION = 27

X = df.iloc[:, FEATURE_POSITIONS].values
y = df.iloc[:, TARGET_POSITION].values
y

array([3, 3, 4, ..., 3, 3, 2], dtype=int64)

Observation
- X stores columns 1–26 of the dataset, i.e. every column except EmpNumber and PerformanceRating
- y stores PerformanceRating
- The output is the array of PerformanceRating categories, i.e. 1 for low, 2 for good, 3 for excellent and 4 for outstanding

#### Step 2: Splitting data into training and testing sets

In [11]:
# NOTE(review): X is bound to the dummy-encoded categorical frame only, yet the split
# below is performed on df_final1 (numeric + encoded columns) — confirm X is intended.
X = df_cat

# Target: the performance rating column of the assembled frame.
y = df_final['PerformanceRating']

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_final1,
    y,
    test_size=0.3,
    random_state=120,
)


#### Step 3: Over Sampling

In [None]:
# Oversample the training split with SMOTE; the test split is left untouched.
# `fit_sample` was removed from imbalanced-learn — `fit_resample` is the supported
# method and returns the same (features, target) pair.
oversampler = SMOTE(random_state=120)
smote_train, smote_target = oversampler.fit_resample(X_train, y_train)

#### Step 4: Setting parameters

In [None]:
# Seed shared with the classifier so training is repeatable.
seed = 0

# Keyword arguments unpacked into the classifier constructor.
# NOTE(review): 'max_features', 'min_samples_leaf' and 'verbose' look like scikit-learn
# gradient-boosting names rather than XGBoost ones — confirm whether they have any
# effect (the closest XGBoost settings appear to be colsample_*, min_child_weight
# and verbosity).
param = {
    'n_estimators': 500,
    # Only one 'max_features' entry is kept: a repeated key in a dict literal is
    # silently overwritten by its last value, so a second entry would be dead code.
    'max_features': 'sqrt',
    'learning_rate': 0.01,
    'max_depth': 12,
    'min_samples_leaf': 8,
    'subsample': 0.8,
    'random_state': seed,
    'verbose': 5,
}

#### Step 5: Applying the parameters to the XGBoost classifier

In [13]:

# Instantiate the XGBoost classifier from the `param` dict and train it.
# NOTE(review): the model is fitted on X_train / y_train from the split, not on the
# SMOTE-resampled smote_train / smote_target — confirm which one is intended.
classifier = XGBClassifier(**param)
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=12, max_features='sqrt', min_child_weight=1,
       min_samples_leaf=8, missing=None, n_estimators=500, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8, verbose=5)