In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import getpass

In [3]:
# Current login name; used below to build the path to the training CSV.
user= getpass.getuser()

In [5]:
# Build the CSV location from the current username, then load it.
# NOTE: this is an absolute, Windows-style Desktop path.
train_csv_path = "C:\\Users\\" + user + "\\Desktop\\EDA\\HR\\train.csv"
df_train = pd.read_csv(train_csv_path)

In [6]:
# Preview the first rows of the loaded training data.
df_train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [4]:
# (rows, columns) of the training frame.
df_train.shape

(54808, 14)

In [5]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


## Handling Null Values

In [6]:
# Number of missing values per column.
df_train.isna().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

The output above shows that null values are present only in the education and previous_year_rating columns.

#### Education  -  Categorical 
#### previous_year_rating - Numerical (Float) 


In [7]:
# Distinct education levels (including NaN) before imputation.
df_train['education'].unique()

array(["Master's & above", "Bachelor's", nan, 'Below Secondary'],
      dtype=object)

In [8]:
# mode() yields the most frequent value(s); keep the first one.
education_modes = df_train['education'].mode()
education_mode = education_modes[0]

In [9]:
# Show the most frequent education level computed above.
education_mode

"Bachelor's"

In general, most people start working after completing a bachelor's degree, and in this dataset Bachelor's is also the most common education level. Hence, the null values in education are filled with the mode.

In [10]:
# Replace missing education entries with the most frequent level.
education_missing = df_train['education'].isnull()
df_train.loc[education_missing, 'education'] = education_mode

### Filling previous_year_rating null values with random imputation

In [11]:
# Name of the column to impute; the helper column is named variable + '_imputation'.
variable = 'previous_year_rating'

In [12]:
# Random-sample imputation: fill each missing rating with a value drawn
# (without replacement, fixed seed) from the observed ratings.
imputed_col = variable + '_imputation'
df_train[imputed_col] = df_train['previous_year_rating']

# Rows that still need a value.
missing_mask = df_train[imputed_col].isnull()

# Draw exactly one observed rating per missing row.
random = df_train[imputed_col].dropna().sample(missing_mask.sum(), random_state=0)

# Re-label the draws with the index of the missing rows so the .loc
# assignment below aligns each draw with one missing row.
random.index = df_train.index[missing_mask]
df_train.loc[missing_mask, imputed_col] = random

In [13]:
# Remove the original rating column (the imputed copy is kept).
df_train = df_train.drop(columns=['previous_year_rating'])

In [14]:
# Give the imputed column the original column name.
imputed_to_original = {'previous_year_rating_imputation': 'previous_year_rating'}
df_train = df_train.rename(columns=imputed_to_original)

In [15]:
# Per-column flag: does any missing value remain?
df_train.isna().any()

employee_id             False
department              False
region                  False
education               False
gender                  False
recruitment_channel     False
no_of_trainings         False
age                     False
length_of_service       False
KPIs_met >80%           False
awards_won?             False
avg_training_score      False
is_promoted             False
previous_year_rating    False
dtype: bool

In [16]:
# Preview the frame after imputation; previous_year_rating is now the last column.
df_train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,previous_year_rating
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,8,1,0,49,0,5.0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,4,0,0,60,0,5.0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,7,0,0,50,0,3.0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,10,0,0,50,0,1.0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,2,0,0,73,0,3.0


### Dropping employee_id column

In [17]:
# Remove the employee_id column from the training frame.
df_train = df_train.drop(columns=['employee_id'])

## Encoding Techniques

### Converting categorical values to numerical values

-> Using Target Guided Ordinal Encoding for department feature

-> Using probability ratio encoding for region  feature

-> Using Count/Frequency encoding method for education feature

-> Using One Hot Encoding for gender feature

-> Using mapping method for recruitment_channel feature

### 1. Target Guided Ordinal Encoding for department feature

In [17]:
# Abbreviate each department to its first character.
department_initials = df_train['department'].str[0]

# Guard: truncating to one letter would silently merge two departments that
# share an initial, which would corrupt the target-guided encoding below.
assert department_initials.nunique() == df_train['department'].nunique(), (
    "department initials are not unique; first-letter abbreviation would merge departments"
)

df_train['department'] = department_initials

In [18]:
# Mean of is_promoted per department, shown in ascending order.
promotion_rate_by_dept = df_train.groupby(['department'])['is_promoted'].mean()
promotion_rate_by_dept.sort_values()

department
L    0.051011
H    0.056245
R    0.069069
S    0.072031
F    0.081230
O    0.090148
A    0.095665
P    0.096386
T    0.107593
Name: is_promoted, dtype: float64

In [19]:
# Department labels ordered from lowest to highest mean is_promoted.
ordinal_labels = (
    df_train
    .groupby(['department'])['is_promoted']
    .mean()
    .sort_values()
    .index
)

In [20]:
# Show the department labels in their promotion-rate order.
ordinal_labels

Index(['L', 'H', 'R', 'S', 'F', 'O', 'A', 'P', 'T'], dtype='object', name='department')

In [21]:
# Map each department label to its rank (0 = lowest mean is_promoted).
dict_labels = dict(zip(ordinal_labels, range(len(ordinal_labels))))

In [22]:
# Show the department -> rank mapping.
dict_labels

{'L': 0, 'H': 1, 'R': 2, 'S': 3, 'F': 4, 'O': 5, 'A': 6, 'P': 7, 'T': 8}

In [23]:
# Replace each department label with its rank from dict_labels.
df_train['department'] = df_train['department'].map(dict_labels)

In [24]:
# Preview the frame after encoding department.
df_train.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,previous_year_rating
0,3,region_7,Master's & above,f,sourcing,1,35,8,1,0,49,0,5.0
1,5,region_22,Bachelor's,m,other,1,30,4,0,0,60,0,5.0
2,3,region_19,Bachelor's,m,sourcing,1,34,7,0,0,50,0,3.0
3,3,region_23,Bachelor's,m,other,2,39,10,0,0,50,0,1.0
4,8,region_26,Bachelor's,m,other,1,45,2,0,0,73,0,3.0


### 2. probability ratio encoding for region feature

In [25]:
# Distinct region labels before encoding.
df_train.region.unique()

array(['region_7', 'region_22', 'region_19', 'region_23', 'region_26',
       'region_2', 'region_20', 'region_34', 'region_1', 'region_4',
       'region_29', 'region_31', 'region_15', 'region_14', 'region_11',
       'region_5', 'region_28', 'region_17', 'region_13', 'region_16',
       'region_25', 'region_10', 'region_27', 'region_30', 'region_12',
       'region_21', 'region_8', 'region_32', 'region_6', 'region_33',
       'region_24', 'region_3', 'region_9', 'region_18'], dtype=object)

In [26]:
# Mean of is_promoted per region, in ascending order.
reg_values = (
    df_train
    .groupby(['region'])['is_promoted']
    .mean()
    .sort_values()
)

In [27]:
# Turn the per-region Series into a one-column frame so more columns can be added.
reg_values = reg_values.to_frame()

In [28]:
# Complement of the per-region promotion mean.
reg_values = reg_values.assign(not_promoted=1 - reg_values['is_promoted'])

In [29]:
# Probability ratio per region: promoted share divided by not-promoted share.
# NOTE(review): a region whose not_promoted share is 0 would divide by zero here.
reg_values['prob_ratio_en'] = reg_values['is_promoted'].div(reg_values['not_promoted'])

In [30]:
# region label -> probability ratio, as a plain dict for use with .map().
prob_encoded = reg_values['prob_ratio_en'].to_dict()

In [31]:
# Replace each region label with its probability ratio from prob_encoded.
df_train['region'] = df_train['region'].map(prob_encoded)

### 3. Count/Frequency encoding method for education feature

In [32]:
# Count how often each education level occurs, then keep it as a dict.
education_counts = df_train['education'].value_counts()
edu_dict = education_counts.to_dict()

In [33]:
# Show the education level -> row count mapping.
edu_dict

{"Bachelor's": 39078, "Master's & above": 14925, 'Below Secondary': 805}

In [34]:
# Replace each education level with its row count from edu_dict.
df_train['education'] = df_train['education'].map(edu_dict)

In [35]:
# Number of missing gender values, checked before encoding.
df_train['gender'].isna().sum()

0

### 4. One Hot Encoding for gender feature

In [36]:
# One-hot encode gender, dropping the first level so a single indicator
# column remains.
# dtype=int keeps the 0/1 integers shown in the output below; pandas >= 2.0
# would otherwise return booleans.
gender_dummies = pd.get_dummies(
    df_train['gender'], drop_first=True, dummy_na=False, dtype=int
)

# Assign the indicator column itself (a Series), not the one-column DataFrame.
df_train['gender'] = gender_dummies.iloc[:, 0]

In [38]:
# Confirm that the encodings so far introduced no missing values.
df_train.isna().sum()

department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
previous_year_rating    0
dtype: int64

In [40]:
# Preview the frame after encoding region, education and gender.
df_train.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,previous_year_rating
0,3,0.119251,14925,0,sourcing,1,35,8,1,0,49,0,5.0
1,5,0.128908,39078,1,other,1,30,4,0,0,60,0,5.0
2,3,0.064555,39078,1,sourcing,1,34,7,0,0,50,0,3.0
3,3,0.131985,39078,1,other,2,39,10,0,0,50,0,1.0
4,8,0.067548,39078,1,other,1,45,2,0,0,73,0,3.0


### 5. mapping method for recruitment_channel feature

In [41]:
# Distinct recruitment channels before encoding.
df_train['recruitment_channel'].unique()

array(['sourcing', 'other', 'referred'], dtype=object)

In [42]:
# Hand-picked integer code for each recruitment channel.
dict_channel = dict(referred=1, sourcing=2, other=3)

In [43]:
# Replace each recruitment channel with its integer code from dict_channel.
df_train['recruitment_channel']=df_train['recruitment_channel'].map(dict_channel)

In [44]:
# Preview the frame after encoding recruitment_channel.
df_train.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,previous_year_rating
0,3,0.119251,14925,0,2,1,35,8,1,0,49,0,5.0
1,5,0.128908,39078,1,3,1,30,4,0,0,60,0,5.0
2,3,0.064555,39078,1,2,1,34,7,0,0,50,0,3.0
3,3,0.131985,39078,1,3,2,39,10,0,0,50,0,1.0
4,8,0.067548,39078,1,3,1,45,2,0,0,73,0,3.0


#### All categorical features are converted to numerical values