In [178]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [179]:
# Load the raw employee-performance sheet; the path is relative to the
# notebook's working directory.
data = pd.read_excel(
    io="INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls"
)

In [180]:
# Re-wrap the loaded frame, keeping the same columns in the same order.
data = pd.DataFrame(data=data, columns=data.columns)

In [181]:
# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

In [182]:
# Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,EmpJobLevel,EmpJobSatisfaction,NumCompaniesWorked,OverTime,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,4,55,3,2,4,1,No,12,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,4,42,3,2,1,2,No,12,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,4,48,2,3,1,5,Yes,21,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,2,73,2,5,4,3,No,15,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,1,84,3,2,1,8,No,14,4,10,1,3,2,2,2,2,No,3


In [183]:
#data.info()

There are no null values in the data, but some columns hold categorical values and need to be converted to numeric codes.

In [185]:
#print(data['Gender'].unique(),"\n")
#print(data['EducationBackground'].unique(),"\n")
#print(data['MaritalStatus'].unique(),"\n")
#print(data['EmpDepartment'].unique(),"\n")
#print(data['EmpJobRole'].unique(),"\n")
#print(data['BusinessTravelFrequency'].unique(),"\n")

In [186]:
#Encoding the data manually

def apply_mappings(data):
    """Replace the text categories of seven columns with fixed integer codes.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Employee records. A NumPy array is first wrapped in a DataFrame using
        the hard-coded column layout below, so its columns must be in that
        order.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Gender``, ``EducationBackground``,
        ``MaritalStatus``, ``EmpDepartment``, ``BusinessTravelFrequency``,
        ``OverTime`` and ``Attrition`` hold their integer codes. A value that
        is not a key of the corresponding mapping becomes NaN.

    Notes
    -----
    The input frame is never modified. Mapping in place would make a second
    call on the same frame look up already-encoded integers, which are not
    keys of the mappings, and turn every encoded column into NaN.
    """
    if isinstance(data, np.ndarray):
        # A bare array has no labels, so restore the expected column layout.
        column_names = [
            'EmpNumber', 'Age', 'Gender', 'EducationBackground', 'MaritalStatus',
            'EmpDepartment', 'EmpJobRole', 'BusinessTravelFrequency',
            'DistanceFromHome', 'EmpEducationLevel', 'EmpEnvironmentSatisfaction',
            'EmpHourlyRate', 'EmpJobInvolvement', 'EmpJobLevel',
            'EmpJobSatisfaction', 'NumCompaniesWorked', 'OverTime',
            'EmpLastSalaryHikePercent', 'EmpRelationshipSatisfaction',
            'TotalWorkExperienceInYears', 'TrainingTimesLastYear',
            'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany',
            'ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion',
            'YearsWithCurrManager', 'Attrition', 'PerformanceRating',
        ]
        data = pd.DataFrame(data, columns=column_names)
    else:
        # Copy so the caller's frame keeps its original text labels.
        data = data.copy()

    # Column name -> {category label: integer code}.
    column_mappings = {
        'Gender': {'Male': 1, 'Female': 0},
        'EducationBackground': {
            'Human Resources': 1, 'Other': 2, 'Technical Degree': 3,
            'Marketing': 4, 'Medical': 5, 'Life Sciences': 6,
        },
        'MaritalStatus': {'Single': 1, 'Married': 2, 'Divorced': 0},
        'EmpDepartment': {
            'Sales': 6, 'Development': 5, 'Research & Development': 4,
            'Human Resources': 3, 'Finance': 2, 'Data Science': 1,
        },
        'BusinessTravelFrequency': {
            'Non-Travel': 1, 'Travel_Frequently': 2, 'Travel_Rarely': 3,
        },
        'OverTime': {'Yes': 0, 'No': 1},
        'Attrition': {'Yes': 0, 'No': 1},
    }

    for column, mapping in column_mappings.items():
        data[column] = data[column].map(mapping)
    return data

# Wrap apply_mappings so it can be used as a scikit-learn transformer step.
mapping_transformer = FunctionTransformer(func=apply_mappings)



In [187]:
def encode_and_drop_columns(data):
    """Label-encode ``EmpJobRole`` and drop the ``EmpNumber`` identifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that may contain ``EmpJobRole`` and/or ``EmpNumber``; each
        column is handled only if present.

    Returns
    -------
    pandas.DataFrame
        A frame without ``EmpJobRole`` and ``EmpNumber``. When ``EmpJobRole``
        was present, an ``EmpJobRoleEncod`` column holding its integer codes
        is appended as the last column.

    Notes
    -----
    The input frame is not modified; the columns are dropped on a new frame
    so the caller's data stays intact across repeated runs.

    NOTE(review): the LabelEncoder is fitted again on every call, so the
    codes depend on the set of roles present in that particular input.
    Confirm this is acceptable before transforming data other than the
    training set.
    """
    result = data
    if 'EmpJobRole' in result.columns:
        # Fit on the roles in this frame and turn them into integer codes.
        role_codes = LabelEncoder().fit_transform(result['EmpJobRole'])
        result = (
            result
            .drop(columns=['EmpJobRole'])
            .assign(EmpJobRoleEncod=role_codes)
        )
    if 'EmpNumber' in result.columns:
        # EmpNumber is an identifier, not a predictive feature.
        result = result.drop(columns=['EmpNumber'])
    return result

# Wrap encode_and_drop_columns so it can be used as a scikit-learn transformer step.
label_encoding_transformer = FunctionTransformer(func=encode_and_drop_columns)

In [188]:
#data.head()

In [189]:
def scale_features(data):
    """Min-max scale the numeric feature columns and keep the rest as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``columns_to_scale`` (a
        missing one raises ``KeyError``) plus any other columns.

    Returns
    -------
    pandas.DataFrame
        The scaled columns first, in the order listed below, followed by the
        remaining columns unchanged. The row index of ``data`` is preserved.

    Notes
    -----
    The scaled frame is built on ``data.index``. Without it the scaled part
    would get a fresh 0..n-1 index, and the column-wise concat would align
    it against the original index, misplacing rows and inserting NaN
    whenever the input index is not the default range.

    NOTE(review): the MinMaxScaler is fitted again on every call, so the
    min/max come from whatever frame is passed in. Confirm this is
    acceptable before transforming data other than the training set.
    """
    columns_to_scale = [
        'Age', 'EducationBackground', 'EmpDepartment', 'DistanceFromHome',
        'EmpEducationLevel', 'EmpEnvironmentSatisfaction', 'EmpHourlyRate',
        'EmpJobInvolvement', 'EmpJobLevel', 'EmpJobSatisfaction',
        'NumCompaniesWorked', 'EmpLastSalaryHikePercent',
        'EmpRelationshipSatisfaction', 'TotalWorkExperienceInYears',
        'TrainingTimesLastYear', 'EmpWorkLifeBalance',
        'ExperienceYearsAtThisCompany', 'ExperienceYearsInCurrentRole',
        'YearsSinceLastPromotion', 'YearsWithCurrManager', 'EmpJobRoleEncod',
    ]

    features = data[columns_to_scale]
    passthrough = data.drop(columns=columns_to_scale)

    # Fit on this frame and rescale each selected column to the [0, 1] range.
    scaler = MinMaxScaler()
    scaled = pd.DataFrame(
        scaler.fit_transform(features),
        columns=columns_to_scale,
        index=data.index,  # keep rows aligned with the passthrough columns
    )

    # Scaled columns first, untouched columns after.
    return pd.concat([scaled, passthrough], axis=1)


In [190]:
# Chain the three preprocessing functions in order: map categories to codes,
# label-encode the job role and drop the identifier, then scale the features.
pipeline = Pipeline(
    [
        (
            'mapping_transformer',
            FunctionTransformer(func=apply_mappings, validate=False),
        ),
        (
            'label_encoding_transformer',
            FunctionTransformer(func=encode_and_drop_columns, validate=False),
        ),
        (
            'scaling',
            FunctionTransformer(func=scale_features, validate=False),
        ),
    ]
)
# Ask every step to return pandas DataFrames.
pipeline.set_output(transform='pandas')

In [191]:
# Run the whole preprocessing chain on the raw data.
final_data = pipeline.fit_transform(X=data)

In [192]:
# Save the processed frame without writing the row index as a column.
final_data.to_excel(excel_writer='Processed_data.xlsx', index=False)
#final_data.head()

In [193]:
import joblib

# Persist the pipeline object. NOTE: its steps wrap functions defined in this
# notebook, and pickle stores functions by reference (module and name), so
# whatever loads this file must be able to resolve functions with those names.
joblib.dump(value=pipeline, filename='processed_pipeline.pkl')

['processed_pipeline.pkl']