In [10]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import Pipeline , FeatureUnion
from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    MinMaxScaler,
    PowerTransformer,
    StandardScaler,
    FunctionTransformer)
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.outliers import Winsorizer 
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import warnings

In [2]:
pd.set_option('display.max_columns' , None)
sklearn.set_config(transform_output='pandas')
warnings.filterwarnings('ignore')

In [3]:
path = 'D:\\Technocolab\\Attrition-Forecast-Analysis-and-Prediction\\Acme_Attrition Data'

Acme_df = pd.read_csv(path + '\\train.csv')

In [4]:
Acme_df.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,28,Travel_Frequently,783,Sales,1,2,Life Sciences,1,1927,3,Male,42,2,2,Sales Executive,4,Married,6834,19255,1,Yes,12,3,3,80,1,7,2,3,7,7,0,7,No
1,39,Travel_Frequently,443,Research & Development,8,1,Life Sciences,1,602,3,Female,48,3,1,Laboratory Technician,3,Married,3755,17872,1,No,11,3,1,80,1,8,3,3,8,3,0,7,No
2,37,Travel_Rarely,1192,Research & Development,5,2,Medical,1,460,4,Male,61,3,2,Manufacturing Director,4,Divorced,6347,23177,7,No,16,3,3,80,2,8,2,2,6,2,0,4,No
3,35,Travel_Rarely,1142,Research & Development,23,4,Medical,1,75,3,Female,30,3,1,Laboratory Technician,1,Married,4014,16002,3,Yes,15,3,3,80,1,4,3,3,2,2,2,2,No
4,36,Travel_Rarely,506,Research & Development,3,3,Technical Degree,1,397,3,Male,30,3,2,Research Scientist,2,Single,4485,26285,4,No,12,3,4,80,0,10,2,3,8,0,7,7,No


In [5]:
Acme_df['Attrition'] = Acme_df['Attrition'].replace({'No':0,'Yes':1})

In [6]:
Acme_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       940 non-null    int64 
 1   BusinessTravel            940 non-null    object
 2   DailyRate                 940 non-null    int64 
 3   Department                940 non-null    object
 4   DistanceFromHome          940 non-null    int64 
 5   Education                 940 non-null    int64 
 6   EducationField            940 non-null    object
 7   EmployeeCount             940 non-null    int64 
 8   EmployeeNumber            940 non-null    int64 
 9   EnvironmentSatisfaction   940 non-null    int64 
 10  Gender                    940 non-null    object
 11  HourlyRate                940 non-null    int64 
 12  JobInvolvement            940 non-null    int64 
 13  JobLevel                  940 non-null    int64 
 14  JobRole                   

In [7]:
X_train_main = Acme_df.drop(columns='Attrition')

X_train_main

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,28,Travel_Frequently,783,Sales,1,2,Life Sciences,1,1927,3,Male,42,2,2,Sales Executive,4,Married,6834,19255,1,Yes,12,3,3,80,1,7,2,3,7,7,0,7
1,39,Travel_Frequently,443,Research & Development,8,1,Life Sciences,1,602,3,Female,48,3,1,Laboratory Technician,3,Married,3755,17872,1,No,11,3,1,80,1,8,3,3,8,3,0,7
2,37,Travel_Rarely,1192,Research & Development,5,2,Medical,1,460,4,Male,61,3,2,Manufacturing Director,4,Divorced,6347,23177,7,No,16,3,3,80,2,8,2,2,6,2,0,4
3,35,Travel_Rarely,1142,Research & Development,23,4,Medical,1,75,3,Female,30,3,1,Laboratory Technician,1,Married,4014,16002,3,Yes,15,3,3,80,1,4,3,3,2,2,2,2
4,36,Travel_Rarely,506,Research & Development,3,3,Technical Degree,1,397,3,Male,30,3,2,Research Scientist,2,Single,4485,26285,4,No,12,3,4,80,0,10,2,3,8,0,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,26,Travel_Rarely,482,Research & Development,1,2,Life Sciences,1,1893,2,Female,90,2,1,Research Scientist,3,Married,2933,14908,1,Yes,13,3,3,80,1,1,3,2,1,0,1,0
936,39,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,4,Male,42,2,3,Healthcare Representative,1,Married,9991,21457,4,No,15,3,1,80,1,9,5,3,7,7,1,7
937,53,Travel_Rarely,868,Sales,8,3,Marketing,1,897,1,Male,73,3,4,Sales Executive,4,Married,11836,22789,5,No,14,3,3,80,1,28,3,3,2,0,2,2
938,19,Travel_Rarely,528,Sales,22,1,Marketing,1,167,4,Male,50,3,1,Sales Representative,3,Single,1675,26820,1,Yes,19,3,4,80,0,0,2,2,0,0,0,0


In [8]:
Y_train_main = Acme_df['Attrition'].copy()
Y_train_main

0      0
1      0
2      0
3      0
4      0
      ..
935    0
936    0
937    0
938    1
939    1
Name: Attrition, Length: 940, dtype: int64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_train_main, Y_train_main, test_size=0.05, random_state=0)

In [12]:
categorical_df = Acme_df.drop(columns='Attrition').select_dtypes(include=['object'])

categorical_df

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Travel_Frequently,Sales,Life Sciences,Male,Sales Executive,Married,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Female,Laboratory Technician,Married,No
2,Travel_Rarely,Research & Development,Medical,Male,Manufacturing Director,Divorced,No
3,Travel_Rarely,Research & Development,Medical,Female,Laboratory Technician,Married,Yes
4,Travel_Rarely,Research & Development,Technical Degree,Male,Research Scientist,Single,No
...,...,...,...,...,...,...,...
935,Travel_Rarely,Research & Development,Life Sciences,Female,Research Scientist,Married,Yes
936,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,No
937,Travel_Rarely,Sales,Marketing,Male,Sales Executive,Married,No
938,Travel_Rarely,Sales,Marketing,Male,Sales Representative,Single,Yes


In [13]:
non_categorical_df = Acme_df.select_dtypes(exclude=['object'])

non_categorical_df

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,28,783,1,2,1,1927,3,42,2,2,4,6834,19255,1,12,3,3,80,1,7,2,3,7,7,0,7,0
1,39,443,8,1,1,602,3,48,3,1,3,3755,17872,1,11,3,1,80,1,8,3,3,8,3,0,7,0
2,37,1192,5,2,1,460,4,61,3,2,4,6347,23177,7,16,3,3,80,2,8,2,2,6,2,0,4,0
3,35,1142,23,4,1,75,3,30,3,1,1,4014,16002,3,15,3,3,80,1,4,3,3,2,2,2,2,0
4,36,506,3,3,1,397,3,30,3,2,2,4485,26285,4,12,3,4,80,0,10,2,3,8,0,7,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,26,482,1,2,1,1893,2,90,2,1,3,2933,14908,1,13,3,3,80,1,1,3,2,1,0,1,0,0
936,39,613,6,1,1,2062,4,42,2,3,1,9991,21457,4,15,3,1,80,1,9,5,3,7,7,1,7,0
937,53,868,8,3,1,897,1,73,3,4,4,11836,22789,5,14,3,3,80,1,28,3,3,2,0,2,2,0
938,19,528,22,1,1,167,4,50,3,1,3,1675,26820,1,19,3,4,80,0,0,2,2,0,0,0,0,1


In [14]:
non_categorical_df.StandardHours.unique()

array([80])

## Description 

#### 1. nominal data

In [15]:
categorical_df.BusinessTravel.unique()

array(['Travel_Frequently', 'Travel_Rarely', 'Non-Travel'], dtype=object)

In [16]:
categorical_df.Department.unique()

array(['Sales', 'Research & Development', 'Human Resources'], dtype=object)

In [17]:
categorical_df.EducationField.unique()

array(['Life Sciences', 'Medical', 'Technical Degree', 'Marketing',
       'Other', 'Human Resources'], dtype=object)

In [18]:
categorical_df.JobRole.unique()

array(['Sales Executive', 'Laboratory Technician',
       'Manufacturing Director', 'Research Scientist', 'Human Resources',
       'Research Director', 'Healthcare Representative', 'Manager',
       'Sales Representative'], dtype=object)

In [19]:
categorical_df.MaritalStatus.unique()

array(['Married', 'Divorced', 'Single'], dtype=object)

In [20]:
categorical_df.OverTime.unique()

array(['Yes', 'No'], dtype=object)

#### - apply onehotencoding on `Department` , `EducationField` , `MaritalStatus` , `OverTime` , `Gender` and `JobRole`
#### - Apply ordinalEncoding on `BusinessTravel`

In [21]:
categorical_df.BusinessTravel.unique()

array(['Travel_Frequently', 'Travel_Rarely', 'Non-Travel'], dtype=object)

### Nominal Categorical Encoding

In [38]:
X_train['OverTime'] = X_train['OverTime'].map({'No':0,'Yes':1})
X_test['OverTime'] = X_test['OverTime'].map({'No':0,'Yes':1})
X_train['Gender'] = X_train['Gender'].map({'Male':0,'Female':1})
X_test['Gender'] = X_test['Gender'].map({'Male':0,'Female':1})

In [36]:
from sklearn.preprocessing import LabelEncoder
encoding_cols=['BusinessTravel','Department','EducationField','JobRole','MaritalStatus']
label_encoders = {}
for column in encoding_cols:
    label_encoders[column] = LabelEncoder()
    X_train[column] = label_encoders[column].fit_transform(X_train[column])
    X_test[column] = label_encoders[column].fit_transform(X_test[column])

In [24]:
X_train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
483,42,2,201,1,1,4,1,1,517,2,1,95,3,1,2,1,0,2576,20490,3,0,16,3,2,80,1,8,5,3,5,2,1,2
278,41,1,840,1,9,3,3,1,999,1,0,64,3,5,5,3,0,19419,3735,2,0,17,3,2,80,1,21,2,4,18,16,0,11
750,56,1,906,2,6,3,1,1,532,3,1,86,4,4,7,1,1,13212,18256,9,0,11,3,4,80,3,36,0,2,7,7,7,7
425,35,2,662,2,1,5,2,1,204,3,0,94,3,3,7,2,1,7295,11439,1,0,13,3,1,80,2,10,3,3,10,8,0,6
422,38,1,693,1,7,3,1,1,1382,4,0,57,4,1,6,3,0,2610,15748,1,0,11,3,4,80,3,4,2,3,4,2,0,3


## Encode Numeric Value

In [25]:
X_train.drop(columns=['EmployeeNumber' , 'StandardHours'])

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
483,42,2,201,1,1,4,1,1,2,1,95,3,1,2,1,0,2576,20490,3,0,16,3,2,1,8,5,3,5,2,1,2
278,41,1,840,1,9,3,3,1,1,0,64,3,5,5,3,0,19419,3735,2,0,17,3,2,1,21,2,4,18,16,0,11
750,56,1,906,2,6,3,1,1,3,1,86,4,4,7,1,1,13212,18256,9,0,11,3,4,3,36,0,2,7,7,7,7
425,35,2,662,2,1,5,2,1,3,0,94,3,3,7,2,1,7295,11439,1,0,13,3,1,2,10,3,3,10,8,0,6
422,38,1,693,1,7,3,1,1,4,0,57,4,1,6,3,0,2610,15748,1,0,11,3,4,3,4,2,3,4,2,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,42,2,419,2,12,4,2,1,2,0,77,3,2,7,4,0,5087,2900,3,1,12,3,3,2,14,4,3,0,0,0,0
192,26,2,1355,0,25,1,1,1,3,1,61,3,1,1,3,1,2942,8916,1,0,23,4,4,1,8,3,3,8,7,5,7
629,31,1,1125,2,7,4,2,1,1,1,68,3,3,7,1,1,9637,8277,2,0,14,3,4,2,9,3,3,3,2,2,2
559,37,2,408,1,19,2,1,1,2,0,73,3,1,6,2,1,3022,10227,4,0,21,4,1,0,8,1,3,1,0,0,0


In [35]:
X_train.BusinessTravel.unique()

array([2, 1, 0])

In [26]:
ordinal_features = ['Education','EnvironmentSatisfaction','JobInvolvement','JobSatisfaction',
                    'PerformanceRating','RelationshipSatisfaction','WorkLifeBalance']
Acme_df[ordinal_features].head()

Unnamed: 0,Education,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,PerformanceRating,RelationshipSatisfaction,WorkLifeBalance
0,2,3,2,4,3,3,3
1,1,3,3,3,3,1,3
2,2,4,3,4,3,3,2
3,4,3,3,1,3,3,3
4,3,3,3,2,3,4,3


In [27]:
Column_Transformer = ColumnTransformer(transformers=[
    ('scaler' , StandardScaler , ['MonthlyRate' , 'MonthlyIncome'])
])

In [28]:
from sklearn.linear_model import LogisticRegression  
estimator = LogisticRegression(random_state=0)

In [29]:
selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
)

In [32]:
estimator.fit(X_train,y_train)

In [42]:
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score

In [43]:
prediction=estimator.predict(X_test)
cnf_matrix = confusion_matrix(y_test,prediction)
print("Accuracy Score -", accuracy_score(y_test , prediction))

Accuracy Score - 0.7872340425531915


## Column Transform