### Task # 12:
- This is task # 12 overall, and task 2 of Week 6
### Task # 02:

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### Problem # 2.1:
Predicting Employee Attrition Using Logistic Regression
Dataset: HR Analytics Employee Attrition Dataset

Preprocessing Steps:
- Handle missing values if any.
- Encode categorical variables (e.g., one-hot encoding for department, gender, etc.).
- Standardize numerical features.

Task: Implement logistic regression to predict employee attrition and evaluate the model
using precision, recall, and F1-score.

In [57]:
#importing the dataset
# File path to the dataset
fp = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'

employee_df = pd.read_csv(fp)

# Null values check
#null values verification 
print(employee_df.columns[employee_df.isnull().any()])

#a look at the dataset
employee_df.head(3)


Index([], dtype='object')


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0


In [58]:
#Available column names
employee_df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [59]:
# so there are no null values lets proceed further
# employee_df['BusinessTravel'].value_counts()
# employee_df['Department'].value_counts()
# employee_df['EducationField'].value_counts()
# employee_df['MaritalStatus'].value_counts()
# employee_df['Gender'].value_counts()
# employee_df['JobRole']value_counts()

## All these variables are categorical so lets encode it with label encoder, we are not using hot encoding because we already have a large dataste with 
## many columns so one hot encoding will make it more complex

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Applying LabelEncoder to categorical columns
categorical_columns = ['BusinessTravel', 'Department', 'EducationField','Gender', 'JobRole', 'MaritalStatus','Over18','OverTime']


for col in categorical_columns:
    employee_df[col] = label_encoder.fit_transform(employee_df[col])

# Verifying the changes
employee_df[categorical_columns].head()

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,2,2,1,0,7,2,0,1
1,1,1,1,1,6,1,0,0
2,2,1,4,1,2,2,0,1
3,1,1,1,0,6,1,0,1
4,2,1,3,1,2,1,0,0


In [60]:
# lets have a look at our target variable 
employee_df['Attrition'].value_counts()

Attrition
No     1233
Yes     237
Name: count, dtype: int64

In [61]:
# as we can see from above output our dataset is imbalance lets apply undersampling to balance the dataset
from imblearn.under_sampling import RandomUnderSampler

# defining target variable and features as Y and X respectively
X = employee_df.drop(['Attrition', 'EmployeeCount', 'EmployeeNumber'], axis=1)  # Dropping the target and non-predictive columns
Y = employee_df['Attrition']  # The target variable

# Converting target variable to binary values for(0 and 1)
Y = Y.map({'Yes': 1, 'No': 0})

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, Y)

# Converting the resampled arrays back to DataFrames
X_rus_df = pd.DataFrame(X_rus, columns=X.columns)
y_rus_df = pd.DataFrame(y_rus, columns=['Attrition'])

# Combining X and y to view the complete DataFrame
rus_df = pd.concat([X_rus_df, y_rus_df], axis=1)
rus_df['Attrition'].value_counts()

Attrition
0    237
1    237
Name: count, dtype: int64

In [62]:
#as we can see from the above output we have now a balance dataset to be used 
#now lets have a look at possible columns that can be standardized
# List of selected columns to standardize
numerical_columns = [
    'Age', 'DailyRate', 'DistanceFromHome', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate',
    'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'StockOptionLevel',
    'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'
]

#this standard deviation will give us the idea , if the majority values are not near to 1 so we will apply standardization 
rus_df[numerical_columns].std()
#Before standardization

Age                           9.232097
DailyRate                   390.616351
DistanceFromHome              8.241896
HourlyRate                   20.150602
MonthlyIncome              4393.706891
MonthlyRate                7222.025231
NumCompaniesWorked            2.493892
PercentSalaryHike             3.714831
PerformanceRating             0.357242
StockOptionLevel              0.858134
TotalWorkingYears             7.528239
TrainingTimesLastYear         1.236253
WorkLifeBalance               0.743800
YearsAtCompany                5.951512
YearsInCurrentRole            3.374214
YearsSinceLastPromotion       3.167786
YearsWithCurrManager          3.446916
dtype: float64

In [63]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# # Standardizing the columns
rus_df[numerical_columns] = scaler.fit_transform(rus_df[numerical_columns])

rus_df[numerical_columns].std()
#After standardization

Age                        1.001057
DailyRate                  1.001057
DistanceFromHome           1.001057
HourlyRate                 1.001057
MonthlyIncome              1.001057
MonthlyRate                1.001057
NumCompaniesWorked         1.001057
PercentSalaryHike          1.001057
PerformanceRating          1.001057
StockOptionLevel           1.001057
TotalWorkingYears          1.001057
TrainingTimesLastYear      1.001057
WorkLifeBalance            1.001057
YearsAtCompany             1.001057
YearsInCurrentRole         1.001057
YearsSinceLastPromotion    1.001057
YearsWithCurrManager       1.001057
dtype: float64

In [86]:
#now the dataset has been balanced , standardized and encoded now its time for splitting the dataset into train and test 
#splitting for training and testing
from sklearn.model_selection import train_test_split


# Define features and target variable
X = X_rus_df  # Resampled and standardized features
Y = y_rus_df  # Resampled target

# Split into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Standardizing features after split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [87]:
# LogisticRegression

from sklearn.linear_model import LogisticRegression
 
# Initialize and train the Logistic Regression model with more iterations
log_reg = LogisticRegression(random_state=42, max_iter=10000)
log_reg.fit(X_train_scaled, Y_train.values.ravel())

# Making predictions on the testing set
y_pred = log_reg.predict(X_test_scaled)


In [88]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report


# Evaluate the model
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

print("Confusion Matrix:")
print(confusion_matrix(Y_test, y_pred))

print("Classification Report:")
print(classification_report(Y_test, y_pred))

Precision: 0.73
Recall: 0.67
F1 Score: 0.70
Confusion Matrix:
[[35 12]
 [16 32]]
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.74      0.71        47
           1       0.73      0.67      0.70        48

    accuracy                           0.71        95
   macro avg       0.71      0.71      0.70        95
weighted avg       0.71      0.71      0.70        95

