In [2]:
# 1. Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import shap

In [3]:
# 2. Load data
df = pd.read_csv('hr_data.csv')
df

Unnamed: 0,EmployeeID,Department,JobRole,Salary,Attrition,YearsAtCompany,YearsSinceLastPromotion,Gender,Age
0,1,Sales,Sales Executive,58000,Yes,3,1,Male,28
1,2,HR,HR Manager,76000,No,7,2,Female,35
2,3,IT,Software Engineer,89000,No,4,3,Male,30
3,4,Finance,Financial Analyst,67000,Yes,2,2,Female,29
4,5,Sales,Sales Manager,92000,No,8,4,Male,40
5,6,IT,Data Scientist,105000,No,6,1,Male,32
6,7,HR,Recruiter,48000,Yes,1,1,Female,26
7,8,Finance,Accountant,58000,No,5,3,Female,31
8,9,IT,Systems Engineer,71000,Yes,3,1,Male,27
9,10,Sales,Sales Executive,60000,No,2,2,Female,28


In [4]:
# 3. EDA

## a) Department-wise Attrition
dept_attrition = df.groupby('Department')['Attrition'].value_counts(normalize=True).unstack() * 100
print("Department-wise Attrition (%)\n", dept_attrition)

Department-wise Attrition (%)
 Attrition          No        Yes
Department                      
Finance     60.000000  40.000000
HR          25.000000  75.000000
IT          83.333333  16.666667
Sales       80.000000  20.000000


In [5]:
## b) Salary Bands
df['SalaryBand'] = pd.cut(df['Salary'], bins=[0,60000,80000,100000,120000], labels=['Low','Medium','High','Very High'])
df

Unnamed: 0,EmployeeID,Department,JobRole,Salary,Attrition,YearsAtCompany,YearsSinceLastPromotion,Gender,Age,SalaryBand
0,1,Sales,Sales Executive,58000,Yes,3,1,Male,28,Low
1,2,HR,HR Manager,76000,No,7,2,Female,35,Medium
2,3,IT,Software Engineer,89000,No,4,3,Male,30,High
3,4,Finance,Financial Analyst,67000,Yes,2,2,Female,29,Medium
4,5,Sales,Sales Manager,92000,No,8,4,Male,40,High
5,6,IT,Data Scientist,105000,No,6,1,Male,32,Very High
6,7,HR,Recruiter,48000,Yes,1,1,Female,26,Low
7,8,Finance,Accountant,58000,No,5,3,Female,31,Low
8,9,IT,Systems Engineer,71000,Yes,3,1,Male,27,Medium
9,10,Sales,Sales Executive,60000,No,2,2,Female,28,Low


In [6]:
## c) Promotions
df['PromotionInLast2Years'] = df['YearsSinceLastPromotion'].apply(lambda x: 'Yes' if x <= 2 else 'No')
df

Unnamed: 0,EmployeeID,Department,JobRole,Salary,Attrition,YearsAtCompany,YearsSinceLastPromotion,Gender,Age,SalaryBand,PromotionInLast2Years
0,1,Sales,Sales Executive,58000,Yes,3,1,Male,28,Low,Yes
1,2,HR,HR Manager,76000,No,7,2,Female,35,Medium,Yes
2,3,IT,Software Engineer,89000,No,4,3,Male,30,High,No
3,4,Finance,Financial Analyst,67000,Yes,2,2,Female,29,Medium,Yes
4,5,Sales,Sales Manager,92000,No,8,4,Male,40,High,No
5,6,IT,Data Scientist,105000,No,6,1,Male,32,Very High,Yes
6,7,HR,Recruiter,48000,Yes,1,1,Female,26,Low,Yes
7,8,Finance,Accountant,58000,No,5,3,Female,31,Low,No
8,9,IT,Systems Engineer,71000,Yes,3,1,Male,27,Medium,Yes
9,10,Sales,Sales Executive,60000,No,2,2,Female,28,Low,Yes


In [7]:
# 4. Data Preprocessing
## Encode categorical variables
le = LabelEncoder()
df['Attrition_Flag'] = le.fit_transform(df['Attrition'])  # 1=Yes, 0=No
df['Department_Encoded'] = le.fit_transform(df['Department'])
df['Gender_Encoded'] = le.fit_transform(df['Gender'])
df['SalaryBand_Encoded'] = le.fit_transform(df['SalaryBand'])
df['Promotion_Flag'] = df['PromotionInLast2Years'].map({'Yes':1, 'No':0})
df

Unnamed: 0,EmployeeID,Department,JobRole,Salary,Attrition,YearsAtCompany,YearsSinceLastPromotion,Gender,Age,SalaryBand,PromotionInLast2Years,Attrition_Flag,Department_Encoded,Gender_Encoded,SalaryBand_Encoded,Promotion_Flag
0,1,Sales,Sales Executive,58000,Yes,3,1,Male,28,Low,Yes,1,3,1,1,1
1,2,HR,HR Manager,76000,No,7,2,Female,35,Medium,Yes,0,1,0,2,1
2,3,IT,Software Engineer,89000,No,4,3,Male,30,High,No,0,2,1,0,0
3,4,Finance,Financial Analyst,67000,Yes,2,2,Female,29,Medium,Yes,1,0,0,2,1
4,5,Sales,Sales Manager,92000,No,8,4,Male,40,High,No,0,3,1,0,0
5,6,IT,Data Scientist,105000,No,6,1,Male,32,Very High,Yes,0,2,1,3,1
6,7,HR,Recruiter,48000,Yes,1,1,Female,26,Low,Yes,1,1,0,1,1
7,8,Finance,Accountant,58000,No,5,3,Female,31,Low,No,0,0,0,1,0
8,9,IT,Systems Engineer,71000,Yes,3,1,Male,27,Medium,Yes,1,2,1,2,1
9,10,Sales,Sales Executive,60000,No,2,2,Female,28,Low,Yes,0,3,0,1,1


In [8]:
## Features and Target
features = ['Department_Encoded', 'Salary', 'YearsAtCompany', 'Gender_Encoded', 'Promotion_Flag']
X = df[features]
y = df['Attrition_Flag']
features


['Department_Encoded',
 'Salary',
 'YearsAtCompany',
 'Gender_Encoded',
 'Promotion_Flag']

In [9]:
# 5. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [10]:
# 6. Build Models
## Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
## Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

In [12]:
# 7. Model Evaluation
print("\nLogistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nDecision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

print("\nConfusion Matrix (Decision Tree):\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report (Decision Tree):\n", classification_report(y_test, y_pred_dt))



Logistic Regression Accuracy: 0.5

Decision Tree Accuracy: 0.3333333333333333

Confusion Matrix (Decision Tree):
 [[2 2]
 [2 0]]

Classification Report (Decision Tree):
               precision    recall  f1-score   support

           0       0.50      0.50      0.50         4
           1       0.00      0.00      0.00         2

    accuracy                           0.33         6
   macro avg       0.25      0.25      0.25         6
weighted avg       0.33      0.33      0.33         6



In [13]:
# 8. SHAP Analysis (Explain Decision Tree)
explainer = shap.TreeExplainer(dt)
shap_values = explainer.shap_values(X_test)




In [15]:
# 9. Export processed data for Power BI
df.to_csv('processed_hr_data.csv', index=False)
print("\nProcessed data saved for Power BI.")


Processed data saved for Power BI.
