
# 🏢 Predicting Employee Attrition: A Decision-Support Tool for HR Strategy

This project uses HR data to identify patterns in employee attrition and predict who is at risk of leaving. It aims to help HR departments proactively improve retention using data-driven insights.


## 📂 Load Dataset

In [None]:

import warnings
from pathlib import Path
from urllib.request import urlretrieve

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# Fetch the dataset once and reuse the local copy on later runs.
# Plain Python replaces the Colab-only `!wget` shell escape, so this cell also
# works in local Jupyter or as a script, and re-running it no longer leaves
# duplicate downloads (HR_capstone_dataset.csv.1, ...) behind.
DATA_URL = (
    "https://raw.githubusercontent.com/Rafsun-Chowdhury/"
    "Salifort-Motors-Employee-Retention-Project-/main/HR_capstone_dataset.csv"
)
DATA_PATH = Path("HR_capstone_dataset.csv")

if not DATA_PATH.exists():
    urlretrieve(DATA_URL, DATA_PATH)

df = pd.read_csv(DATA_PATH)
df.head()


## 🧹 Clean and Prepare the Data

In [None]:

# Normalise the column names (consistent lower-case, fix the "montly" typo,
# shorter name for time at the company), then discard exact duplicate rows.
df = df.rename(
    columns={
        'Work_accident': 'work_accident',
        'average_montly_hours': 'average_monthly_hours',
        'time_spend_company': 'tenure',
        'Department': 'department',
    }
).drop_duplicates()

# Salary becomes an ordinal code for modelling; a capitalised label is kept
# alongside it for readable chart axes (it is dropped again before training).
salary_codes = {'low': 0, 'medium': 1, 'high': 2}
salary_names = {0: 'Low', 1: 'Medium', 2: 'High'}
df['salary'] = df['salary'].map(salary_codes)
df['salary_label'] = df['salary'].map(salary_names)

# One-hot encode the department. drop_first=True omits the first category,
# which therefore acts as the implicit baseline (all department_* flags 0).
df = pd.get_dummies(df, columns=['department'], drop_first=True)

# Report the one-hot column names that actually exist after encoding.
department_columns = df.columns[df.columns.str.startswith("department_")].tolist()
print("Department features used in training:", department_columns)


## 📊 Visual Insights

In [None]:

# Chart 1: head-count per salary level, split by the `left` flag.
plt.figure(figsize=(8,5))
sns.countplot(data=df, x='salary_label', hue='left')
plt.title("Attrition by Salary Level")
plt.xlabel("Salary Level")
plt.ylabel("Employee Count")
# NOTE(review): the labels are assigned by position, so this assumes the hue
# order is 0 (stayed) followed by 1 (left) -- confirm against the rendered chart.
plt.legend(title="Left Company", labels=["Stayed", "Left"])
plt.grid(True)
plt.show()

# Chart 2: the same comparison as proportions. Within each salary level,
# value_counts(normalize=True) gives the share of each `left` value, and
# unstack() pivots those values into columns so they can be stacked.
# NOTE(review): the x-axis order here follows the groupby's sorted labels,
# which may differ from the category order in the count plot above.
prop_df = df.groupby('salary_label')['left'].value_counts(normalize=True).unstack()
# DataFrame.plot opens its own figure, so no plt.figure() call is needed here.
prop_df.plot(kind='bar', stacked=True, figsize=(8,5), colormap='coolwarm')
plt.title("Proportion of Employees Who Left by Salary Level")
plt.ylabel("Proportion")
plt.xlabel("Salary Level")
plt.legend(title="Left Company", labels=["Stayed", "Left"])
plt.grid(axis='y')
plt.show()

# Chart 3: distribution of satisfaction_level for each value of `left`.
plt.figure(figsize=(8,4))
sns.boxplot(data=df, x='left', y='satisfaction_level')
plt.title("Satisfaction Level by Attrition Status")
plt.grid(True)
plt.show()


## 🤖 Train the Model

In [None]:

# Split off the target. salary_label is excluded from the predictors because
# it is only a text copy of the numeric salary column.
y = df['left']
X = df.drop(['left', 'salary_label'], axis=1)

# Hold out 20% for evaluation; stratifying on y keeps the proportion of
# leavers the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# fit() returns the estimator itself, so it can be chained onto construction.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision/recall/F1, followed by the confusion matrix plot.
print(classification_report(y_test, y_pred))
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)


## 📌 Feature Importance

In [None]:

# Label each importance score from the fitted forest with its feature name.
importances = pd.Series(data=model.feature_importances_, index=X.columns)

# Ascending sort places the most important feature at the top of the
# horizontal bar chart.
importances.sort_values(ascending=True).plot.barh(
    figsize=(10, 6),
    title='Feature Importances',
)
plt.grid(True)
plt.show()


## 🧠 Predict Attrition with Just a Few Inputs

In [None]:

def simple_attrition_risk(model, satisfaction_level, monthly_hours, salary_level, department_name):
    """Print stay/leave probabilities for an employee described by a few inputs.

    Only four attributes come from the caller. The remaining model features
    are filled with fixed placeholder values (last_evaluation=0.6,
    number_project=4, tenure=3, no work accident, no recent promotion).

    Args:
        model: Fitted classifier exposing ``predict_proba``. If it also
            exposes ``feature_names_in_`` (set by scikit-learn when fitted on
            a DataFrame), the input row is aligned to exactly those columns.
        satisfaction_level: Value for the ``satisfaction_level`` feature.
        monthly_hours: Value for the ``average_monthly_hours`` feature.
        salary_level: Encoded salary (0 = low, 1 = medium, 2 = high).
        department_name: Department name, matched case-insensitively against
            the suffix of the one-hot ``department_*`` columns (for example
            ``'IT'``, ``'RandD'``, ``'sales'``).

    Raises:
        ValueError: If ``department_name`` is not a known department, or if
            the model expects a feature this helper does not provide.
    """
    feature_vector = {
        'satisfaction_level': satisfaction_level,
        'last_evaluation': 0.6,
        'number_project': 4,
        'average_monthly_hours': monthly_hours,
        'tenure': 3,
        'work_accident': 0,
        'promotion_last_5years': 0,
        'salary': salary_level,
    }

    known_departments = [
        'IT', 'RandD', 'accounting', 'hr', 'management', 'marketing',
        'product_mng', 'sales', 'support', 'technical',
    ]

    # Match the whole name, ignoring case. A suffix test against a lower-cased
    # input can never hit 'IT' or 'RandD', and lets fragments such as 'g'
    # switch on several departments at once.
    canonical_by_lower = {name.lower(): name for name in known_departments}
    selected = canonical_by_lower.get(str(department_name).strip().lower())
    if selected is None:
        raise ValueError(
            f"Unknown department {department_name!r}; "
            f"expected one of {known_departments}"
        )

    for name in known_departments:
        feature_vector[f'department_{name}'] = 1 if name == selected else 0

    input_data = pd.DataFrame([feature_vector])

    # A model fitted on one-hot data built with drop_first=True has no column
    # for the baseline department, and scikit-learn rejects rows whose columns
    # differ from those seen during fit. Keep exactly the trained columns, in
    # the trained order; the baseline is then encoded as all flags being 0.
    trained_columns = getattr(model, 'feature_names_in_', None)
    if trained_columns is not None:
        trained_columns = list(trained_columns)
        missing = [col for col in trained_columns if col not in input_data.columns]
        if missing:
            raise ValueError(f"Model expects features this helper does not provide: {missing}")
        input_data = input_data[trained_columns]

    # Assumes class order [0, 1], i.e. index 0 = stayed and index 1 = left.
    pred = model.predict_proba(input_data)[0]
    print(f"Likelihood of staying: {pred[0]*100:.1f}%, Leaving: {pred[1]*100:.1f}%")

# Example: satisfaction 0.3, 180 monthly hours, salary code 0 (low),
# technical department.
simple_attrition_risk(
    model,
    satisfaction_level=0.3,
    monthly_hours=180,
    salary_level=0,
    department_name='technical',
)



## ✅ Conclusion

This notebook demonstrates how HR teams can use data to identify potential attrition risks and take preemptive action. Visualizations and a simplified prediction tool make the insights both actionable and accessible.
