# In [None]:
# Logistic Regression: Employee Retention Analysis (HR Analytics)
# ---------------------------------------------------------------
# Objective:
# Predict whether an employee will leave the company based on HR attributes.

# ---------------------------------------------------------------
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---------------------------------------------------------------
# Step 1: Load Dataset
print("===== Employee Retention Dataset =====")

# Read the HR records from the CSV file in the current working directory.
hr = pd.read_csv('HR_comma_sep.csv')
print(hr.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so call it directly; wrapping it in print() emitted a stray "None" line.
hr.info()
print(hr.describe())

# ---------------------------------------------------------------
# Step 2: Exploratory Data Analysis (EDA)
print("\n===== Exploratory Data Analysis =====")

# Count the null entries in each column
missing_per_column = hr.isnull().sum()
print("Missing values per column:\n", missing_per_column)

# Pairwise correlations of the numeric columns, drawn as an annotated heatmap
# to see which variables move together with retention
numeric_corr = hr.corr(numeric_only=True)
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap (HR Variables)')
plt.show()

# Reading of the heatmap: satisfaction_level, average_montly_hours, and
# promotion_last_5years are treated as the impactful variables

# ---------------------------------------------------------------
# Step 3: Bar Charts

# One grouped count plot per categorical column, split by the 'left' flag.
# Each entry is (column, title, x-axis label, tick rotation or None).
chart_specs = [
    # (a) Salary vs Retention
    ('salary', 'Impact of Salary on Employee Retention', 'Salary Level', None),
    # (b) Department vs Retention (tick labels rotated)
    ('Department', 'Department vs Employee Retention', 'Department', 45),
]

for column, title, x_label, tick_rotation in chart_specs:
    sns.countplot(x=column, hue='left', data=hr)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Number of Employees')
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.show()

# ---------------------------------------------------------------
# Step 4: Data Preparation for Logistic Regression

# Predictors carried into the model
selected_columns = [
    'satisfaction_level',
    'average_montly_hours',
    'promotion_last_5years',
    'salary',
]
subdf = hr[selected_columns]

# One-hot encode 'salary'; drop_first omits the first level's indicator column
salary_dummies = pd.get_dummies(subdf['salary'], drop_first=True)

# Place the remaining predictors side by side with the salary indicators
final_df = pd.concat(
    [subdf.drop(columns='salary'), salary_dummies],
    axis='columns',
)

# Attach the target column 'left'
final_df['left'] = hr['left']

# ---------------------------------------------------------------
# Step 5: Split Data into Training and Testing Sets

# Target is the 'left' column; every other column of final_df is a predictor
y = final_df['left']
X = final_df.drop(columns='left')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------------------------------
# Step 6: Build and Train Logistic Regression Model

# fit() returns the fitted estimator, so construction and training are chained
model = LogisticRegression().fit(X_train, y_train)

# ---------------------------------------------------------------
# Step 7: Model Evaluation

# Predicted labels for the held-out rows
y_pred = model.predict(X_test)

# Compute every metric first, then report them together
test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("\n===== Model Evaluation =====")
print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

# ---------------------------------------------------------------
# Step 8: Example Prediction

print("\nExample Prediction (satisfaction=0.8, hours=150, promotion=0, salary=medium):")
# get_dummies(..., drop_first=True) drops the first salary level in sorted
# order. Assuming the levels are 'high'/'low'/'medium', the model was fit on
# the indicator columns 'low' and 'medium' -- not 'medium' and 'high'.
# Describe the employee by feature name and align the row to the training
# columns so names and order match what the model was fit on; indicator
# columns not listed here (e.g. 'low') are filled with 0.
example_features = {
    'satisfaction_level': 0.8,
    'average_montly_hours': 150,
    'promotion_last_5years': 0,
    'medium': 1,  # salary = medium
}
example = pd.DataFrame([example_features]).reindex(columns=X.columns, fill_value=0)
print("Predicted (1 = will leave, 0 = will stay):", model.predict(example)[0])
