In [2]:
!pip install reportlab


Collecting reportlab
  Downloading reportlab-4.4.4-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.4-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   -------------------------------- ------- 1.6/2.0 MB 7.6 MB/s eta 0:00:01
   ---------------------------------------- 2.0/2.0 MB 6.8 MB/s eta 0:00:00
Installing collected packages: reportlab
Successfully installed reportlab-4.4.4


In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER

csv_path = "WA_Fn-UseC_-HR-Employee-Attrition.csv"   # update path if needed
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
print("Shape:", df.shape)
print("Missing per column:\n", df.isnull().sum().head(10))
print("Constant cols:", [c for c in df.columns if df[c].nunique()<=1])


Shape: (1470, 35)
Missing per column:
 Age                 0
Attrition           0
BusinessTravel      0
DailyRate           0
Department          0
DistanceFromHome    0
Education           0
EducationField      0
EmployeeCount       0
EmployeeNumber      0
dtype: int64
Constant cols: ['EmployeeCount', 'Over18', 'StandardHours']


In [5]:
# Map target
df['Attrition'] = df['Attrition'].map({'Yes':1, 'No':0})

# Drop identifier/constant columns
drops = ['EmployeeNumber','EmployeeCount','Over18','StandardHours']
df = df.drop(columns=[c for c in drops if c in df.columns], errors='ignore')

# Split numeric and categorical
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c!='Attrition']
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()

X_num = df[numeric_cols].copy()
X_cat = pd.get_dummies(df[cat_cols].astype(str), drop_first=True)
X = pd.concat([X_num, X_cat], axis=1)
y = df['Attrition'].astype(int)

print("Features:", X.shape, "Target distribution:\n", y.value_counts(normalize=True))


Features: (1470, 44) Target distribution:
 Attrition
0    0.838776
1    0.161224
Name: proportion, dtype: float64


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)
joblib.dump(clf, "hr_attrition_rf_model.pkl")


['hr_attrition_rf_model.pkl']

In [7]:
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:,1]

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc = roc_auc_score(y_test, y_proba)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)
print("ROC AUC:", roc)
print("Confusion Matrix:\n", cm)
print("\nClassification report:\n", classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.826530612244898
Precision: 0.3333333333333333
Recall: 0.0851063829787234
F1: 0.13559322033898305
ROC AUC: 0.8009303126884314
Confusion Matrix:
 [[239   8]
 [ 43   4]]

Classification report:
               precision    recall  f1-score   support

           0       0.85      0.97      0.90       247
           1       0.33      0.09      0.14        47

    accuracy                           0.83       294
   macro avg       0.59      0.53      0.52       294
weighted avg       0.77      0.83      0.78       294



In [11]:
import matplotlib.pyplot as plt
feat_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
top = feat_imp.head(12)
print(top)

# Save plots
plt.figure(figsize=(5,4))
vals = df['Attrition'].value_counts().sort_index()
plt.bar(['No','Yes'], vals.values)
plt.title("Attrition Distribution")
plt.ylabel("Count")
plt.savefig("attrition_distribution.png")
plt.close()

plt.figure(figsize=(6,4))
plt.hist(df[df['Attrition']==0]['Age'], bins=20, alpha=0.6, label='No')
plt.hist(df[df['Attrition']==1]['Age'], bins=20, alpha=0.6, label='Yes')
plt.legend()
plt.title("Age distribution by Attrition")
plt.savefig("age_attrition_hist.png")
plt.close()

plt.figure(figsize=(6,4))
plt.boxplot(
    [df[df['Attrition']==0]['MonthlyIncome'], df[df['Attrition']==1]['MonthlyIncome']], 
    tick_labels=['No','Yes']
)

plt.title("Monthly Income by Attrition")
plt.savefig("income_by_attrition_boxplot.png")
plt.close()

plt.figure(figsize=(6,4))
top.head(10)[::-1].plot(kind='barh')
plt.title("Top 10 Feature Importances")
plt.savefig("feature_importances.png")
plt.close()


MonthlyIncome           0.073386
Age                     0.066670
DailyRate               0.054236
TotalWorkingYears       0.052048
HourlyRate              0.047055
DistanceFromHome        0.046824
MonthlyRate             0.046283
YearsAtCompany          0.045568
OverTime_Yes            0.038507
NumCompaniesWorked      0.036292
YearsWithCurrManager    0.035765
PercentSalaryHike       0.034137
dtype: float64


In [12]:
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER

doc = SimpleDocTemplate("HR_Attrition_Report.pdf", pagesize=A4)
styles = getSampleStyleSheet()
title_style = ParagraphStyle(name="TitleStyle", parent=styles["Heading1"], alignment=TA_CENTER, fontSize=18)
story = []
story.append(Paragraph("IBM HR Analytics — Employee Attrition & Performance", title_style))
story.append(Paragraph("<b>Dataset</b>: Rows: {} | Columns: {}".format(df.shape[0], df.shape[1]), styles['BodyText']))
story.append(Spacer(1,6))
story.append(RLImage("attrition_distribution.png", width=450, height=300))
story.append(Spacer(1,6))
story.append(Paragraph("<b>Model Metrics (test)</b>: Accuracy {:.3f}, Precision {:.3f}, Recall {:.3f}, F1 {:.3f}, ROC AUC {:.3f}".format(acc, prec, rec, f1, roc), styles['BodyText']))
story.append(Spacer(1,6))
story.append(RLImage("income_by_attrition_boxplot.png", width=450, height=300))
story.append(Spacer(1,6))
story.append(RLImage("feature_importances.png", width=450, height=300))
doc.build(story)


In [13]:
from IPython.display import IFrame
IFrame("HR_Attrition_Report.pdf", width=800, height=600)
