
# 01 — Data Exploration (EDA): Employee Attrition

This notebook performs initial **Exploratory Data Analysis (EDA)** for the project **Employee Attrition Prediction System**.

**Goals**
- Load the IBM HR Analytics dataset
- Inspect structure, types, missing values
- Explore target balance (**Attrition**)
- Produce quick visualizations (histograms, counts)
- Save figures for the report in `../reports/figs/`

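> Paths assume this notebook lives in `notebooks/` inside your repo and is run from that folder. If your structure differs, adjust the relative paths in the setup cell (figures), the data-loading cell (CSV location), and the final cell (processed output).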


In [None]:

# --- Setup & Imports ---
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Make sure plots display in the notebook
%matplotlib inline

# Report figures go to ../reports/figs, resolved against the current working directory
_fig_dir_relative = os.path.join(os.path.pardir, 'reports', 'figs')
FIG_DIR = os.path.abspath(_fig_dir_relative)

# Create the folder up front so later savefig() calls can write into it directly
os.makedirs(FIG_DIR, exist_ok=True)
print('Figure directory:', FIG_DIR)



## 1. Load Dataset

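By default, we look for `../data/raw/WA_Fn-UseC_-HR-Employee-Attrition.csv` (Kaggle IBM HR dataset) and then fall back to a copy of the file in the current working directory.
If the file isn't found, add its location to `possible_paths` below; `CSV_PATH` is set to the first entry that exists.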


In [None]:

# --- Load dataset ---
CSV_NAME = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'

# Candidate locations, checked in order; the first existing file wins.
# (os.path.pardir is '..', so a second '..'-based entry would only repeat the first.)
possible_paths = [
    # kernel started in notebooks/ (the default layout)
    os.path.abspath(os.path.join(os.path.pardir, 'data', 'raw', CSV_NAME)),
    # kernel started in the repository root
    os.path.abspath(os.path.join('data', 'raw', CSV_NAME)),
    # CSV dropped directly into the working directory
    os.path.abspath(CSV_NAME),
]

# isfile (rather than exists) so a directory that happens to share the name is not selected
CSV_PATH = next((p for p in possible_paths if os.path.isfile(p)), None)

if CSV_PATH is None:
    raise FileNotFoundError("Dataset not found. Place the CSV at '../data/raw/WA_Fn-UseC_-HR-Employee-Attrition.csv' and re-run.")

print('Using dataset at:', CSV_PATH)
df = pd.read_csv(CSV_PATH)
df.head()



## 2. Overview: Shape, Types, Preview


In [None]:

print('Shape:', df.shape)

print('\nData types:')
# Sort on the dtype *name* rather than on the dtype objects themselves:
# dtype objects of different families (numpy vs. pandas extension dtypes)
# are not reliably orderable, while their string names always are.
print(df.dtypes.astype(str).sort_values())

print('\nSample rows:')
display(df.head(10))



## 3. Missing Values & Basic Stats


In [None]:

# Missing values per column (only columns that actually contain gaps are listed)
na_counts = df.isna().sum().sort_values(ascending=False)
na_present = na_counts[na_counts > 0]
if na_present.empty:
    # Say so explicitly instead of rendering an empty Series
    print('No missing values found.')
else:
    display(na_present)

# Numeric summary
display(df.describe())

# Categorical summary
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
if categorical_cols:
    display(df[categorical_cols].describe(include='object'))
else:
    # describe() raises on a frame with no columns, so skip it when nothing was selected
    print('No object-typed (categorical) columns to summarize.')



## 4. Target Distribution (Attrition)


In [None]:

target_col = 'Attrition'
if target_col not in df.columns:
    raise KeyError(f"Expected target column '{target_col}' not found.")

# Class frequencies, ordered by class label
counts = df[target_col].value_counts().sort_index()
print(counts)

# Bar plot for class balance
fig, ax = plt.subplots()
class_labels = counts.index.astype(str)
ax.bar(class_labels, counts.values)
ax.set(title='Attrition Class Distribution', xlabel='Attrition', ylabel='Count')

# Write each count just above its bar (gap = 1% of the tallest bar)
for bar_pos, n_rows in enumerate(counts.values):
    ax.text(bar_pos, n_rows + counts.max() * 0.01, str(n_rows), ha='center', va='bottom')

fig.tight_layout()
fig_path = os.path.join(FIG_DIR, 'attrition_class_distribution.png')
fig.savefig(fig_path, dpi=200)
print('Saved plot ->', fig_path)
plt.show()



## 5. Histograms for Numerical Features


In [None]:

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# One 30-bin histogram per numeric column, written to FIG_DIR.
# Figures are closed right after saving, so nothing is rendered inline here.
for col in numeric_cols:
    observed = df[col].dropna()

    fig, ax = plt.subplots()
    ax.hist(observed, bins=30)
    ax.set(title=f'Distribution: {col}', xlabel=col, ylabel='Frequency')
    fig.tight_layout()

    hist_path = os.path.join(FIG_DIR, f'hist_{col}.png')
    fig.savefig(hist_path, dpi=200)
    plt.close(fig)

print(f"Saved {len(numeric_cols)} histogram(s) to:", FIG_DIR)



## 6. Correlation Heatmap (Numeric Features)


In [None]:

# Columns with a single distinct value have zero variance, so their correlation
# with anything is undefined (NaN) and would show up as blank rows/columns.
varying_cols = [c for c in numeric_cols if df[c].nunique() > 1]
constant_cols = [c for c in numeric_cols if c not in varying_cols]
if constant_cols:
    print('Skipping constant column(s):', constant_cols)

if len(varying_cols) > 1:
    # Every selected column is numeric already, so no numeric_only filter is needed
    corr = df[varying_cols].corr()
    fig, ax = plt.subplots(figsize=(10,8))
    # Diverging colormap pinned to [-1, 1]: zero correlation sits at the neutral
    # midpoint and colours stay comparable regardless of the data's actual range.
    cax = ax.imshow(corr.values, interpolation='nearest', aspect='auto',
                    cmap='coolwarm', vmin=-1, vmax=1)
    ax.set_title('Correlation Heatmap (numeric features)')
    # Take tick positions and labels from the matrix itself so they always line up
    ax.set_xticks(range(len(corr.columns)))
    ax.set_yticks(range(len(corr.index)))
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticklabels(corr.index)
    fig.colorbar(cax, label='Pearson correlation')
    fig.tight_layout()
    out = os.path.join(FIG_DIR, 'correlation_heatmap.png')
    fig.savefig(out, dpi=200)
    print('Saved plot ->', out)
    plt.show()
else:
    print("Not enough numeric columns for a correlation heatmap.")



## 7. Categorical Features vs Attrition (Top Categories)

For key categorical variables, show counts and attrition rate per category.


In [None]:

def bar_counts(series, title, xlabel, fname):
    """Save a bar chart of the value frequencies in `series`.

    Parameters
    ----------
    series : pandas.Series
        Values to count with `value_counts()`.
    title, xlabel : str
        Plot title and x-axis label (the y-axis label is always 'Count').
    fname : str
        File name of the image, created inside `FIG_DIR`.

    Returns
    -------
    str
        Path of the saved image. The figure is closed after saving, so it is
        not displayed.
    """
    freq = series.value_counts()
    categories = freq.index.astype(str)

    fig, ax = plt.subplots()
    ax.bar(categories, freq.values)
    ax.set(title=title, xlabel=xlabel, ylabel='Count')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()

    image_path = os.path.join(FIG_DIR, fname)
    fig.savefig(image_path, dpi=200)
    plt.close(fig)
    return image_path

def attrition_rate_by_cat(df, cat_col, target='Attrition'):
    """Share of rows whose `target` equals 'Yes', per category of `cat_col`.

    Returns a Series indexed by the categories of `cat_col`, sorted from the
    highest rate to the lowest. Any target value other than the literal 'Yes'
    counts as a non-leaver.
    """
    is_yes = df[target] == 'Yes'
    rate_per_category = is_yes.groupby(df[cat_col]).mean()
    return rate_per_category.sort_values(ascending=False)

cat_candidates = ['Department', 'JobRole', 'BusinessTravel', 'MaritalStatus', 'OverTime', 'EducationField']

# For every candidate column present in the data: save a count chart, then
# tabulate and plot the attrition rate per category.
for col in cat_candidates:
    if col not in df.columns:
        continue

    counts_path = bar_counts(df[col], f'{col} — Counts', col, f'counts_{col}.png')
    print('Saved:', counts_path)

    rate = attrition_rate_by_cat(df, col, target=target_col)
    display(rate.to_frame('AttritionRate'))

    # Plot rates on a fixed 0-1 axis so the charts are comparable across columns
    fig, ax = plt.subplots(figsize=(8,4))
    ax.bar(rate.index.astype(str), rate.values)
    ax.set(
        title=f'{col} — Attrition Rate',
        xlabel=col,
        ylabel='Attrition Rate (Yes)',
        ylim=(0, 1),
    )
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()

    rate_path = os.path.join(FIG_DIR, f'attrition_rate_{col}.png')
    fig.savefig(rate_path, dpi=200)
    print('Saved:', rate_path)
    plt.show()



## 8. Save a Cleaned Copy (Optional)

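If you perform simple cleaning (e.g., drop columns, fix types), you can save a processed version for modeling. As written, the cell below applies no cleaning and writes the loaded data unchanged to `../data/processed/hr_attrition_processed.csv`.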


In [None]:

# No cleaning is applied yet; any cleaning steps belong above the export below.
processed_dir = os.path.abspath(os.path.join(os.path.pardir, 'data', 'processed'))
processed_path = os.path.join(processed_dir, 'hr_attrition_processed.csv')

# Make sure the target folder exists, then write the frame without its index
os.makedirs(processed_dir, exist_ok=True)
df.to_csv(processed_path, index=False)
print('Saved processed copy ->', processed_path)
