In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:

# Global plot styling: seaborn "whitegrid" look plus a 10x6 inch default figure size.
# The figure size is set after sns.set() because that call itself updates matplotlib rcParams.
sns.set(style="whitegrid")
plt.rc('figure', figsize=(10, 6))



In [None]:

# Read the cleaned HR attrition CSV into the working DataFrame `df`.
# NOTE(review): a "_cleaned" file is being read from data/raw/ — confirm this is the intended location.
data_path = "../data/raw/HR-Employee-Attrition_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=data_path)


In [None]:

# Preview the first five rows as a quick sanity check of the load
df.head(n=5)


In [None]:

# Basic info about dataset: prints a concise summary of the DataFrame
# (row index range, column names, non-null counts, dtypes and memory usage)
df.info()


In [None]:

# Summary statistics for numerical columns
# (describe() is called with its defaults; the resulting table is the cell's output)
df.describe()


In [None]:

# Count missing (NaN/None) entries in each column
df.isna().sum()


In [None]:

# Split the columns into categorical and numerical groups by dtype.
# - 'number' selects every numeric dtype, so columns stored as int32/float32 etc.
#   are not silently dropped the way a hard-coded ['int64', 'float64'] list drops them.
# - 'category' is selected alongside 'object' so columns already converted to the
#   pandas categorical dtype still count as categorical features.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include='number').columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)


In [None]:

# Class balance of the target variable: number of rows per Attrition value
sns.countplot(data=df, x='Attrition').set_title("Attrition Distribution")
plt.show()


In [None]:

# Histograms for numerical features
# EmployeeNumber is an identifier rather than a measurement, so it is left out
# here just as it is in the boxplots against Attrition.
hist_cols = [name for name in numerical_cols if name != 'EmployeeNumber']
df[hist_cols].hist(bins=20, figsize=(15,12), color='skyblue')
plt.suptitle("Histograms of Numerical Features")
# Keep the top strip free for the suptitle and stop the per-panel titles and
# tick labels of the grid from overlapping each other.
plt.tight_layout(rect=(0, 0, 1, 0.96))
plt.show()


In [None]:

# Boxplots for numerical features vs Attrition
# Every feature is drawn on its own explicitly created figure. Relying on the
# implicit "current axes" only works when the backend closes the figure inside
# plt.show(); with a backend that leaves it open, successive boxplots would be
# drawn on top of each other.
for col in numerical_cols:
    if col == 'EmployeeNumber':  # exclude ID column
        continue
    fig, ax = plt.subplots()
    sns.boxplot(x='Attrition', y=col, data=df, ax=ax)
    ax.set_title(f'{col} vs Attrition')
    plt.show()
    # One figure per column adds up quickly; release it once it has been shown.
    plt.close(fig)


In [None]:

# Countplots for categorical features vs Attrition
# As with the boxplots, each feature gets its own explicit figure so the result
# does not depend on the backend closing the previous figure in plt.show().
for col in categorical_cols:
    if col == 'Attrition':  # the target is used as the hue, not plotted against itself
        continue
    fig, ax = plt.subplots()
    sns.countplot(x=col, hue='Attrition', data=df, ax=ax)
    ax.set_title(f'{col} vs Attrition')
    # Tilt the category labels so long names do not overlap.
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()
    plt.close(fig)


In [None]:

# Correlation heatmap for numerical features
# `corr` keeps the full pairwise correlation matrix of all numerical columns.
corr = df[numerical_cols].corr()
# EmployeeNumber is an identifier (the boxplot cell skips it for that reason), so
# it is hidden from the picture only; errors='ignore' keeps this a no-op when the
# column is not present.
corr_to_plot = corr.drop(index='EmployeeNumber', columns='EmployeeNumber', errors='ignore')
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(corr_to_plot, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()
