In [None]:
# 📚 Basic Libraries
import pandas as pd
import numpy as np

# 📊 Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# 🤖 Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 
from sklearn.metrics import roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
# ⚙️ Settings
pd.set_option('display.max_columns', None) # display all columns
import warnings
warnings.filterwarnings('ignore') # ignore warnings

In [None]:
dataset=pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

### Overview
Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or ‘compare average monthly income by education and attrition’. This is a fictional data set created by IBM data scientists.

## Features
Education

1 'Below College'
2 'College'
3 'Bachelor'
4 'Master'
5 'Doctor'

EnvironmentSatisfaction

1 'Low'
2 'Medium'
3 'High'
4 'Very High'

JobInvolvement

1 'Low'
2 'Medium'
3 'High'
4 'Very High'

JobSatisfaction

1 'Low'
2 'Medium'
3 'High'
4 'Very High'

PerformanceRating

1 'Low'
2 'Good'
3 'Excellent'
4 'Outstanding'

RelationshipSatisfaction

1 'Low'
2 'Medium'
3 'High'
4 'Very High'

WorkLifeBalance

1 'Bad'
2 'Good'
3 'Better'
4 'Best'

In [None]:
dataset.head()

In [None]:
dataset.shape

In [None]:
dataset.info()

In [None]:
dataset.isna().sum()

In [None]:
dataset.duplicated().value_counts()

In [None]:
dataset.columns = dataset.columns.str.lower().str.replace(" ", "_")

In [None]:
df=dataset.copy()

In [None]:
df.eq(" ").sum()

In [None]:
df.nunique()

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
df.businesstravel.value_counts()

In [None]:
df.attrition.value_counts()

In [None]:
df.describe().T

<h2 style="color: #FF6347;">Moving our target to the right</h2>

In [None]:
target = df.pop('attrition')

In [None]:
df['attrition'] = target

In [None]:
df.head(3)

<h3 style="color: #FF6347;">Checking our target distribution</h3>

In [None]:
# check distribution for target variable
sns.countplot(x ='attrition', data = df)
plt.savefig('attrition.png')

In [None]:
df.dtypes

In [None]:
df.overtime.value_counts()

In [None]:
df.over18.value_counts()

## Breakdown of Distance from Home by Job Role and Attrition

In [None]:
# Distance from home per job role, split by the attrition outcome
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, x='jobrole', y='distancefromhome', hue='attrition', ax=ax)
ax.set_title('Breakdown of Distance from Home by Job Role and Attrition')
ax.tick_params(axis='x', labelrotation=90)  # job-role labels are long; draw them vertically
plt.show()


"""Education

1 'Below College'
2 'College'
3 'Bachelor'
4 'Master'
5 'Doctor' """

### compare average monthly income by education and attrition

In [None]:
# Mean monthly income for every (education level, attrition) combination
income_by_group = df.groupby(['education', 'attrition'])['monthlyincome'].mean()
grouped = income_by_group.reset_index()

# One bar per education level and attrition value
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=grouped, x='education', y='monthlyincome', hue='attrition', ax=ax)
ax.set_title('Average Monthly Income by Education and Attrition')
ax.set_xlabel('Education Level')
ax.set_ylabel('Average Monthly Income')
ax.legend(title='Attrition')
plt.show()

#Targeted Retention Strategies: For education levels where attrition 'Yes' has significantly lower incomes than attrition 'No', consider strategies such as salary adjustments to improve retention.

In [None]:
# For education level 5 ('Doctor'): if an education level shows both a high average income and high attrition, those employees may be leaving for better pay elsewhere.

### Job Satisfaction by Attrition

In [None]:
# Distribution of the job-satisfaction score for each attrition value
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='attrition', y='jobsatisfaction', ax=ax)
ax.set_title('Job Satisfaction by Attrition')
plt.show()


In [None]:
### Job Satisfaction: Lower job satisfaction might correlate with higher attrition.

### Monthly Income by Job Role and Attrition

In [None]:
# Monthly income per job role, split by the attrition outcome
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, x='jobrole', y='monthlyincome', hue='attrition', ax=ax)
ax.set_title('Monthly Income by Job Role and Attrition')
ax.tick_params(axis='x', labelrotation=90)  # job-role labels are long; draw them vertically
plt.show()


### Attrition Rate by Department

In [None]:
# Number of employees per department, split by the attrition outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='department', hue='attrition', ax=ax)
ax.set_title('Attrition Rate by Department')
plt.show()

<h2 style="color: #008080;">Selecting numerical</h2>

In [None]:
num=df.select_dtypes("number")

<h2 style="color: #008080;">Checking Distributions</h2>

In [None]:
color = '#0072B2'

# Grid size: keep 4 columns and derive the number of rows from the data.
# A hard-coded 5 x 4 grid only has room for 20 panels, so every numeric
# column beyond the 20th was silently left out of the figure.
ncols = 4
nrows = max(1, -(-len(num.columns) // ncols))  # ceiling division, at least one row

# 3.2 inches per row keeps the panel height of the former 5-row, 16-inch figure
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 3.2 * nrows), squeeze=False)

axes = axes.flatten()

# Plot one histogram per numerical feature
for i, ax in enumerate(axes):
    if i >= len(num.columns):
        ax.set_visible(False)  # hide unused panels in the last row
        continue
    ax.hist(num.iloc[:, i], bins=30, color=color, edgecolor='black')
    ax.set_title(num.columns[i])

plt.tight_layout()
plt.show()

In [None]:
num.jobsatisfaction.skew().round(2)  # jobsatisfaction

In [None]:
num.relationshipsatisfaction.skew().round(2)  #relationshipsatisfaction

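- Skewness = 0: No skew. The left and right tails balance out, as in a symmetric distribution such as the normal distribution. (Zero skewness alone does not mean the data are normally distributed.)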
- Skewness > 0: The distribution is positively skewed (or right-skewed), meaning there is a longer tail on the right side of the distribution. The mass of the distribution is concentrated on the left of the figure.
- Skewness < 0: The distribution is negatively skewed (or left-skewed), meaning there is a longer tail on the left side of the distribution. The mass of the distribution is concentrated on the right of the figure.

In [None]:
color = '#0072B2'

# Grid size: keep 4 columns and derive the number of rows from the data.
# A hard-coded 5 x 4 grid only has room for 20 panels, so every numeric
# column beyond the 20th was silently left out of the figure.
ncols = 4
nrows = max(1, -(-len(num.columns) // ncols))  # ceiling division, at least one row

# 3.2 inches per row keeps the panel height of the former 5-row, 16-inch figure
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 3.2 * nrows), squeeze=False)

axes = axes.flatten()

# One horizontal box plot per numerical feature
for i, ax in enumerate(axes):
    if i >= len(num.columns):
        ax.set_visible(False)  # hide unused panels in the last row
        continue
    ax.boxplot(num.iloc[:, i].dropna(), vert=False, patch_artist=True,
               boxprops=dict(facecolor=color, color='black'),
               medianprops=dict(color='yellow'), whiskerprops=dict(color='black'),
               capprops=dict(color='black'),
               # Colour the outlier markers explicitly: `color` alone styles the
               # connecting line, which fliers do not draw, so it left them uncoloured.
               flierprops=dict(marker='o', markerfacecolor='red', markeredgecolor='red', markersize=5))
    ax.set_title(num.columns[i], fontsize=10)
    ax.tick_params(axis='x', labelsize=8)

plt.tight_layout()
plt.show()

If we dig into numerical data we will find out some interesting information.

- Column 'EmployeeCount' is 1 for every row, which indicates that each observation corresponds to exactly one employee. It carries no further information, so we will drop it.
- Column 'StandardHours' is 80 for every row, which means everyone in this dataset works as a full-time employee, so we can drop it as well.
- Column 'Over18' tells us that every employee in this dataset is over 18, so it is constant too and we will drop it as well.

In [None]:
# drop out features that give out useless information
df = df.drop(columns = ['employeenumber', 'employeecount', 'standardhours', 'over18'])
df.head()

In [None]:
# visualization for numerical features
fig, axss = plt.subplots(3,4, figsize=[15,10])
sns.boxplot(x='attrition', y ='dailyrate', data=df, ax=axss[0][0],palette="Blues")
sns.boxplot(x='attrition', y ='age', data=df, ax=axss[0][1],palette="Blues")
sns.boxplot(x='attrition', y ='distancefromhome', data=df, ax=axss[0][2],palette="Blues")
sns.boxplot(x='attrition', y ='hourlyrate', data=df, ax=axss[0][3],palette="Blues")
sns.boxplot(x='attrition', y ='monthlyincome', data=df, ax=axss[1][0],palette="Blues")
sns.boxplot(x='attrition', y ='monthlyrate', data=df, ax=axss[1][1],palette="Blues")
sns.boxplot(x='attrition', y ='numcompaniesworked', data=df, ax=axss[1][2],palette="Blues")
sns.boxplot(x='attrition', y ='totalworkingyears', data=df, ax=axss[1][3],palette="Blues")
sns.boxplot(x='attrition', y ='yearsatcompany', data=df, ax=axss[2][0],palette="Blues")
sns.boxplot(x='attrition', y ='yearsincurrentrole', data=df, ax=axss[2][1],palette="Blues")
sns.boxplot(x='attrition', y ='yearssincelastpromotion', data=df, ax=axss[2][2],palette="Blues")
sns.boxplot(x='attrition', y ='yearswithcurrmanager', data=df, ax=axss[2][3],palette="Blues")
plt.tight_layout()
plt.savefig('numerical_dist.png');

In [None]:
# visualization for non numerical features
fig,axss = plt.subplots(2,4, figsize=[15,10])
sns.countplot(x='attrition', hue='businesstravel', data=df, ax=axss[0][0])
sns.countplot(x='attrition', hue='department', data=df, ax=axss[0][1])
sns.countplot(x='attrition', hue='gender', data=df, ax=axss[0][2])
sns.countplot(x='attrition', hue='jobrole', data=df, ax=axss[0][3])
sns.countplot(x='attrition', hue='educationfield', data=df, ax=axss[1][0])
sns.countplot(x='attrition', hue='maritalstatus', data=df, ax=axss[1][1])
sns.countplot(x='attrition', hue='overtime', data=df, ax=axss[1][2])
plt.tight_layout()
plt.savefig('cate_dist.png');

<h2 style="color: #008080;">2. Feature engineering</h2>

For feature engineering, we check the correlations between features and transform the non-numerical features into numerical ones (for example by encoding) so that they can be fed to the model.

### Label Encoding:

In [None]:
# Encode the two Yes/No columns as integers (No -> 0, Yes -> 1)
yes_no_to_int = {'No':0,'Yes':1}
for column in ('attrition', 'overtime'):
    df[column] = df[column].map(yes_no_to_int)

### Looking for Correlations

In [None]:
# check correlation between numerical features and target variable
corr_score = df[['age', 'dailyrate', 'distancefromhome', 'education',
        'environmentsatisfaction', 'hourlyrate', 'jobinvolvement', 'joblevel',
        'jobsatisfaction', 'monthlyincome', 'monthlyrate', 'numcompaniesworked',
        'percentsalaryhike', 'performancerating', 'relationshipsatisfaction',
        'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
        'worklifebalance', 'yearsatcompany', 'yearsincurrentrole',
        'yearssincelastpromotion', 'yearswithcurrmanager', 'attrition']].corr()
corr_score

In [None]:
# visualization of correlation relationships
plt.figure(figsize=(15, 10))
mask = np.triu(corr_score)
sns.heatmap(corr_score,cmap="Oranges",annot = True, fmt = '.2f',mask = mask,cbar_kws={"shrink": .5})
plt.tight_layout()
plt.savefig('corr.png');

In [None]:
# Pearson correlation of every numeric feature with the encoded target.
# Use the numeric columns of the *current* df rather than the earlier `num`
# snapshot: `num` was taken before the uninformative columns were dropped and
# before 'overtime' was encoded, so it still contained 'employeenumber' and
# missed 'overtime'.
numeric_features = df.select_dtypes("number").drop(columns="attrition")
pearson_correlation = numeric_features.corrwith(df['attrition'])

# Two-column frame (Feature, Correlation); constant columns give NaN and are removed
correlation_df = (
    pearson_correlation
    .rename_axis('Feature')
    .reset_index(name='Correlation')
    .dropna()
)

# Sort by absolute value of correlation, strongest first
correlation_df['AbsCorrelation'] = correlation_df['Correlation'].abs()
correlation_df = correlation_df.sort_values(by='AbsCorrelation', ascending=False)

# Plot
plt.figure(figsize=(12, 8))
plt.barh(correlation_df['Feature'], correlation_df['Correlation'], color='skyblue')
plt.xlabel('Correlation with Attrition')
plt.title('Feature Correlations with Employee Attrition')
plt.gca().invert_yaxis()  # Invert y-axis to have the highest correlation on top
plt.tight_layout()
plt.savefig('correlation_bar_plot.png')
plt.show()


In [None]:
df.head(3)

 ### one-hot encoding

In [None]:
# Apply one-hot encoding to other categorical columns
one_hot_enc_columns = ['businesstravel', 'department', 'educationfield', 'gender', 'jobrole', 'maritalstatus']
df = pd.get_dummies(df, columns=one_hot_enc_columns, drop_first=True)

In [None]:
df= df*1

In [None]:
df

In [None]:
df.dtypes

In [None]:
# Calculate the total number of employees
total_employees = df.shape[0]

# Calculate the number of employees who left
employees_left = df[df['attrition'] == 'Yes'].shape[0]

# Calculate the attrition rate
attrition_rate = (employees_left / total_employees) * 100
print(f"Attrition Rate: {attrition_rate:.2f}%")


In [None]:
# Count the number of attrition cases
attrition_counts = df['attrition'].value_counts()

# Plot the bar chart
plt.figure(figsize=(8, 6))
sns.barplot(x=attrition_counts.index, y=attrition_counts.values, palette='viridis')
plt.title('Attrition Count')
plt.xlabel('Attrition')
plt.ylabel('Number of Employees')
plt.show()

In [None]:
num.monthlyincome.skew().round(2)

In [None]:
df.head()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Candidate numerical features for standardization
numerical_features = [
    'age', 'dailyrate', 'distancefromhome', 'education', 'employeenumber', 
    'environmentsatisfaction', 'hourlyrate', 'jobinvolvement', 'joblevel', 
    'jobsatisfaction', 'monthlyincome', 'monthlyrate', 'numcompaniesworked', 
    'percentsalaryhike', 'performancerating', 'relationshipsatisfaction', 
    'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear', 
    'worklifebalance', 'yearsatcompany', 'yearsincurrentrole', 
    'yearssincelastpromotion', 'yearswithcurrmanager'
]

# Keep only the candidates that still exist in df: columns removed earlier
# (e.g. 'employeenumber') are filtered out here instead of raising a KeyError
numerical_features = [feature for feature in numerical_features if feature in df.columns]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical features in place.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split that follows, so test-set statistics leak into the training features.
# Consider fitting on the training split only.
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Check the result
df.head()

<h2 style="color: #008080;">3. Model training and performance evaluation</h2>

In [None]:
# Select features and target
X = df.drop(columns=['attrition'])
y = df['attrition']

In [None]:
# The target is imbalanced, so stratify on it: train and test then keep the
# same share of attrition cases instead of leaving that to chance.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

class_report = classification_report(y_test, y_pred)
#cm = confusion_matrix(y_test, y_pred)
#print("confusion matrix",cm)
print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)

In [None]:
print("Test data accuracy: ",model.score(X_test,y_test))
print("Train data accuracy: ", model.score(X_train, y_train))

In [None]:
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Draw the confusion matrix on an axes we create ourselves. Calling
# plt.figure() and then disp.plot() without `ax` left a blank extra figure
# and the requested figsize never reached the matrix plot.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(cmap='Oranges', ax=ax)
ax.grid(True)
plt.show()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 'attrition' is a 0/1 class label, so fit a classifier. A regressor returns
# continuous scores, which accuracy_score / classification_report reject
# ("mix of binary and continuous targets").
# The name `reg` is kept because later cells read reg.predict and
# reg.feature_importances_. The seed makes the results reproducible.
reg=RandomForestClassifier(random_state=42)
reg.fit(X_train,y_train)

In [None]:
y_pred_randomF=reg.predict(X_test)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_randomF)

class_report = classification_report(y_test, y_pred_randomF)

print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)

In [None]:
from sklearn.svm import LinearSVC

In [None]:
model = LinearSVC()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(classification_report(y_test, predictions))

In [None]:
cm = confusion_matrix(y_test, predictions)

In [None]:
# Draw the confusion matrix on an axes we create ourselves. Calling
# plt.figure() and then disp.plot() without `ax` left a blank extra figure
# and the requested figsize never reached the matrix plot.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(cmap='Oranges', ax=ax)
ax.grid(True)
plt.show()

<h2 style="color: #FF6347;">XGBoost Regressor</h2>

In [None]:
from xgboost import XGBClassifier
# 'attrition' is a 0/1 class label, so fit a classifier. A regressor returns
# continuous scores, which accuracy_score / classification_report /
# confusion_matrix reject.
# The name `xgb` is kept because the following cells call xgb.predict.
xgb=XGBClassifier(random_state=42)
xgb.fit(X_train,y_train)

In [None]:
y_pred_XGB=xgb.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_XGB)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred_XGB))

In [None]:
cm = confusion_matrix(y_test, y_pred_XGB)

In [None]:
feature_names = list(X_train.columns)
importances = reg.feature_importances_
indices = np.argsort(importances)[::-1] # sorts indices of importances in descending order

In [None]:
# Bar chart of the feature importances, most important feature first
n_features = X.shape[1]
sorted_names = [feature_names[i] for i in indices]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importances")
ax.bar(range(n_features), importances[indices], align="center")
ax.set_xticks(range(n_features))
ax.set_xticklabels(sorted_names, rotation=90)
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
fig.tight_layout()
plt.show()

<h2 style="color: #FF6347;">Imbalance Data</h2>

In [None]:
# Rows per class of the target. Use the Series method (as the rest of the
# notebook does): the top-level pd.value_counts function is deprecated.
count_classes = df['attrition'].value_counts()
count_classes.plot(kind = 'bar')

<h2 style="color: #FF6347;">Oversampling/Undersampling</h2>

In [None]:
from sklearn.utils import resample

In [None]:
train = pd.concat([X_train, y_train], axis=1)

In [None]:
no_attrition = train[train['attrition']==0]
attrition = train[train['attrition']==1]

In [None]:
display(no_attrition.shape)
display(attrition.shape)

In [None]:
oversampled_attrition = resample(attrition,
                                replace=True,  # Sample with replacement
                                n_samples=len(no_attrition),  # Match number in majority class
                                random_state=42)  # Reproducible results

In [None]:
display(no_attrition.shape)
display(oversampled_attrition.shape)

In [None]:
train_oversampled = pd.concat([no_attrition,oversampled_attrition])
train_oversampled.sample(10)

In [None]:
X_train = train_oversampled.drop('attrition',axis = 1).copy()
y_train = train_oversampled['attrition'].copy()

In [None]:
# Random forest trained on the oversampled training set.
# A fixed seed makes the forest, and therefore the reported metrics, reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
len(X_test)

In [None]:
len(y_test)

<h2 style="color: #FF6347;">Model Validation</h2>

In [None]:
predictions = model.predict(X_test)

In [None]:
len(predictions)

In [None]:
print(classification_report(y_test, predictions))

<h2 style="color: #FF6347;">Oversampling with SMOTE</h2>

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
X = df.drop("attrition", axis=1)
y = df.attrition

In [None]:
# Split first, then oversample ONLY the training portion.
# Running SMOTE on the full data before splitting leaks information: synthetic
# minority rows interpolated from future test rows end up in the training set,
# and the test set itself contains synthetic rows, which inflates the scores.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100, stratify=y
)

smote = SMOTE(random_state=100)  # seeded so the synthetic samples are reproducible
X_sm, y_sm = smote.fit_resample(X_train_raw, y_train_raw)

# The following cells train on X_train / y_train and evaluate on the untouched X_test / y_test
X_train, y_train = X_sm, y_sm

In [None]:
y.value_counts()

In [None]:
y_sm.value_counts()

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest trained on the SMOTE-resampled data.
# A fixed seed makes the forest, and therefore the reported metrics, reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)

In [None]:
print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix of the current model on the test set, drawn on an axes we
# create ourselves. Calling plt.figure() and then disp.plot() without `ax`
# left a blank extra figure and the requested figsize never reached the plot.
cm = confusion_matrix(y_test, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(cmap='Oranges', ax=ax)
ax.grid(True)
plt.show()

<h2 style="color: #FF6347;">Undersampling with TomekLinks</h2>

In [None]:
from imblearn.under_sampling import TomekLinks # Undersampling librarie technique

In [None]:
X = df.drop("attrition", axis=1)
y = df.attrition

In [None]:
# Split first, then undersample ONLY the training portion.
# Resampling the full data before splitting also removes rows from what
# becomes the test set, so the model would be evaluated on a cleaned-up
# distribution rather than on untouched data.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100, stratify=y
)

tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X_train_raw, y_train_raw)

# The following cells train on X_train / y_train and evaluate on the untouched X_test / y_test
X_train, y_train = X_tl, y_tl

In [None]:
y.value_counts()

In [None]:
y_tl.value_counts()

In [None]:
# Random forest trained on the Tomek-links-undersampled data.
# A fixed seed makes the forest, and therefore the reported metrics, reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)

In [None]:
print(classification_report(y_test, predictions))