In [1]:
#Importing libraries
import pandas as pd
import numpy as np

In [9]:
#Loading dataset
dataset = pd.read_excel("HR.xlsx")

In [10]:
#Dataset info
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


## Data Preprocessing

### Dropping null/missing value rows

In [14]:
#Check for any missing value. If found delete row
missing_values = dataset['sales'].isnull()

# Drop rows with missing values in the specified column
dataset = dataset.dropna(subset=['sales'])

### Handling outliers using IQR technique

In [15]:
# Calculate the IQR for the 'average_montly_hours' column
Q1 = dataset['average_montly_hours'].quantile(0.25)
Q3 = dataset['average_montly_hours'].quantile(0.75)
IQR = Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify and remove outliers from the DataFrame
dataset = dataset[(dataset['average_montly_hours'] >= lower_bound) & (dataset['average_montly_hours'] <= upper_bound)]

### Label encoding on salary attribute

In [22]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
le = LabelEncoder()

# Fit the encoder on the 'salary' column
le.fit(dataset['salary'])

# Transform the 'salary' column using the fitted encoder
dataset['salary_encoded'] = le.transform(dataset['salary'])

dataset.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,salary_encoded
0,0.38,0.53,2,157,3,0,1,0,sales,low,1
1,0.8,0.86,5,262,6,0,1,0,sales,medium,2
2,0.11,0.88,7,272,4,0,1,0,sales,medium,2
3,0.72,0.87,5,223,5,0,1,0,sales,low,1
4,0.37,0.52,2,159,3,0,1,0,sales,low,1


### *No need for normalisation on numerical values as no attribute is suitable for normalisation

## Applying classification algorithms

### Logistic Regression 

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load your dataset into a DataFrame
df = dataset

# Define the feature columns and target column
feature_cols = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years']
target_col = 'left'

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df[target_col], test_size=0.2, random_state=0)

# Create a LogisticRegression object
clf = LogisticRegression(max_iter=1000)

# Fit the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the classifier using accuracy score
accuracy = clf.score(X_test, y_test)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.77


## Decision Tree

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load your dataset into a DataFrame
df = dataset

# Define the feature columns and target column
feature_cols = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years']
target_col = 'left'

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df[target_col], test_size=0.2, random_state=0)

# Create a DecisionTreeClassifier object
clf = DecisionTreeClassifier()

# Fit the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the classifier using accuracy score
accuracy = clf.score(X_test, y_test)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.98


## Applying clustering algorithms

### K-Means Clustering

In [33]:
import pandas as pd
from sklearn.cluster import KMeans

# Load your dataset into a DataFrame
df = dataset

# Define the feature columns
feature_cols = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years']

# Extract the feature data from the DataFrame
X = df[feature_cols]

# Create a KMeans object with k=3 clusters
kmeans = KMeans(n_clusters=3, n_init=10)

# Fit the model on the data
kmeans.fit(X)

# Make predictions on the data
y_pred = kmeans.predict(X)

# Add the cluster labels to the DataFrame
df['cluster'] = y_pred

df.head(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,salary_encoded,cluster
0,0.38,0.53,2,157,3,0,1,0,sales,low,1,2
1,0.8,0.86,5,262,6,0,1,0,sales,medium,2,1
2,0.11,0.88,7,272,4,0,1,0,sales,medium,2,1
3,0.72,0.87,5,223,5,0,1,0,sales,low,1,0
4,0.37,0.52,2,159,3,0,1,0,sales,low,1,2
5,0.41,0.5,2,153,3,0,1,0,sales,low,1,2
6,0.1,0.77,6,247,4,0,1,0,sales,low,1,1
7,0.92,0.85,5,259,5,0,1,0,sales,low,1,1
8,0.89,1.0,5,224,5,0,1,0,sales,low,1,0
9,0.42,0.53,2,142,3,0,1,0,sales,low,1,2


### Agglomerative Clustering

In [40]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering

# Load your dataset into a DataFrame
df = dataset

# Define the feature columns
feature_cols = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years']

# Extract the feature data from the DataFrame
X = df[feature_cols]

# Create an AgglomerativeClustering object with n_clusters=3
agg = AgglomerativeClustering(n_clusters=4)

# Fit the model on the data
agg.fit(X)

# Make predictions on the data
y_pred = agg.labels_

# Add the cluster labels to the DataFrame
df['cluster'] = y_pred

df.head(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,salary_encoded,cluster
0,0.38,0.53,2,157,3,0,1,0,sales,low,1,0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,2,3
2,0.11,0.88,7,272,4,0,1,0,sales,medium,2,3
3,0.72,0.87,5,223,5,0,1,0,sales,low,1,1
4,0.37,0.52,2,159,3,0,1,0,sales,low,1,0
5,0.41,0.5,2,153,3,0,1,0,sales,low,1,0
6,0.1,0.77,6,247,4,0,1,0,sales,low,1,1
7,0.92,0.85,5,259,5,0,1,0,sales,low,1,3
8,0.89,1.0,5,224,5,0,1,0,sales,low,1,1
9,0.42,0.53,2,142,3,0,1,0,sales,low,1,0
