In [3]:
import pandas as pd
# Load the employee attrition dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("employee_attrition.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,Age,Department,Years_At_Company,Salary,Overtime,JobSatisfaction,Attrition
0,29,Sales,12,38,No,3,No
1,23,Finance,12,98,No,1,Yes
2,39,Tech,19,46,No,1,No
3,37,Finance,14,46,No,5,Yes
4,36,Tech,16,90,No,4,Yes
5,30,HR,8,100,Yes,5,No
6,28,Tech,17,51,Yes,5,No
7,27,Tech,0,63,Yes,2,Yes
8,49,HR,3,97,No,1,No
9,24,Tech,17,84,No,5,No


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(50, 7)

In [5]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Age,Department,Years_At_Company,Salary,Overtime,JobSatisfaction,Attrition
0,29,Sales,12,38,No,3,No
1,23,Finance,12,98,No,1,Yes
2,39,Tech,19,46,No,1,No
3,37,Finance,14,46,No,5,Yes
4,36,Tech,16,90,No,4,Yes


In [6]:
# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               50 non-null     int64 
 1   Department        50 non-null     object
 2   Years_At_Company  50 non-null     int64 
 3   Salary            50 non-null     int64 
 4   Overtime          50 non-null     object
 5   JobSatisfaction   50 non-null     int64 
 6   Attrition         50 non-null     object
dtypes: int64(4), object(3)
memory usage: 2.9+ KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Years_At_Company,Salary,JobSatisfaction
count,50.0,50.0,50.0,50.0
mean,35.22,10.2,65.34,3.08
std,8.78842,6.38557,22.293598,1.626816
min,22.0,0.0,30.0,1.0
25%,27.25,5.0,46.0,1.25
50%,35.5,10.0,60.5,3.0
75%,42.25,16.0,86.75,5.0
max,54.0,20.0,100.0,5.0


In [8]:
# Data-quality check: null count per column, then the number of fully duplicated rows.
print("Missing Values:", df.isna().sum(), sep="\n")
duplicate_row_count = df.duplicated().sum()
print("\n Number of Duplicate Rows:", duplicate_row_count)

Missing Values:
Age                 0
Department          0
Years_At_Company    0
Salary              0
Overtime            0
JobSatisfaction     0
Attrition           0
dtype: int64

 Number of Duplicate Rows: 0


In [10]:
from sklearn.preprocessing import LabelEncoder

# Names of the string-typed (object) columns; each one is integer-encoded below.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# One fitted encoder per column, so every code -> label mapping stays recoverable through
# label_encoders[col].classes_ or label_encoders[col].inverse_transform(...). Re-fitting a
# single shared encoder would leave only the last column's mapping available.
label_encoders = {}

for col in categorical_cols:
    lab = LabelEncoder()
    # NOTE: this overwrites df[col] in place. Once encoded, the column is no longer object
    # dtype, so re-running this cell finds nothing to encode and resets label_encoders;
    # reload the CSV before re-running.
    df[col] = lab.fit_transform(df[col])
    label_encoders[col] = lab

In [11]:
# Preview the frame again; the preceding cell replaced the string columns with integer codes.
df.head()

Unnamed: 0,Age,Department,Years_At_Company,Salary,Overtime,JobSatisfaction,Attrition
0,29,2,12,38,0,3,0
1,23,0,12,98,0,1,1
2,39,3,19,46,0,1,0
3,37,0,14,46,0,5,1
4,36,3,16,90,0,4,1


In [12]:
# Per-column dtypes of the frame at this point in the notebook.
df.dtypes

Age                 int64
Department          int64
Years_At_Company    int64
Salary              int64
Overtime            int64
JobSatisfaction     int64
Attrition           int64
dtype: object

In [13]:
# Target vector: the Attrition column.
y = df['Attrition']

# Feature matrix: remove the target plus Age, Years_At_Company and Salary, then one-hot
# encode Department and Overtime. drop_first=True omits the first level of each encoded
# column, so that level is represented by all of its indicator columns being False.
# NOTE(review): Age, Years_At_Company and Salary are excluded from the features and the
# reason is not recorded in this cell -- confirm this is intentional.
X = pd.get_dummies(
    df.drop(columns=['Attrition', 'Age', 'Years_At_Company', 'Salary']),
    columns=['Department', 'Overtime'],
    drop_first=True,
)

# Show the first rows of the features, then of the target.
display(X.head(), y.head())

Unnamed: 0,JobSatisfaction,Department_1,Department_2,Department_3,Overtime_1
0,3,False,True,False,False
1,1,False,False,False,False
2,1,False,False,True,False
3,5,False,False,False,False
4,4,False,False,True,False


0    0
1    1
2    0
3    1
4    1
Name: Attrition, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each piece of the split, in the order train/test for X then y.
split_shapes = {
    "X_train shape:": X_train.shape,
    "X_test shape:": X_test.shape,
    "y_train shape:": y_train.shape,
    "y_test shape:": y_test.shape,
}
for caption, shape in split_shapes.items():
    print(caption, shape)

X_train shape: (40, 5)
X_test shape: (10, 5)
y_train shape: (40,)
y_test shape: (10,)


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a 100-tree random forest with a fixed seed. fit() returns the estimator
# itself, so construction and training are chained into one expression.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Bare last expression: shows the fitted estimator as this cell's output.
model

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the baseline forest on the held-out test split.
y_pred = model.predict(X_test)

# Precision, recall and F1 are averaged over classes, weighted by each class's support.
# zero_division=0 scores an undefined per-class metric (e.g. a class that is never
# predicted) as 0 without emitting a warning. This is the same setting the hyperparameter
# sweep in this notebook uses, so the baseline numbers are directly comparable with it.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision:  {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.4000
Precision:  0.3810
Recall: 0.4000
F1-score: 0.3750


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hyperparameter sweep: fit one forest per (n_estimators, max_depth) pair and record its
# support-weighted metrics on the test split.
# NOTE(review): every configuration is scored on X_test / y_test, so choosing a
# configuration from this table means tuning on the test split.
# NOTE: the loop rebinds model, y_pred and the metric variables; after this cell they
# refer to the last configuration in the grid.
n_estimators_values = [50, 100, 150]
max_depth_values = [None, 5, 10]
results = []

# Flattened grid with n_estimators varying slowest, i.e. the order of a nested loop with
# n_estimators on the outside.
for n_estimators, max_depth in [
    (n, d) for n in n_estimators_values for d in max_depth_values
]:
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
    ).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # One record per configuration; key order fixes the column order of the table.
    results.append({
        'max_depth': max_depth,
        'n_estimators': n_estimators,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
    })

# Build the comparison table once from the collected records and render it.
results_df = pd.DataFrame(results)
display(results_df)


Unnamed: 0,max_depth,n_estimators,accuracy,precision,recall,f1_score
0,,50,0.4,0.380952,0.4,0.375
1,5.0,50,0.5,0.5,0.5,0.494949
2,10.0,50,0.4,0.380952,0.4,0.375
3,,100,0.4,0.380952,0.4,0.375
4,5.0,100,0.4,0.380952,0.4,0.375
5,10.0,100,0.4,0.380952,0.4,0.375
6,,150,0.4,0.380952,0.4,0.375
7,5.0,150,0.3,0.291667,0.3,0.292929
8,10.0,150,0.4,0.380952,0.4,0.375
