In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## import dataset

In [4]:
# Generate synthetic data
data = pd.read_csv('https://raw.githubusercontent.com/josephgitau/sept/main/Employee.csv')

data.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [5]:
# Reshape data for scikit-learn
X = data.drop('LeaveOrNot', axis=1)
y = data['LeaveOrNot']

In [6]:
# encode the categorical variables

cat_columns = data.select_dtypes(include=['object']).columns

# import ordinal encoder
from sklearn.preprocessing import OrdinalEncoder

# instantiate the encoder
encoder = OrdinalEncoder()

# fit-transform the data
X[cat_columns] = encoder.fit_transform(X[cat_columns])

# check data
X.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,0.0,2017,0.0,3,34,1.0,0.0,0
1,0.0,2013,2.0,1,28,0.0,0.0,3
2,0.0,2014,1.0,3,38,0.0,0.0,2
3,1.0,2016,0.0,3,27,1.0,0.0,5
4,1.0,2017,2.0,3,24,1.0,1.0,2


#### experiment and see if random forest can work with categorical data

In [7]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# instantiate the model
model = RandomForestClassifier()

# fit the model
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)

In [16]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.8582169709989259
Confusion Matrix:
[[566  44]
 [ 88 233]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       610
           1       0.84      0.73      0.78       321

    accuracy                           0.86       931
   macro avg       0.85      0.83      0.84       931
weighted avg       0.86      0.86      0.86       931



In [23]:
param_grid = {
    'n_estimators': [390, 420, 380, 400, 450],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 8, 9, 11, 12,  None],
    'min_samples_split': [10, 8, 9, 11, 12],
    'min_samples_leaf': [1, 2, 4, 5, 6, 7],
    'bootstrap': [True, False]
}

In [24]:
rf_random = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                               n_iter=200, cv=5, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


  warn(


In [25]:
best_random = rf_random.best_estimator_
best_random

In [26]:
best_random = rf_random.best_estimator_

# fit the best model
best_random.fit(X_train, y_train)

# predict
y_pred = best_random.predict(X_test)

  warn(


In [27]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.8700322234156821
Confusion Matrix:
[[595  15]
 [106 215]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.98      0.91       610
           1       0.93      0.67      0.78       321

    accuracy                           0.87       931
   macro avg       0.89      0.82      0.84       931
weighted avg       0.88      0.87      0.86       931

