In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset

In [2]:
data = pd.read_csv("emp_satisfaction.csv")

In [6]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


# Create a label encoder for each column

In [7]:
sales_encoder = LabelEncoder()
salary_encoder = LabelEncoder()

# Encode the non-numeric columns

In [30]:
data["sales"] = sales_encoder.fit_transform(data["sales"])
data["salary"] = salary_encoder.fit_transform(data["salary"])
print(data.head())

   satisfaction_level  last_evaluation  number_project  average_montly_hours  \
0                0.38             0.53               2                   157   
1                0.80             0.86               5                   262   
2                0.11             0.88               7                   272   
3                0.72             0.87               5                   223   
4                0.37             0.52               2                   159   

   time_spend_company  Work_accident  left  promotion_last_5years  sales  \
0                   3              0     1                      0      7   
1                   6              0     1                      0      7   
2                   4              0     1                      0      7   
3                   5              0     1                      0      7   
4                   3              0     1                      0      7   

   salary  
0       1  
1       2  
2       2  
3       1  
4 

# Split the data into features and target variables

In [31]:
X = data.drop(columns=['left'])
y = data['left'].values

# Split the data into training and testing sets

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create a Random Forest classifier with 100 trees

In [33]:
rf = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit the classifier to the training data

In [34]:
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

# Make predictions on the testing data

In [35]:
predictions_test = rf.predict(X_test)

# Calculate the accuracy of the testing data

In [36]:
accuracy = accuracy_score(y_test, predictions_test)
print(classification_report(y_test, predictions_test))
print("Accuracy of testing data = ", accuracy)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2299
           1       0.99      0.98      0.98       701

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       0.99      0.99      0.99      3000

Accuracy of testing data =  0.9926666666666667


# Checking accuracy of training dataset

In [38]:
predictions_train = rf.predict(X_train)
accuracy = accuracy_score(y_train, predictions_train)
print("Accuracy of training data = ", accuracy)
print(classification_report(y_train, predictions_train))

Accuracy of training data =  0.9999166597216435
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9129
           1       1.00      1.00      1.00      2870

    accuracy                           1.00     11999
   macro avg       1.00      1.00      1.00     11999
weighted avg       1.00      1.00      1.00     11999



# Print the feature importances

In [None]:
print("Feature Importances:")
for name, importance in zip(X.columns, rf.feature_importances_):
    print(name, "=", importance)

# Visualizing the feature importances

In [None]:
plt.figure(figsize=(10, 5))
plt.bar(X.columns, rf.feature_importances_)
plt.xticks(rotation=90)
plt.xlabel("Features")
plt.ylabel("Importance")
plt.show()