# Loading the libraries

In [110]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Loading the dataset

In [111]:
data = pd.read_csv('EmployeeSurvey.csv')


# Intitial exploration of the data

In [112]:
print("First 5 rows⬇️:\n")
print(data.head())
print("\nLast 5 rows⬇️:\n")
print(data.tail())
print("\nGeneral information about the dataset⬇️:\n")
print(data.info())
print(f"Numbers of rows and columns in the dataset: {data.shape}")
print(f"\nRows with at least one null value: {data.isnull().any(axis=1).sum()}")


First 5 rows⬇️:

   Emp ID  satisfaction_level  last_evaluation  number_project  \
0     1.0                0.38             0.53             2.0   
1     2.0                0.80             0.86             5.0   
2     3.0                0.11             0.88             7.0   
3     4.0                0.72             0.87             5.0   
4     5.0                0.37             0.52             2.0   

   average_montly_hours  time_spend_company  Work_accident  \
0                 157.0                 3.0            0.0   
1                 262.0                 6.0            0.0   
2                 272.0                 4.0            0.0   
3                 223.0                 5.0            0.0   
4                 159.0                 3.0            0.0   

   promotion_last_5years   dept  salary  
0                    0.0  sales     low  
1                    0.0  sales  medium  
2                    0.0  sales  medium  
3                    0.0  sales     low  
4  

# Improving the quality of data

###### First of all I want to drop the 'Emp ID' column since it's not meaningful for prediction; and doesn't serve well as an index (starting from 1 instead of 0)

In [113]:
data.drop("Emp ID", axis=1, inplace=True)

###### Since there are not much null values in comparison to the general size of the dataset - the best solution for the precision of the future predictions will be simply from them.

In [116]:
data = data.dropna()
print(f"\nRows with at least one null value after dropping nulls: {data.isnull().any(axis=1).sum()}")


Rows with at least one null value after dropping nulls: 0


### I'm interested to explore just the sales department, as it is one of the most numerous in the dataset, and different professions usually have different career  patterns, what will decrease the predictions accuracy significantly.

In [117]:
is_sales_department = data['dept'] == 'sales'
sales_data = data[is_sales_department]
# Getting rid of the "dept" column, since we will explore just sales department
sales_data.drop('dept', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_data.drop('dept', axis=1, inplace=True)


### Get target (dependent) variable into the separate Pandas Dataframe, dropping it in the main sales' dataset which will for the features (independent variables) 

In [118]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target_var = 'satisfaction_level'
Y = sales_data[target_var]
X = sales_data.drop([target_var], axis=1)
X['salary'] = le.fit_transform(X['salary'])

# Divide data into train & test

In [120]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=101)

#### Ensuring variables were splited correctly

In [121]:
print(f"X_train: {X_train.shape}")
print(f"Y_train: {Y_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"Y_test: {Y_test.shape}")

X_train: (3312, 7)
Y_train: (3312,)
X_test: (828, 7)
Y_test: (828,)


# Building & training the basic model

In [122]:

rf_Model = RandomForestRegressor()
rf_Model.fit(X_train,Y_train)

# Check accuracy

In [123]:
print(f"Train Accuracy: {rf_Model.score(X_train, Y_train):.3f}")
print(f"Test Accuracy: {rf_Model.score(X_test, Y_test):.3f}")


Train Accuracy: 0.932
Test Accuracy: 0.542


### The accuracy is very poor, let's improve it

In [124]:
rf_Model = RandomForestRegressor(
    n_estimators=100,          # More trees improve the prediction accuracy
    max_depth=10,              # Limit tree depth helps prevent overfitting
    min_samples_split=5,       # Let's require more samples to split a node
    min_samples_leaf=2,        # Let's require more samples in leaf nodes
    random_state=101
)

rf_Model.fit(X_train,Y_train) # Retraining the model with new params


#### Identify most important features

In [125]:
importances = rf_Model.feature_importances_
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
})
print(feature_importance.sort_values('importance', ascending=False))

                 feature  importance
1         number_project    0.529957
2   average_montly_hours    0.223745
0        last_evaluation    0.140155
3     time_spend_company    0.075089
6                 salary    0.020484
4          Work_accident    0.008739
5  promotion_last_5years    0.001831


#### Keep only the most important features

In [126]:
important_features = feature_importance[feature_importance['importance'] > 0.05]['feature'].tolist()
X = X[important_features]

In [127]:
print(f"Train Accuracy: {rf_Model.score(X_train, Y_train):.3f}")
print(f"Test Accuracy: {rf_Model.score(X_test, Y_test):.3f}")

Train Accuracy: 0.610
Test Accuracy: 0.462


### Currently the accuracy is still poor, still has to be improved