In [25]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


In [26]:
# Location of the employee-attrition dataset (CSV), relative to the notebook.
file_path = 'Employee Attrition.csv'

# Load the whole file into a DataFrame; later cells clean and split `data`.
data = pd.read_csv(filepath_or_buffer=file_path)


In [27]:
# Keep only fully populated rows.
# A plain `dropna()` removes every row that has a missing value in any column,
# which already covers rows lacking the target `satisfaction_level`, so a
# separate `subset=` pass is unnecessary. Reassigning (instead of
# `inplace=True`) keeps the cell free of in-place mutation.
data = data.dropna()


In [28]:
# Two-stage hold-out split with a fixed seed:
#   1) 80% train / 20% test
#   2) the 80% train portion is split again into 80% train / 20% validation
split_kwargs = dict(train_size=0.8, random_state=1)
data_train, data_test = train_test_split(data, **split_kwargs)
data_train, data_val = train_test_split(data_train, **split_kwargs)


In [29]:
# Column the models are trained to predict.
target_variable = 'satisfaction_level'

# Predictor columns, grouped by the preprocessing they receive.
categorical_features = ['dept', 'salary']
numeric_features = [
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
]

# Full predictor list: categorical columns first, then the numeric ones.
features = categorical_features + numeric_features

In [30]:
# Print a summary of the cleaned frame: index range, per-column non-null
# counts and dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14999 entries, 0 to 15786
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Emp ID                 14999 non-null  float64
 1   satisfaction_level     14999 non-null  float64
 2   last_evaluation        14999 non-null  float64
 3   number_project         14999 non-null  float64
 4   average_montly_hours   14999 non-null  float64
 5   time_spend_company     14999 non-null  float64
 6   Work_accident          14999 non-null  float64
 7   promotion_last_5years  14999 non-null  float64
 8   dept                   14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(8), object(2)
memory usage: 1.3+ MB


In [31]:
# Display the first five rows of the cleaned frame as a quick sanity check.
data.head()

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
0,1.0,0.38,0.53,2.0,157.0,3.0,0.0,0.0,sales,low
1,2.0,0.8,0.86,5.0,262.0,6.0,0.0,0.0,sales,medium
2,3.0,0.11,0.88,7.0,272.0,4.0,0.0,0.0,sales,medium
3,4.0,0.72,0.87,5.0,223.0,5.0,0.0,0.0,sales,low
4,5.0,0.37,0.52,2.0,159.0,3.0,0.0,0.0,sales,low


In [32]:
# Per-column-type preprocessing steps, later combined in a ColumnTransformer.

# Numeric columns: standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns: one-hot encode into a dense array.
# - handle_unknown='ignore' makes categories unseen during fit encode as
#   all-zero rows instead of raising at predict time.
# - `sparse_output` is the current name of the former `sparse` argument
#   (renamed in scikit-learn 1.2, old name removed in 1.4), so this
#   requires scikit-learn >= 1.2.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [33]:
# Preprocessing: route each column group to its transformer.
# Numeric columns go through `numeric_transformer`, categorical columns
# through `categorical_transformer`; columns not listed are not passed on.
preprocessing = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)


In [34]:
# Pull the predictor columns (X_*) and the target column (y_*) out of each
# of the three partitions: train, validation and test.
X_train, y_train = data_train[features], data_train[target_variable]
X_val, y_val = data_val[features], data_val[target_variable]
X_test, y_test = data_test[features], data_test[target_variable]


In [35]:
# Hyper-parameter candidates for the KNN pipeline. The `model__` prefix
# addresses the pipeline step named 'model'.
param_grid = dict(
    model__n_neighbors=[5, 10, 15],
    model__weights=["uniform", "distance"],
)

In [36]:
# K-nearest-neighbours regressor chained after the shared preprocessing.
# The final step is named 'model' so `param_grid` keys (model__*) resolve.
knn = KNeighborsRegressor()
KNeighborsRegressor_model = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('model', knn),
    ]
)

In [37]:
# Exhaustive search over `param_grid` using 3-fold cross-validation on the
# training split. The score is negative MSE (higher is better).
grid_search = GridSearchCV(
    estimator=KNeighborsRegressor_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Pipeline with the winning hyper-parameters.
best_knn = grid_search.best_estimator_

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] END .......model__n_neighbors=5, model__weights=uniform; total time=   0.1s




[CV] END .......model__n_neighbors=5, model__weights=uniform; total time=   0.1s
[CV] END .......model__n_neighbors=5, model__weights=uniform; total time=   0.1s




[CV] END ......model__n_neighbors=5, model__weights=distance; total time=   0.1s
[CV] END ......model__n_neighbors=5, model__weights=distance; total time=   0.1s




[CV] END ......model__n_neighbors=5, model__weights=distance; total time=   0.1s
[CV] END ......model__n_neighbors=10, model__weights=uniform; total time=   0.1s




[CV] END ......model__n_neighbors=10, model__weights=uniform; total time=   0.1s
[CV] END ......model__n_neighbors=10, model__weights=uniform; total time=   0.1s




[CV] END .....model__n_neighbors=10, model__weights=distance; total time=   0.1s
[CV] END .....model__n_neighbors=10, model__weights=distance; total time=   0.1s




[CV] END .....model__n_neighbors=10, model__weights=distance; total time=   0.1s
[CV] END ......model__n_neighbors=15, model__weights=uniform; total time=   0.1s




[CV] END ......model__n_neighbors=15, model__weights=uniform; total time=   0.1s
[CV] END ......model__n_neighbors=15, model__weights=uniform; total time=   0.1s




[CV] END .....model__n_neighbors=15, model__weights=distance; total time=   0.1s
[CV] END .....model__n_neighbors=15, model__weights=distance; total time=   0.1s




[CV] END .....model__n_neighbors=15, model__weights=distance; total time=   0.1s




In [38]:
# Hyper-parameter candidates for the random-forest pipeline
# (3 x 4 x 3 = 36 combinations). The `model__` prefix addresses the
# pipeline step named 'model'.
param_grid_rf = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[None, 5, 10, 20],
    model__min_samples_split=[2, 5, 10],
)

In [39]:
# Random-forest regressor chained after the shared preprocessing.
# The final step is named 'model' so `param_grid_rf` keys (model__*) resolve.
# random_state is fixed (same seed as the train/val/test splits) so the
# forest, and therefore the grid-search winner and the reported RMSE values,
# are reproducible on a fresh Restart & Run All.
RandomForest_model = Pipeline(steps=[
    ('preprocessor', preprocessing),
    ('model', RandomForestRegressor(random_state=1))
])

In [40]:
# Exhaustive search over `param_grid_rf` using 3-fold cross-validation on
# the training split. The score is negative MSE (higher is better).
grid_search_rf = GridSearchCV(
    estimator=RandomForest_model,
    param_grid=param_grid_rf,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

# Pipeline with the winning hyper-parameters.
best_rf = grid_search_rf.best_estimator_

Fitting 3 folds for each of 36 candidates, totalling 108 fits




[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   4.2s




[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   2.6s




[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   2.2s




[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=   4.4s




[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=   5.6s




[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=   4.3s




[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=300; total time=   6.9s




[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=300; total time=   7.2s




[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=300; total time=   7.5s




[CV] END model__max_depth=None, model__min_samples_split=5, model__n_estimators=100; total time=   3.0s




[CV] END model__max_depth=None, model__min_samples_split=5, model__n_estimators=100; total time=   1.9s




[CV] END model__max_depth=None, model__min_samples_split=5, model__n_estimators=100; total time=   1.9s




[CV] END model__max_depth=None, model__min_samples_split=5, model__n_estimators=200; total time=   4.3s




[CV] END model__max_depth=None, model__min_samples_split=5, model__n_estimators=200; total time=   5.3s




[CV] END model__max_depth=None, model__min_samples_split=5, model__n_estimators=200; total time=   3.8s




[CV] END model__max_depth=None, model__min_samples_split=5, model__n_estimators=300; total time=   5.9s




[CV] END model__max_depth=None, model__min_samples_split=5, model__n_estimators=300; total time=   7.4s




[CV] END model__max_depth=None, model__min_samples_split=5, model__n_estimators=300; total time=   5.8s




[CV] END model__max_depth=None, model__min_samples_split=10, model__n_estimators=100; total time=   1.9s




[CV] END model__max_depth=None, model__min_samples_split=10, model__n_estimators=100; total time=   2.8s




[CV] END model__max_depth=None, model__min_samples_split=10, model__n_estimators=100; total time=   1.9s




[CV] END model__max_depth=None, model__min_samples_split=10, model__n_estimators=200; total time=   3.6s




[CV] END model__max_depth=None, model__min_samples_split=10, model__n_estimators=200; total time=   3.5s




[CV] END model__max_depth=None, model__min_samples_split=10, model__n_estimators=200; total time=   4.4s




[CV] END model__max_depth=None, model__min_samples_split=10, model__n_estimators=300; total time=   5.6s




[CV] END model__max_depth=None, model__min_samples_split=10, model__n_estimators=300; total time=   5.6s




[CV] END model__max_depth=None, model__min_samples_split=10, model__n_estimators=300; total time=   6.4s




[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=100; total time=   0.6s




[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=100; total time=   0.6s




[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=100; total time=   0.6s




[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=200; total time=   1.2s




[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=200; total time=   1.2s




[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=200; total time=   1.3s




[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=300; total time=   1.8s




[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=300; total time=   2.3s




[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=300; total time=   2.3s




[CV] END model__max_depth=5, model__min_samples_split=5, model__n_estimators=100; total time=   0.6s




[CV] END model__max_depth=5, model__min_samples_split=5, model__n_estimators=100; total time=   0.6s




[CV] END model__max_depth=5, model__min_samples_split=5, model__n_estimators=100; total time=   0.6s




[CV] END model__max_depth=5, model__min_samples_split=5, model__n_estimators=200; total time=   1.1s




[CV] END model__max_depth=5, model__min_samples_split=5, model__n_estimators=200; total time=   1.1s




[CV] END model__max_depth=5, model__min_samples_split=5, model__n_estimators=200; total time=   1.1s




[CV] END model__max_depth=5, model__min_samples_split=5, model__n_estimators=300; total time=   1.6s




[CV] END model__max_depth=5, model__min_samples_split=5, model__n_estimators=300; total time=   1.7s




[CV] END model__max_depth=5, model__min_samples_split=5, model__n_estimators=300; total time=   2.0s




[CV] END model__max_depth=5, model__min_samples_split=10, model__n_estimators=100; total time=   0.9s




[CV] END model__max_depth=5, model__min_samples_split=10, model__n_estimators=100; total time=   0.9s




[CV] END model__max_depth=5, model__min_samples_split=10, model__n_estimators=100; total time=   0.8s




[CV] END model__max_depth=5, model__min_samples_split=10, model__n_estimators=200; total time=   1.3s




[CV] END model__max_depth=5, model__min_samples_split=10, model__n_estimators=200; total time=   1.2s




[CV] END model__max_depth=5, model__min_samples_split=10, model__n_estimators=200; total time=   1.1s




[CV] END model__max_depth=5, model__min_samples_split=10, model__n_estimators=300; total time=   1.7s




[CV] END model__max_depth=5, model__min_samples_split=10, model__n_estimators=300; total time=   1.7s




[CV] END model__max_depth=5, model__min_samples_split=10, model__n_estimators=300; total time=   1.6s




[CV] END model__max_depth=10, model__min_samples_split=2, model__n_estimators=100; total time=   1.0s




[CV] END model__max_depth=10, model__min_samples_split=2, model__n_estimators=100; total time=   1.0s




[CV] END model__max_depth=10, model__min_samples_split=2, model__n_estimators=100; total time=   1.5s




[CV] END model__max_depth=10, model__min_samples_split=2, model__n_estimators=200; total time=   2.7s




[CV] END model__max_depth=10, model__min_samples_split=2, model__n_estimators=200; total time=   2.1s




[CV] END model__max_depth=10, model__min_samples_split=2, model__n_estimators=200; total time=   2.3s




[CV] END model__max_depth=10, model__min_samples_split=2, model__n_estimators=300; total time=   3.1s




[CV] END model__max_depth=10, model__min_samples_split=2, model__n_estimators=300; total time=   3.5s




[CV] END model__max_depth=10, model__min_samples_split=2, model__n_estimators=300; total time=   4.0s




[CV] END model__max_depth=10, model__min_samples_split=5, model__n_estimators=100; total time=   1.0s




[CV] END model__max_depth=10, model__min_samples_split=5, model__n_estimators=100; total time=   1.0s




[CV] END model__max_depth=10, model__min_samples_split=5, model__n_estimators=100; total time=   1.0s




[CV] END model__max_depth=10, model__min_samples_split=5, model__n_estimators=200; total time=   2.1s




[CV] END model__max_depth=10, model__min_samples_split=5, model__n_estimators=200; total time=   2.0s




[CV] END model__max_depth=10, model__min_samples_split=5, model__n_estimators=200; total time=   2.5s




[CV] END model__max_depth=10, model__min_samples_split=5, model__n_estimators=300; total time=   3.9s




[CV] END model__max_depth=10, model__min_samples_split=5, model__n_estimators=300; total time=   3.0s




[CV] END model__max_depth=10, model__min_samples_split=5, model__n_estimators=300; total time=   3.0s




[CV] END model__max_depth=10, model__min_samples_split=10, model__n_estimators=100; total time=   1.0s




[CV] END model__max_depth=10, model__min_samples_split=10, model__n_estimators=100; total time=   1.0s




[CV] END model__max_depth=10, model__min_samples_split=10, model__n_estimators=100; total time=   1.0s




[CV] END model__max_depth=10, model__min_samples_split=10, model__n_estimators=200; total time=   3.1s




[CV] END model__max_depth=10, model__min_samples_split=10, model__n_estimators=200; total time=   2.3s




[CV] END model__max_depth=10, model__min_samples_split=10, model__n_estimators=200; total time=   2.0s




[CV] END model__max_depth=10, model__min_samples_split=10, model__n_estimators=300; total time=   2.9s




[CV] END model__max_depth=10, model__min_samples_split=10, model__n_estimators=300; total time=   3.0s




[CV] END model__max_depth=10, model__min_samples_split=10, model__n_estimators=300; total time=   3.8s




[CV] END model__max_depth=20, model__min_samples_split=2, model__n_estimators=100; total time=   2.2s




[CV] END model__max_depth=20, model__min_samples_split=2, model__n_estimators=100; total time=   1.9s




[CV] END model__max_depth=20, model__min_samples_split=2, model__n_estimators=100; total time=   2.0s




[CV] END model__max_depth=20, model__min_samples_split=2, model__n_estimators=200; total time=   3.7s




[CV] END model__max_depth=20, model__min_samples_split=2, model__n_estimators=200; total time=   5.5s




[CV] END model__max_depth=20, model__min_samples_split=2, model__n_estimators=200; total time=   3.9s




[CV] END model__max_depth=20, model__min_samples_split=2, model__n_estimators=300; total time=   5.7s




[CV] END model__max_depth=20, model__min_samples_split=2, model__n_estimators=300; total time=   7.1s




[CV] END model__max_depth=20, model__min_samples_split=2, model__n_estimators=300; total time=   5.9s




[CV] END model__max_depth=20, model__min_samples_split=5, model__n_estimators=100; total time=   1.8s




[CV] END model__max_depth=20, model__min_samples_split=5, model__n_estimators=100; total time=   2.4s




[CV] END model__max_depth=20, model__min_samples_split=5, model__n_estimators=100; total time=   2.3s




[CV] END model__max_depth=20, model__min_samples_split=5, model__n_estimators=200; total time=   3.4s




[CV] END model__max_depth=20, model__min_samples_split=5, model__n_estimators=200; total time=   3.3s




[CV] END model__max_depth=20, model__min_samples_split=5, model__n_estimators=200; total time=   3.9s




[CV] END model__max_depth=20, model__min_samples_split=5, model__n_estimators=300; total time=   5.9s




[CV] END model__max_depth=20, model__min_samples_split=5, model__n_estimators=300; total time=   5.3s




[CV] END model__max_depth=20, model__min_samples_split=5, model__n_estimators=300; total time=   6.5s




[CV] END model__max_depth=20, model__min_samples_split=10, model__n_estimators=100; total time=   1.6s




[CV] END model__max_depth=20, model__min_samples_split=10, model__n_estimators=100; total time=   1.6s




[CV] END model__max_depth=20, model__min_samples_split=10, model__n_estimators=100; total time=   1.6s




[CV] END model__max_depth=20, model__min_samples_split=10, model__n_estimators=200; total time=   3.1s




[CV] END model__max_depth=20, model__min_samples_split=10, model__n_estimators=200; total time=   4.3s




[CV] END model__max_depth=20, model__min_samples_split=10, model__n_estimators=200; total time=   3.3s




[CV] END model__max_depth=20, model__min_samples_split=10, model__n_estimators=300; total time=   4.7s




[CV] END model__max_depth=20, model__min_samples_split=10, model__n_estimators=300; total time=   5.9s




[CV] END model__max_depth=20, model__min_samples_split=10, model__n_estimators=300; total time=   4.8s




In [41]:
# Baseline: linear regression on the same preprocessed features
# (no hyper-parameter search for this model).
linear_reg = LinearRegression()
LinearRegression_model = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('model', linear_reg),
    ]
)

# Train the baseline on the training split.
LinearRegression_model.fit(X_train, y_train)




In [42]:
# Validation RMSE for the tuned KNN pipeline.
y_val_pred = best_knn.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)  # RMSE is the square root of the MSE
print(f'RMSE on Validation Data knn: {val_rmse}')


RMSE on Validation Data knn: 0.17473466344566713


In [43]:
# Validation RMSE for the tuned random-forest pipeline.
y_val_pred_rf = best_rf.predict(X_val)
val_mse_rf = mean_squared_error(y_val, y_val_pred_rf)
val_rmse_rf = np.sqrt(val_mse_rf)  # RMSE is the square root of the MSE
print(f'RMSE on Validation Data - RandomForest: {val_rmse_rf}')



RMSE on Validation Data - RandomForest: 0.174711799537017


In [44]:
# Validation RMSE for the linear-regression baseline.
y_val_pred_lr = LinearRegression_model.predict(X_val)
val_mse_lr = mean_squared_error(y_val, y_val_pred_lr)
val_rmse_lr = np.sqrt(val_mse_lr)  # RMSE is the square root of the MSE
print(f'RMSE on Validation Data - Linear Regression: {val_rmse_lr}')



RMSE on Validation Data - Linear Regression: 0.2418663353890868


In [45]:
# Test-set RMSE for the tuned random-forest pipeline.
y_test_pred_rf = best_rf.predict(X_test)
test_mse_rf = mean_squared_error(y_test, y_test_pred_rf)
test_rmse_rf = np.sqrt(test_mse_rf)  # RMSE is the square root of the MSE
print(f'RMSE on Test Data (Random Forest): {test_rmse_rf}')

RMSE on Test Data (Random Forest): 0.17605642232090338
