In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [2]:
# Path to the employee-attrition CSV (a relative path, so it is resolved
# against the notebook's current working directory)
file_path = 'Employee Attrition.csv'

# Read the CSV file into a pandas DataFrame
data = pd.read_csv(file_path)


In [3]:
# Keep only the rows that carry a target value: a row whose
# satisfaction_level is missing cannot be used to train or score a model.
data = data[data['satisfaction_level'].notna()]

In [4]:
# Two-stage hold-out split, both stages seeded for repeatability:
#   1) 80% of all rows are kept for model building, 20% become the test set;
#   2) of that 80%, 80% stay as training data and 20% become the validation set.
# Overall this is roughly 64% train / 16% validation / 20% test.
data_train, data_test = train_test_split(
    data,
    train_size=0.8,
    random_state=1,
)
data_train, data_val = train_test_split(
    data_train,
    train_size=0.8,
    random_state=1,
)


In [5]:
# Name of the column the models are trained to predict
target_variable = 'satisfaction_level'

# Predictor columns: the two categorical fields first, then the numeric ones
features = [
    'dept',
    'salary',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
]


In [6]:
# Summarise column dtypes and non-null counts of the frame (after the rows
# without a satisfaction_level were dropped)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14999 entries, 0 to 15786
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Emp ID                 14999 non-null  float64
 1   satisfaction_level     14999 non-null  float64
 2   last_evaluation        14999 non-null  float64
 3   number_project         14999 non-null  float64
 4   average_montly_hours   14999 non-null  float64
 5   time_spend_company     14999 non-null  float64
 6   Work_accident          14999 non-null  float64
 7   promotion_last_5years  14999 non-null  float64
 8   dept                   14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(8), object(2)
memory usage: 1.3+ MB


In [7]:
# Preview the first rows to sanity-check column names and value formats
data.head()

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
0,1.0,0.38,0.53,2.0,157.0,3.0,0.0,0.0,sales,low
1,2.0,0.8,0.86,5.0,262.0,6.0,0.0,0.0,sales,medium
2,3.0,0.11,0.88,7.0,272.0,4.0,0.0,0.0,sales,medium
3,4.0,0.72,0.87,5.0,223.0,5.0,0.0,0.0,sales,low
4,5.0,0.37,0.52,2.0,159.0,3.0,0.0,0.0,sales,low


In [8]:
# Define standalone transformers: standard scaling for numeric columns and a
# dense one-hot encoding for categorical columns.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# OneHotEncoder's dense-output switch was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, and the old keyword was removed in 1.4.
# Try the current keyword first and fall back to the legacy one, so this cell
# runs on either side of the rename instead of raising TypeError.
try:
    _dense_onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    _dense_onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_transformer = Pipeline(steps=[
    ('onehot', _dense_onehot)
])


In [9]:
# Column groups used to route each feature to the right preprocessing step.
# Numeric columns are the ones to be scaled.
numeric_features = [
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
]

# Categorical (string-valued) columns are the ones to be one-hot encoded.
categorical_features = [
    'dept',
    'salary',
]

In [10]:
# Pipeline for numeric values: Apply a StandardScaler
num_pipeline = Pipeline([
    ("scale", StandardScaler())
])

# Pipeline for categorical values: Apply one-hot encoding.
# handle_unknown="ignore" makes a category that was not present when the
# encoder was fitted encode as an all-zero row, instead of raising an error
# when the validation/test split is transformed.
cat_pipeline = Pipeline([
    ("encode", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group to its pipeline. Columns that appear in neither list
# are dropped (ColumnTransformer's default `remainder`).
# sparse_threshold=0 forces the combined output to be a dense array no matter
# how sparse the one-hot block is, so the result can be fed straight to Keras
# rather than relying on the default density heuristic.
preprocessing = ColumnTransformer(
    [
        ("numeric", num_pipeline, numeric_features),
        ("categorical", cat_pipeline, categorical_features),
    ],
    sparse_threshold=0,
)

In [11]:
# Separate the predictors (X) from the target (y) for each of the three splits.

# Training split
X_train = data_train.drop(columns=target_variable)
y_train = data_train[target_variable]

# Validation split
X_val = data_val.drop(columns=target_variable)
y_val = data_val[target_variable]

# Test split
X_test = data_test.drop(columns=target_variable)
y_test = data_test[target_variable]


In [12]:
# Fit the preprocessor on the training features only, then replace X_train
# with its transformed version.
# NOTE: X_train and X_val are overwritten here, so the cell that builds them
# from data_train / data_val must be re-run before this cell is run again.
X_train = preprocessing.fit_transform(X_train)  # Fit on train, then transform it

# Apply the already-fitted preprocessor to the validation features (no refit)
X_val = preprocessing.transform(X_val)  # Transform only

In [13]:
# Preprocess the test data with the preprocessor that was fitted on the training set
X_test = preprocessing.transform(X_test)  # Transform only; the preprocessor is not refit

In [14]:
# Seed TensorFlow's global random generator before any layer is created so
# that weight initialisation (and other TensorFlow-level randomness) repeats
# across fresh-kernel runs, in line with the fixed random_state=1 used for the
# data splits. Some GPU kernels may still introduce small run-to-run
# differences.
tf.random.set_seed(1)

# Create a Sequential model: two ReLU hidden layers (64 and 32 units) followed
# by a single-unit output. The input width is taken from the preprocessed
# training matrix.
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression task with linear activation
])

In [15]:
# Configure training: Adam optimiser minimising mean squared error, with the
# root-mean-squared error reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['RootMeanSquaredError'],
)


In [16]:
# Train for 50 epochs, scoring the validation split after every epoch.
# verbose=2 prints one summary line per epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_val, y_val),
    verbose=2,
)

Epoch 1/50
300/300 - 9s - loss: 0.0662 - root_mean_squared_error: 0.2573 - val_loss: 0.0419 - val_root_mean_squared_error: 0.2047 - 9s/epoch - 29ms/step
Epoch 2/50
300/300 - 1s - loss: 0.0404 - root_mean_squared_error: 0.2010 - val_loss: 0.0405 - val_root_mean_squared_error: 0.2014 - 912ms/epoch - 3ms/step
Epoch 3/50
300/300 - 1s - loss: 0.0385 - root_mean_squared_error: 0.1962 - val_loss: 0.0387 - val_root_mean_squared_error: 0.1968 - 884ms/epoch - 3ms/step
Epoch 4/50
300/300 - 1s - loss: 0.0373 - root_mean_squared_error: 0.1930 - val_loss: 0.0374 - val_root_mean_squared_error: 0.1933 - 867ms/epoch - 3ms/step
Epoch 5/50
300/300 - 1s - loss: 0.0365 - root_mean_squared_error: 0.1910 - val_loss: 0.0376 - val_root_mean_squared_error: 0.1939 - 881ms/epoch - 3ms/step
Epoch 6/50
300/300 - 1s - loss: 0.0360 - root_mean_squared_error: 0.1896 - val_loss: 0.0374 - val_root_mean_squared_error: 0.1934 - 864ms/epoch - 3ms/step
Epoch 7/50
300/300 - 1s - loss: 0.0355 - root_mean_squared_error: 0.1883

In [17]:
# evaluate() returns the loss followed by the compiled metrics; with a single
# compiled metric (RMSE) the value of interest sits at index 1.
val_scores = model.evaluate(X_val, y_val)
val_rmse_nn = val_scores[1]
print(f'RMSE on Validation Data (Neural Network): {val_rmse_nn}')


RMSE on Validation Data (Neural Network): 0.19080086052417755
