In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error



In [66]:
# Employee-attrition dataset; the path is relative, so it is resolved against
# the directory the notebook kernel runs in.
file_path = 'Employee Attrition.csv'

# Load the full CSV into a pandas DataFrame named `data`.
data = pd.read_csv(filepath_or_buffer=file_path)


In [67]:

# Remove every row that has a missing value in at least one column.
# The frame is modified in place, so `data` keeps referring to the same object.
data.dropna(axis='index', how='any', inplace=True)


In [68]:
# Turn the continuous 'satisfaction_level' score into a binary class label:
#   1 -> satisfied     (score strictly greater than the threshold)
#   0 -> dissatisfied  (score less than or equal to the threshold)
# The comparison is vectorised over the whole column instead of calling a
# Python lambda once per row. The column is overwritten, so the original
# continuous scores are no longer available in `data` after this cell.
SATISFACTION_THRESHOLD = 0.5
data['satisfaction_level'] = (
    data['satisfaction_level'] > SATISFACTION_THRESHOLD
).astype('int64')


In [69]:
# Show the first five rows to confirm the target column now holds 0/1 labels.
data.head(n=5)

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
0,1.0,0,0.53,2.0,157.0,3.0,0.0,0.0,sales,low
1,2.0,1,0.86,5.0,262.0,6.0,0.0,0.0,sales,medium
2,3.0,0,0.88,7.0,272.0,4.0,0.0,0.0,sales,medium
3,4.0,1,0.87,5.0,223.0,5.0,0.0,0.0,sales,low
4,5.0,0,0.52,2.0,159.0,3.0,0.0,0.0,sales,low


In [70]:

# Label column to predict and the predictor columns.
target_variable = 'satisfaction_level'
features = [
    'dept',
    'salary',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
]

# Two consecutive 80/20 splits with the same seed:
#   first  -> 80% train+validation, 20% test
#   second -> 80% train, 20% validation (of the train+validation part)
# which leaves roughly 64% / 16% / 20% for train / validation / test.
split_options = {'train_size': 0.8, 'random_state': 1}
data_train_val, data_test = train_test_split(data, **split_options)
data_train, data_val = train_test_split(data_train_val, **split_options)


In [71]:
# Columns that are one-hot encoded.
categorical_features = ['dept', 'salary']

# Columns that are standardised with StandardScaler.
numeric_features = [
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
]

# One transformer instance per column group; both use library defaults.
categorical_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()


In [72]:
# Route each column group to its transformer. Output columns follow the order
# of this list: scaled numeric columns first, one-hot columns after them.
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]

# remainder='drop' is the library default, spelled out here: any column not
# listed above (for example 'Emp ID' or the target) is discarded.
preprocessor = ColumnTransformer(transformers=column_steps, remainder='drop')


In [73]:
# Inspection only: fit the preprocessor on the complete dataset to look at the
# matrix it produces. These statistics come from all rows, validation and test
# included. The same `preprocessor` object is also a step of the SVM pipeline,
# whose own fit call is expected to refit it on the training split.
transformed_data = preprocessor.fit_transform(X=data)

# Wrap the result in a DataFrame (columns are positional integers, not names).
transformed_df = pd.DataFrame(data=transformed_data)

# Print the first five transformed rows.
print(transformed_df.head(n=5))

         0         1         2         3         4         5    6    7    8   \
0 -1.087275 -1.462863 -0.882040 -0.341235 -0.411165 -0.147412  0.0  0.0  0.0   
1  0.840707  0.971113  1.220423  1.713436 -0.411165 -0.147412  0.0  0.0  0.0   
2  0.957554  2.593763  1.420657  0.343655 -0.411165 -0.147412  0.0  0.0  0.0   
3  0.899131  0.971113  0.439508  1.028546 -0.411165 -0.147412  0.0  0.0  0.0   
4 -1.145699 -1.462863 -0.841993 -0.341235 -0.411165 -0.147412  0.0  0.0  0.0   

    9    10   11   12   13   14   15   16   17   18  
0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  
1  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  
2  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  
3  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  
4  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  


In [74]:
# Preprocessing and classifier chained into one estimator, so the same column
# transformations are applied at fit time and at predict time.
svm_steps = [
    ('preprocessor', preprocessor),
    ('svm', SVC(kernel='linear')),  # Change kernel type here if needed
]
svm_model = Pipeline(steps=svm_steps)


In [75]:
# Data preparation
def split_features_target(frame):
    """Split one data partition into its predictor matrix and label vector.

    Parameters
    ----------
    frame : pandas.DataFrame
        A partition of the dataset containing every column named in
        `features` plus the `target_variable` column.

    Returns
    -------
    tuple
        `(X, y)`, where `X` holds only the `features` columns and `y` is the
        `target_variable` column.
    """
    # Select the predictors explicitly rather than dropping only the target,
    # so identifier columns such as 'Emp ID' never reach the model inputs.
    return frame[features], frame[target_variable]


X_train, y_train = split_features_target(data_train)
X_val, y_val = split_features_target(data_val)
X_test, y_test = split_features_target(data_test)


In [76]:
# Train the whole pipeline (preprocessing followed by the SVM) on the
# training partition only.
svm_model.fit(X=X_train, y=y_train)

In [77]:
# Score the fitted pipeline on the validation partition.
predictions_val = svm_model.predict(X_val)

# Overall share of correctly classified validation rows.
accuracy_val = accuracy_score(y_true=y_val, y_pred=predictions_val)

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_val, y_pred=predictions_val)

# Per-class precision / recall / F1. In the recorded run the confusion matrix
# shows no row predicted as class 0, so that class's precision is undefined;
# this is what triggers the `_warn_prf` warnings below.
class_report = classification_report(y_true=y_val, y_pred=predictions_val)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [78]:
# Report validation accuracy (two decimals), then the two multi-line summaries.
print(f'Validation Accuracy: {accuracy_val:.2f}')
for heading, body in (
    ('Classification Report:\n', class_report),
    ('Confusion Matrix:\n', conf_matrix),
):
    print(heading, body)


Validation Accuracy: 0.67
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       802
           1       0.67      1.00      0.80      1598

    accuracy                           0.67      2400
   macro avg       0.33      0.50      0.40      2400
weighted avg       0.44      0.67      0.53      2400

Confusion Matrix:
 [[   0  802]
 [   0 1598]]


In [79]:
# Score the fitted pipeline on the held-out test partition and compute its
# accuracy.
predictions_test = svm_model.predict(X_test)
accuracy_test = accuracy_score(y_true=y_test, y_pred=predictions_test)


In [80]:

# Report accuracy on the held-out test partition, rounded to two decimals.
test_accuracy_line = f'Test Accuracy: {accuracy_test:.2f}'
print(test_accuracy_line)

Test Accuracy: 0.68


In [81]:
def binary_label_rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted labels.

    With 0/1 class labels every squared error is either 0 or 1, so this value
    equals the square root of the misclassification rate. It therefore adds
    nothing beyond accuracy and is reported only for completeness.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with `y_true`.

    Returns
    -------
    float
        Square root of the mean squared difference between the two inputs.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Reuse the predictions produced by the validation and test evaluation cells
# rather than running the SVM's predict on both partitions a second time.
rmse_val = binary_label_rmse(y_val, predictions_val)
print(f'RMSE on Validation Data (SVM): {rmse_val}')

rmse_test = binary_label_rmse(y_test, predictions_test)
print(f'RMSE on Test Data (SVM): {rmse_test}')

RMSE on Validation Data (SVM): 0.5780715065341542
RMSE on Test Data (SVM): 0.565685424949238
