In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the data
data = pd.read_csv("/Users/parthbehl/machine learning/deeplearning /Employee_Attrition/dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")


In [3]:
print(data.head())
print(data.info())
print(data.describe())

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...

In [4]:
print(data.isnull().sum())


Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [5]:
#extra step is there were missing values found in our data
#we'll fill the missing : (1)numerical values with the mean of that column
#                         (2)categorical values with the mode              
 
# (1)
data['Age'].fillna(data['Age'].mean(), inplace=True)
data['DailyRate'].fillna(data['DailyRate'].mean(), inplace=True)
data['DistanceFromHome'].fillna(data['DistanceFromHome'].mean(), inplace=True)
data['HourlyRate'].fillna(data['HourlyRate'].mean(), inplace=True)
data['MonthlyIncome'].fillna(data['MonthlyIncome'].mean(), inplace=True)
data['MonthlyRate'].fillna(data['MonthlyRate'].mean(), inplace=True)
data['NumCompaniesWorked'].fillna(data['NumCompaniesWorked'].mean(), inplace=True)
data['PercentSalaryHike'].fillna(data['PercentSalaryHike'].mean(), inplace=True)
data['TotalWorkingYears'].fillna(data['TotalWorkingYears'].mean(), inplace=True)
data['TrainingTimesLastYear'].fillna(data['TrainingTimesLastYear'].mean(), inplace=True)
data['YearsAtCompany'].fillna(data['YearsAtCompany'].mean(), inplace=True)
data['YearsInCurrentRole'].fillna(data['YearsInCurrentRole'].mean(), inplace=True)
data['YearsSinceLastPromotion'].fillna(data['YearsSinceLastPromotion'].mean(), inplace=True)
data['YearsWithCurrManager'].fillna(data['YearsWithCurrManager'].mean(), inplace=True)

# (2)
data['BusinessTravel'].fillna(data['BusinessTravel'].mode()[0], inplace=True)
data['Department'].fillna(data['Department'].mode()[0], inplace=True)
data['EducationField'].fillna(data['EducationField'].mode()[0], inplace=True)
data['Gender'].fillna(data['Gender'].mode()[0], inplace=True)
data['JobRole'].fillna(data['JobRole'].mode()[0], inplace=True)
data['MaritalStatus'].fillna(data['MaritalStatus'].mode()[0], inplace=True)
data['OverTime'].fillna(data['OverTime'].mode()[0], inplace=True)

print(data.isnull().sum())


Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [6]:
# Remove unnecessary columns (e.g., EmployeeCount, EmployeeNumber, Over18, StandardHours)
unnecessary_cols = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
data = data.drop(columns=unnecessary_cols, axis=1)

In [7]:
# Encode categorical variables
label_encoder = LabelEncoder()
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

In [8]:
# Split data into features (X) and target variable (y)
X = data.drop('Attrition', axis=1)
y = data['Attrition']

In [9]:
# Scale numerical features
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=['int64']).columns
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

In [10]:
# Display preprocessed data
print(X.head())
print(y.head())

        Age  BusinessTravel  DailyRate  Department  DistanceFromHome  \
0  0.446350        0.590048   0.742527    1.401512         -1.010909   
1  1.322365       -0.913194  -1.297775   -0.493817         -0.147150   
2  0.008343        0.590048   1.414363   -0.493817         -0.887515   
3 -0.429664       -0.913194   1.461466   -0.493817         -0.764121   
4 -1.086676        0.590048  -0.524295   -0.493817         -0.887515   

   Education  EducationField  EnvironmentSatisfaction    Gender  HourlyRate  \
0  -0.891688       -0.937414                -0.660531 -1.224745    1.383138   
1  -1.868426       -0.937414                 0.254625  0.816497   -0.240677   
2  -0.891688        1.316673                 1.169781  0.816497    1.284725   
3   1.061787       -0.937414                 1.169781 -1.224745   -0.486709   
4  -1.868426        0.565311                -1.575686  0.816497   -1.274014   

   ...  PerformanceRating  RelationshipSatisfaction  StockOptionLevel  \
0  ...          -0.

In [11]:
#MODEL DEVELOPMENT 

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
from imblearn.over_sampling import SMOTE

# Instantiate SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to your training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Define models with parameter grids
models = [
    {
        'name': 'Random Forest',
        'estimator': RandomForestClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    {
        'name': 'Gradient Boosting',
        'estimator': GradientBoostingClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.5],
            'max_depth': [3, 5, 10],
            'min_samples_split': [2, 5, 10]
        }
    },
    {
        'name': 'Logistic Regression',
        'estimator': LogisticRegression(solver='liblinear'),
        'params': {
            'penalty': ['l1', 'l2'],
            'C': [0.001, 0.01, 0.1, 1, 10, 100]
        }
    },
    {
        'name': 'Support Vector Machine',
        'estimator': SVC(),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf']
        }
    },
    {
        'name': 'K-Nearest Neighbors',
        'estimator': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 10],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
    },
    {
        'name': 'Naive Bayes',
        'estimator': GaussianNB(),
        'params': {}
    }
]

# Rerun grid search for each model
best_model = None
best_accuracy = 0

for model_info in models:
    print(f"Searching best parameters for {model_info['name']}...")
    grid_search = GridSearchCV(model_info['estimator'], model_info['params'], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_estimator = grid_search.best_estimator_
    best_params = grid_search.best_params_
    mean_cv_score = grid_search.best_score_

    print(f"Best parameters: {best_params}")
    print(f"Best CV score: {mean_cv_score}")

    y_pred = best_estimator.predict(X_test)
    accuracy = best_estimator.score(X_test, y_test)
    print(classification_report(y_test, y_pred))

    if accuracy > best_accuracy:
        best_model = {
            'name': model_info['name'],
            'estimator': best_estimator,
            'params': best_params,
            'accuracy': accuracy
        }
        best_accuracy = accuracy

print(f"Best Model: {best_model['name']}")
print(f"Best Parameters: {best_model['params']}")
print(f"Best Accuracy: {best_model['accuracy']}")

Searching best parameters for Random Forest...
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.8605517490082942
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       255
           1       0.67      0.10      0.18        39

    accuracy                           0.87       294
   macro avg       0.77      0.55      0.55       294
weighted avg       0.85      0.87      0.83       294

Searching best parameters for Gradient Boosting...
Best parameters: {'learning_rate': 0.5, 'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 100}
Best CV score: 0.8622358456545258
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       255
           1       0.61      0.28      0.39        39

    accuracy                           0.88       294
   macro avg       0.75      0.63      0.66       294
weighted avg       0.86     

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the SVM model with the best parameters
svm_model = SVC(C=0.1, kernel='linear')

# Train the model on the training data
svm_model.fit(X_train, y_train)
# Make predictions on the test data
y_pred = svm_model.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(classification_report(y_test, y_pred))


Accuracy: 0.891156462585034
Precision: 0.8909177706170187
Recall: 0.891156462585034
F1 Score: 0.8601662887377173
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       255
           1       0.89      0.21      0.33        39

    accuracy                           0.89       294
   macro avg       0.89      0.60      0.64       294
weighted avg       0.89      0.89      0.86       294



In [15]:
# Assuming 'target_variable_name' is the correct name of your target variable
target_variable_name = 'Attrition'  # This needs to be corrected based on your actual target variable name

# Calculate the correlation matrix
corr_matrix = X_train.corr().abs()

# Select features with correlation greater than 0.1 with the target variable
highly_correlated_features = corr_matrix[corr_matrix[target_variable_name] > 0.1].index

# Drop highly correlated features
X_train = X_train.drop(highly_correlated_features, axis=1)
X_test = X_test.drop(highly_correlated_features, axis=1)


KeyError: 'Attrition'