In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, InputLayer
import xgboost as xgb


# Load dataset
df = pd.read_csv('HR_Analytics.csv')

# Drop irrelevant or constant columns
columns_to_drop = ['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber']
df.drop(columns=columns_to_drop, axis=1, inplace=True)

# Encode target variable
target_column = 'Attrition'
label_encoder = LabelEncoder()
df[target_column] = label_encoder.fit_transform(df[target_column])

# Handle missing values appropriately
numeric_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(exclude='number').columns

# Fill missing values
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop(target_column)

# Encode categorical columns using OneHotEncoder
for col in categorical_cols:
    if df[col].dtype in [np.int64, np.float64]:
        continue  # Skip if already numeric
    if df[col].nunique() == 2:
        df[col] = LabelEncoder().fit_transform(df[col])
    else:
        onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_data = onehot_encoder.fit_transform(df[[col]])
        encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_{cat}" for cat in onehot_encoder.categories_[0]])
        df = df.drop(col, axis=1).join(encoded_df)

# Split features and target
X = df.drop(target_column, axis=1)
y = df[target_column]

# Standardize numerical features
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Ensure all columns in X are numeric
X = X.select_dtypes(include=['number'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare LSTM input (reshape for sequential model)
X_train_lstm = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

# LSTM Model (using InputLayer)
lstm_model = Sequential()
lstm_model.add(InputLayer(shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
lstm_model.add(LSTM(50, activation='relu'))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train LSTM
lstm_model.fit(X_train_lstm, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=1)

# XGBoost Model with Hyperparameter Tuning using GridSearchCV
xgb_model = xgb.XGBClassifier(eval_metric='logloss')

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

best_xgb_model = grid_search.best_estimator_  # Get the best XGBoost model

# Evaluate Models
def evaluate_model(model, X_test, y_test, model_type="LSTM"):
    if model_type == "LSTM":
        X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))
        y_pred = model.predict(X_test_reshaped)
        y_pred = (y_pred > 0.5).astype(int)
    elif model_type == "Hybrid":
        y_pred = model(X_test)  # Call lambda function for Hybrid
    else:
        y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"{model_type} - Accuracy: {accuracy * 100:.2f}%")
    print(f"{model_type} - Precision: {precision * 100:.2f}%")
    print(f"{model_type} - Recall: {recall * 100:.2f}%")
    print(f"{model_type} - F1-score: {f1 * 100:.2f}%")

# Evaluate both models
evaluate_model(lstm_model, X_test, y_test, "LSTM")
evaluate_model(best_xgb_model, X_test, y_test, "XGBoost")  # Use the best XGBoost model

# Hybrid Prediction (Averaging probabilities)
lstm_prob = lstm_model.predict(X_test_lstm).flatten()
xgb_prob = best_xgb_model.predict_proba(X_test)[:, 1]  # Use best_xgb_model

hybrid_prob = (lstm_prob + xgb_prob) / 2
hybrid_pred = (hybrid_prob > 0.5).astype(int)

# Evaluate Hybrid Model (using lambda function)
print("\nHybrid Model Performance:")
evaluate_model(lambda x: hybrid_pred, X_test, y_test, "Hybrid")



2025-02-20 22:26:17.341096: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-20 22:26:17.342127: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 22:26:17.344553: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 22:26:17.351000: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740070577.362607   33200 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740070577.36

Epoch 1/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5560 - loss: 0.6876 - val_accuracy: 0.7966 - val_loss: 0.6027
Epoch 2/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8486 - loss: 0.5603 - val_accuracy: 0.7966 - val_loss: 0.5205
Epoch 3/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8344 - loss: 0.4763 - val_accuracy: 0.7966 - val_loss: 0.4733
Epoch 4/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8284 - loss: 0.4313 - val_accuracy: 0.7966 - val_loss: 0.4602
Epoch 5/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8620 - loss: 0.3615 - val_accuracy: 0.7966 - val_loss: 0.4544
Epoch 6/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8507 - loss: 0.3516 - val_accuracy: 0.7966 - val_loss: 0.4507
Epoch 7/30
[1m30/30[0m [32m━━━━━━━━━━

In [2]:
# Predict attrition for new data
def predict_attrition(new_employee_data):
    new_employee_df = pd.DataFrame([new_employee_data])

    # One-hot encode categorical features in new data
    for col in categorical_cols:
        if col in new_employee_df.columns:
            if new_employee_df[col].nunique() == 2:
                new_employee_df[col] = LabelEncoder().fit_transform(new_employee_df[col])
            else:
                # Get one-hot encoded columns from training data
                onehot_cols = [c for c in X_train.columns if c.startswith(col + '_')]

                # Create one-hot encoded columns for new data, filled with 0
                for onehot_col in onehot_cols:
                    new_employee_df[onehot_col] = 0

                # Set the appropriate one-hot column to 1 based on the value in new data
                value = new_employee_df[col][0]  # Get the value of the categorical feature


                matching_col = col + '_' + str(value)

                if matching_col in new_employee_df.columns:
                    new_employee_df[matching_col] = 1

    # Drop original categorical columns
    new_employee_df = new_employee_df.drop(categorical_cols, axis=1, errors='ignore')
    # Reindex to match the training data columns

    new_employee_df = new_employee_df.reindex(columns=X_train.columns, fill_value=0)
   # print(new_employee_df.columns)
    # LSTM Prediction
    lstm_input = new_employee_df.values.reshape((1, 1, X_train.shape[1]))
    lstm_prediction = lstm_model.predict(lstm_input).flatten()[0]

    # XGBoost Prediction (using best model)
    xgb_prediction = best_xgb_model.predict_proba(new_employee_df)[:, 1][0]  # Use best_xgb_model

    # Hybrid Prediction
    hybrid_prediction = (lstm_prediction + xgb_prediction) / 2

    if hybrid_prediction < 0.06:
        print("Prediction: Employee is likely to leave.")
    else:
        print("Prediction: Employee is likely to stay.")


In [6]:
import joblib

# Save the trained LSTM model
lstm_model.save('lstm_model.h5')

# Save the best XGBoost model
joblib.dump(best_xgb_model, 'best_xgb_model.pkl')

# Save the label encoder
joblib.dump(label_encoder, 'label_encoder.pkl')

# Save the standard scaler
joblib.dump(scaler, 'scaler.pkl')

joblib.dump(X_train, "X_train.pkl")  # Save it for future use




['X_train.pkl']

In [7]:
#No
# Example usage with new_employee_data:
new_employee_data = {
    'Age': 49,
    'Department': 'Research & Development',
    'JobRole': 'Research Scientist',
    'JobSatisfaction': 2,
    'MaritalStatus': 'Married',
    'MonthlyIncome': 5130,
    'OverTime': 'NO',
    'PercentSalaryHike': 23,
    'PerformanceRating': 4,
    'StockOptionLevel': 1,
    'TotalWorkingYears': 10,
    'WorkLifeBalance': 3,
    'YearsAtCompany': 10,
    'YearsInCurrentRole': 7,
    'YearsSinceLastPromotion': 1,
}

predict_attrition(new_employee_data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Prediction: Employee is likely to stay.


In [8]:
#Yes
# Example usage with new_employee_data:
new_employee_data = {
    'Age': 41,
    'Department': 'Sales',
    'JobRole': 'Sales Executive',
    'JobSatisfaction': 4,
    'MaritalStatus': 'Single',
    'MonthlyIncome': 5993,
    'OverTime': 'YES',
    'PercentSalaryHike': 11,
    'PerformanceRating': 3,
    'StockOptionLevel': 0,
    'TotalWorkingYears': 8,
    'WorkLifeBalance': 1,
    'YearsAtCompany': 6,
    'YearsInCurrentRole': 4,
    'YearsSinceLastPromotion': 0,
}

predict_attrition(new_employee_data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Employee is likely to leave.
