In [1]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import numpy as np

# Reading data

In [2]:
combined_data_path = '../EDA_kaggle/combined_data.csv'
combined_data = pd.read_csv(combined_data_path)

print(combined_data.head())

   Age  BusinessTravel  DistanceFromHome  Education  EmployeeNumber  \
0   37               1                 1          4              77   
1   54               2                 1          4            1245   
2   34               2                 7          3             147   
3   39               1                 1          1            1026   
4   28               2                 1          3            1111   

   EnvironmentSatisfaction  Gender  JobInvolvement  JobLevel  JobSatisfaction  \
0                        1       0               2         2                3   
1                        4       1               3         3                3   
2                        1       0               1         2                3   
3                        4       1               2         4                4   
4                        1       0               2         1                2   

   ...  JobRole_Human Resources  JobRole_Laboratory Technician  \
0  ...              

# Fitting the model

In [3]:
# Set 'Label' as the target variable y, and the rest as features X
X = combined_data.drop(columns=['Label'])
y = combined_data['Label']

In [4]:
# Standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [8]:
# Initialize the Bernoulli Naive Bayes model
nb = BernoulliNB()

# Train the model
nb.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.85
Precision: 0.5281
Recall: 0.5595
F1 Score: 0.5434


## Feature Selection

In [6]:
# Use SelectKBest and chi2 to calculate the importance of features
select_k_best = SelectKBest(chi2, k='all')  # k='all' means selecting all features for scoring
X_new = select_k_best.fit_transform(X, y)

# Retrieve p-values and chi2 scores
p_values = select_k_best.pvalues_
chi2_scores = select_k_best.scores_

# Put the feature names, chi2 scores, and p-values into a DataFrame
feature_scores = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi2_scores, 'P-Value': p_values})

# Sort by p-value in ascending order to view significant features
significant_features = feature_scores.sort_values(by='P-Value')
print(significant_features)                      


                              Feature     Chi2 Score        P-Value
10                      MonthlyIncome  219048.632532   0.000000e+00
4                      EmployeeNumber     580.133277  3.508248e-128
18                  TotalWorkingYears     441.363157   5.467286e-98
21                     YearsAtCompany     267.007502   5.094768e-60
22                 YearsInCurrentRole     208.430493   3.021995e-47
24               YearsWithCurrManager     197.387327   7.762365e-45
0                                 Age     158.510684   2.393619e-36
2                    DistanceFromHome     124.849113   5.491557e-29
12                           OverTime     120.224231   5.649941e-28
39       JobRole_Sales Representative      58.823873   1.724364e-14
41               MaritalStatus_Single      56.314079   6.177152e-14
17                   StockOptionLevel      44.292631   2.827787e-11
8                            JobLevel      43.639927   3.947035e-11
23            YearsSinceLastPromotion      30.82

In [7]:
# Select significant features with p-value less than 0.05
selected_features = significant_features[significant_features['P-Value'] < 0.05]['Feature']

# Select the significant features from X
X_selected = X[selected_features]

from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
scaler = StandardScaler()
X_selected_scaled = scaler.fit_transform(X_selected)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected_scaled, y, test_size=0.2, random_state=42)

# Initialize the Bernoulli Naive Bayes model
nb = BernoulliNB()

# Train the model
nb.fit(X_train, y_train)

# Make predictions on the test set
y_pred_nb = nb.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred_nb)
precision = precision_score(y_test, y_pred_nb)
recall = recall_score(y_test, y_pred_nb)
f1 = f1_score(y_test, y_pred_nb)

# Print the results with formatted output
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


Accuracy: 0.8463
Precision: 0.5281
Recall: 0.5595
F1 Score: 0.5434


## Addressing data imbalance

In [9]:
# Perform oversampling using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_selected_scaled, y)

# Split the balanced dataset into training and testing sets
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Retrain the Bernoulli Naive Bayes model
nb = BernoulliNB()
nb.fit(X_train_res, y_train_res)

# Make predictions on the test set and evaluate
y_pred_res = nb.predict(X_test_res)
accuracy = accuracy_score(y_test_res, y_pred_res)
precision = precision_score(y_test_res, y_pred_res)
recall = recall_score(y_test_res, y_pred_res)
f1 = f1_score(y_test_res, y_pred_res)

# Output the results
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


Accuracy: 0.6984
Precision: 0.6682
Recall: 0.7111
F1 Score: 0.6890


In [44]:
# Get the predicted probabilities
y_pred_proba = nb.predict_proba(X_test)[:, 1]

# Set a lower threshold
threshold = 0.3
y_pred_adjusted = np.where(y_pred_proba > threshold, 1, 0)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_adjusted)
precision = precision_score(y_test, y_pred_adjusted)
recall = recall_score(y_test, y_pred_adjusted)
f1 = f1_score(y_test, y_pred_adjusted)

# Output the results
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


Accuracy: 0.6245
Precision: 0.2829
Recall: 0.8452
F1 Score: 0.4239
