In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Reading data

In [24]:
combined_data_path = '../EDA_kaggle/combined_data.csv'
combined_data = pd.read_csv(combined_data_path)

print(combined_data.head())

   Age  BusinessTravel  DistanceFromHome  Education  EmployeeNumber  \
0   37               1                 1          4              77   
1   54               2                 1          4            1245   
2   34               2                 7          3             147   
3   39               1                 1          1            1026   
4   28               2                 1          3            1111   

   EnvironmentSatisfaction  Gender  JobInvolvement  JobLevel  JobSatisfaction  \
0                        1       0               2         2                3   
1                        4       1               3         3                3   
2                        1       0               1         2                3   
3                        4       1               2         4                4   
4                        1       0               2         1                2   

   ...  JobRole_Human Resources  JobRole_Laboratory Technician  \
0  ...              

In [25]:
combined_data.columns

Index(['Age', 'BusinessTravel', 'DistanceFromHome', 'Education',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
       'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Label', 'Department_Research & Development',
       'Department_Sales', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',
       'EducationField_Other', 'EducationField_Technical Degree',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_

# Fitting the model

In [26]:
# Set 'Label' as the target variable y, and the rest as features X
X = combined_data.drop(columns=['Label'])
y = combined_data['Label']

In [27]:
# Standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [28]:
# Initialize and train the logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Output the results with formatted output
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


Accuracy: 0.8930
Precision: 0.7959
Recall: 0.4643
F1 Score: 0.5865


###  Select Features(p_value < 0.05)

In [29]:
# Select features using SelectKBest and f_classif
select_k_best = SelectKBest(score_func=f_classif, k='all')  # Select all features
X_new = select_k_best.fit_transform(X, y)

# Get the p-values
p_values = select_k_best.pvalues_

# Put feature names and p-values into a DataFrame
feature_scores = pd.DataFrame({'Feature': X.columns, 'P-Value': p_values})

# Select features with p-value less than 0.05
significant_features = feature_scores[feature_scores['P-Value'] < 0.05]['Feature']

# Print significant features
print("Significant features with p-value < 0.05:")
print(significant_features)

# Select these significant features from X
X_selected = X[significant_features]

Significant features with p-value < 0.05:
0                                   Age
1                        BusinessTravel
2                      DistanceFromHome
5               EnvironmentSatisfaction
7                        JobInvolvement
8                              JobLevel
9                       JobSatisfaction
10                        MonthlyIncome
12                             OverTime
15             RelationshipSatisfaction
17                     StockOptionLevel
18                    TotalWorkingYears
19                TrainingTimesLastYear
20                      WorkLifeBalance
21                       YearsAtCompany
22                   YearsInCurrentRole
23              YearsSinceLastPromotion
24                 YearsWithCurrManager
25    Department_Research & Development
26                     Department_Sales
28             EducationField_Marketing
29               EducationField_Medical
31      EducationField_Technical Degree
32              JobRole_Human Resource

  f = msb / msw


In [30]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Standardize the numerical features (if needed)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the logistic regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = logreg.predict(X_test_scaled)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Output the results with formatted output
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.8911
Precision: 0.7692
Recall: 0.4762
F1 Score: 0.5882


## Addressing data imbalance

In [32]:
# Perform oversampling using SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_selected, y)

# Split the oversampled dataset into training and testing sets
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Train the logistic regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_res, y_train_res)

# Predict and evaluate
y_pred_res = logreg.predict(X_test_res)

# Calculate performance metrics
accuracy = accuracy_score(y_test_res, y_pred_res)
precision = precision_score(y_test_res, y_pred_res)
recall = recall_score(y_test_res, y_pred_res)
f1 = f1_score(y_test_res, y_pred_res)

# Output the results with formatted output
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.7773
Precision: 0.7810
Recall: 0.7309
F1 Score: 0.7551


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
# Train the model using class_weight='balanced'
logreg = LogisticRegression(max_iter=1000, class_weight='balanced')
logreg.fit(X_train, y_train)

# Predict and evaluate
y_pred_bal = logreg.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred_bal)
precision = precision_score(y_test, y_pred_bal)
recall = recall_score(y_test, y_pred_bal)
f1 = f1_score(y_test, y_pred_bal)

# Output the results with formatted output
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.7276
Precision: 0.3409
Recall: 0.7143
F1 Score: 0.4615


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
