<a href="https://colab.research.google.com/github/Ryan98-sawe/ElectricCo_HR_Data.ipynb/blob/main/ElectricCo_HR_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [53]:
try:
    df = pd.read_csv('ElectricCo_HR_Data.csv')
except FileNotFoundError:
    print("Error: 'ElectricCo_HR_Data.csv' not found. Ensure the file is in the same directory.")
    exit()

In [54]:
# Data Cleaning
df = df.dropna()
df["MonthlySalary"] = df["MonthlySalary"].astype(float)
df['AnnualBonus'] = df['AnnualBonus'].astype(float)
df['Age'] = df['Age'].astype(int)
df['YearsAtCompany'] = df['YearsAtCompany'].astype(int)
df['PerformanceScore'] = df['PerformanceScore'].str.strip()


In [12]:
le_perf = LabelEncoder()
df['PerformanceScore'] = le_perf.fit_transform(df['PerformanceScore'])

In [59]:
# Binary Target 1 for below others and 0 for others
df['Underperformance'] = df['PerformanceScore'].apply(lambda x: 1 if x == 'Below Average' else 0)

In [33]:
# Checking class distribution
print("Class distribution in Underperformance:")
print(df['Underperformance'].value_counts())

Class distribution in Underperformance:
Underperformance
0    50
Name: count, dtype: int64


In [60]:
# Selecting Features
features = ["MonthlySalary", "AnnualBonus", "YearsAtCompany", "Age"]
x = df[features]
y = df['Underperformance']

In [35]:
# Ensure both classes represented through a train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [61]:
# Check on Class distribution in Train Set
print("Class distribution in y_train:")
print(pd.Series(y_train).value_counts())

Class distribution in y_train:
0    40
Name: count, dtype: int64


In [64]:
# Selecting Features
features = ['MonthlySalary', 'AnnualBonus', 'Age', 'YearsAtCompany']
X = df[features]
y = df['Underperformance']

In [63]:
# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [65]:
# Checking class distribution in training set
print("\nClass distribution in y_train:")
print(pd.Series(y_train).value_counts())


Class distribution in y_train:
Underperformance
0    28
1    12
Name: count, dtype: int64


In [67]:
# Handling class imbalance with SMOTE
minority_count = pd.Series(y_train).value_counts().get(1, 0)
if minority_count < 3:
    print("Warning: Too few 'Below Average' samples in y_train. Using class_weight='balanced' instead of SMOTE.")
    X_train_balanced, y_train_balanced = X_train, y_train
else:
    try:
        smote = SMOTE(random_state=42, k_neighbors=min(3, minority_count - 1))
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
        print("\nClass distribution after SMOTE:")
        print(pd.Series(y_train_balanced).value_counts())
    except ValueError as e:
        print(f"SMOTE failed: {e}. Using original training data with class_weight='balanced'.")
        X_train_balanced, y_train_balanced = X_train, y_train



Class distribution after SMOTE:
Underperformance
0    28
1    28
Name: count, dtype: int64


In [72]:
# Handling class imbalance with SMOTE
minority_count = pd.Series(y_train).value_counts().get(1, 0)
if minority_count < 3:
    print("Warning: Too few 'Below Average' samples in y_train. Using class_weight='balanced' instead of SMOTE.")
    X_train_balanced, y_train_balanced = X_train, y_train
else:
    try:
        smote = SMOTE(random_state=42, k_neighbors=min(3, minority_count - 1))
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
        print("\nClass distribution after SMOTE:")
        print(pd.Series(y_train_balanced).value_counts())
    except ValueError as e:
        print(f"SMOTE failed: {e}. Using original training data with class_weight='balanced'.")
        X_train_balanced, y_train_balanced = X_train, y_train

# Scaling
scaler = StandardScaler()
X_train_balanced_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)


Class distribution after SMOTE:
Underperformance
0    28
1    28
Name: count, dtype: int64


In [75]:
# Training logistic regression model
model = LogisticRegression(class_weight='balanced' if minority_count < 3 else None)
model.fit(X_train_balanced_scaled, y_train_balanced)

In [76]:
# Evaluating model
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'\nModel Accuracy: {accuracy:.2f}')
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Below Average', 'Below Average']))


Model Accuracy: 0.40

Classification Report:
                   precision    recall  f1-score   support

Not Below Average       0.60      0.43      0.50         7
    Below Average       0.20      0.33      0.25         3

         accuracy                           0.40        10
        macro avg       0.40      0.38      0.38        10
     weighted avg       0.48      0.40      0.42        10



In [77]:
# Saving cleaned data and model metrics
df.to_csv('cleaned_data.csv', index=False)
with open('model_metrics.txt', 'w') as f:
    f.write(f'Model Accuracy: {accuracy:.2f}\n')
    f.write('\nClassification Report:\n')
    f.write(classification_report(y_test, y_pred, target_names=['Not Below Average', 'Below Average']))
