###  Install Required Libraries

In [1]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


###  Import Required Libraries

In [2]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler 
from sklearn.feature_selection import SelectKBest, f_classif 
from sklearn.decomposition import PCA 
from imblearn.over_sampling import SMOTE

###  Load Dataset

In [3]:
# Toy employee records, one list of ten values per column.
data = dict(
    Employee_ID=list(range(101, 111)),
    Age=[28, 35, 40, 30, 45, 32, 29, 38, 41, 36],
    Salary=[50000, 70000, 60000, 55000, 80000, 52000, 48000, 75000, 67000, 72000],
    Job_Role=[
        "Engineer", "Manager", "Engineer", "Analyst", "Manager",
        "Engineer", "Analyst", "Manager", "Analyst", "Engineer",
    ],
    Department=[
        "IT", "HR", "IT", "Finance", "HR",
        "IT", "Finance", "HR", "Finance", "IT",
    ],
    Work_Hours=[40, 45, 50, 42, 48, 40, 38, 46, 44, 41],
    Satisfaction_Score=[8, 6, 5, 7, 4, 9, 8, 5, 6, 7],
    Attrition=["Yes", "No", "Yes", "No", "Yes", "No", "No", "Yes", "No", "No"],
)

# Build the working frame; columns appear in the order they are listed above.
df = pd.DataFrame(data)

# Plain-text dump of every row.
print(df)

   Employee_ID  Age  Salary  Job_Role Department  Work_Hours  \
0          101   28   50000  Engineer         IT          40   
1          102   35   70000   Manager         HR          45   
2          103   40   60000  Engineer         IT          50   
3          104   30   55000   Analyst    Finance          42   
4          105   45   80000   Manager         HR          48   
5          106   32   52000  Engineer         IT          40   
6          107   29   48000   Analyst    Finance          38   
7          108   38   75000   Manager         HR          46   
8          109   41   67000   Analyst    Finance          44   
9          110   36   72000  Engineer         IT          41   

   Satisfaction_Score Attrition  
0                   8       Yes  
1                   6        No  
2                   5       Yes  
3                   7        No  
4                   4       Yes  
5                   9        No  
6                   8        No  
7                   5       Yes  
8                   6        No  
9                   7        No  

###  Encoding Categorical Data 

In [5]:
# Target: turn the Attrition strings into integer class codes.
# LabelEncoder numbers the sorted class labels, so "No" becomes 0 and "Yes" 1.
label_encoder = LabelEncoder()
attrition_codes = label_encoder.fit_transform(df["Attrition"])
df["Attrition"] = attrition_codes

# Features: expand the nominal columns into indicator columns. drop_first=True
# omits the first level of each column, which acts as the implicit baseline.
nominal_columns = ["Job_Role", "Department"]
df = pd.get_dummies(df, columns=nominal_columns, drop_first=True)

### Feature Selection

In [6]:
# Separate predictors from the target. Employee_ID is an arbitrary row
# identifier, not a property of the employee, so it is excluded as well;
# otherwise SelectKBest could score and keep it as if it were a real feature.
X = df.drop(columns=["Attrition", "Employee_ID"])  # Features
y = df["Attrition"]  # Target

# Keep the 5 features with the highest f_classif score against the target.
selector = SelectKBest(score_func=f_classif, k=5)
X_new = selector.fit_transform(X, y)

# Names of the retained columns, for inspection (X_new itself is a bare array).
selected_features = X.columns[selector.get_support()].tolist()

print(X_new.shape)

(10, 5)


###  Feature Extraction (PCA)

In [7]:
# PCA picks the directions of largest variance, so features on large scales
# (e.g. Salary in the tens of thousands) would dominate the components.
# Standardise each selected feature to zero mean / unit variance first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

pca = PCA(n_components=2)  # Reduce to 2 components
X_pca = pca.fit_transform(X_scaled)
print(X_pca.shape)

(10, 2)


### Handling Imbalanced Data

In [9]:
# Class balance of the target before oversampling.
print("Before SMOTE:", y.value_counts())

# SMOTE generates synthetic minority samples using random draws; fixing
# random_state makes the resampled data identical on every run.
# k_neighbors=3 is kept small because the minority class has only 4 samples
# here and SMOTE needs more minority samples than neighbours.
smote = SMOTE(sampling_strategy='auto', k_neighbors=3, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_pca, y)

# Class balance after oversampling (both classes should now be equal in size).
print("After SMOTE:", y_resampled.value_counts())

Before SMOTE: Attrition
0    6
1    4
Name: count, dtype: int64
After SMOTE: Attrition
1    6
0    6
Name: count, dtype: int64
