### Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

### Load Sample Dataset

In [2]:
# Small in-memory sample of employee records; `promotion` is the target label.
columns = [
    'employee_id', 'department', 'education', 'experience_years',
    'training_score', 'performance_rating', 'salary',
    'promotion',  # Target
]
records = [
    ('E001', 'Sales',   'Bachelor',  3, 75, 3, 40000, 0),
    ('E002', 'HR',      'Master',    5, 82, 4, 50000, 1),
    ('E003', 'IT',      'Bachelor',  2, 68, 2, 35000, 0),
    ('E004', 'Finance', 'PhD',       8, 90, 5, 80000, 1),
    ('E005', 'Sales',   'Master',    6, 85, 4, 60000, 1),
    ('E006', 'IT',      'Bachelor',  4, 77, 3, 45000, 0),
    ('E007', 'HR',      'PhD',       9, 95, 5, 85000, 1),
    ('E008', 'Finance', 'Master',    7, 88, 4, 75000, 1),
    ('E009', 'Sales',   'Bachelor',  3, 72, 3, 38000, 0),
    ('E010', 'IT',      'PhD',      10, 99, 5, 90000, 1),
]
data = pd.DataFrame(records, columns=columns)

# Preview the first five rows
print(data.head())

  employee_id department education  experience_years  training_score  \
0        E001      Sales  Bachelor                 3              75   
1        E002         HR    Master                 5              82   
2        E003         IT  Bachelor                 2              68   
3        E004    Finance       PhD                 8              90   
4        E005      Sales    Master                 6              85   

   performance_rating  salary  promotion  
0                   3   40000          0  
1                   4   50000          1  
2                   2   35000          0  
3                   5   80000          1  
4                   4   60000          1  


### Understand Features & Target

In [3]:
# Report the dtype of every column, then name the column used as the label.
column_types = data.dtypes
print("Data Types:\n", column_types, "\n")
print("Target Variable: promotion\n")

Data Types:
 employee_id           object
department            object
education             object
experience_years       int64
training_score         int64
performance_rating     int64
salary                 int64
promotion              int64
dtype: object 

Target Variable: promotion



### Handle Missing Values

In [4]:
# Count null entries per column; all zeros means no imputation is needed.
missing_per_column = data.isna().sum()
print("Missing Values:\n", missing_per_column, "\n")

Missing Values:
 employee_id           0
department            0
education             0
experience_years      0
training_score        0
performance_rating    0
salary                0
promotion             0
dtype: int64 



### Encode Categorical Variables (Label Encoding)

In [5]:
# Fit one LabelEncoder per categorical column and keep every fitted encoder.
# Re-fitting a single shared encoder would overwrite the 'department' mapping
# when 'education' is fitted, leaving no way to decode department codes or to
# encode new rows consistently at prediction time.
label_encoders = {}
for col in ['department', 'education']:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Backward compatibility: `label_enc` remains the encoder fitted on 'education'.
label_enc = label_encoders['education']

### Feature Scaling

In [8]:
# Standardize the numeric columns in place, overwriting the raw values in `data`.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test-row statistics influence the scaling — consider fitting on the
# training rows only.
numeric_cols = ['experience_years', 'training_score', 'performance_rating', 'salary']
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])


###  Split Data into Train/Test

In [9]:
# The label is `promotion`; `employee_id` is an identifier, so it is excluded
# from the predictors along with the label itself.
y = data['promotion']
X = data.drop(columns=['employee_id', 'promotion'])

# Hold out 20% of the rows for evaluation; the seed fixes which rows are held out.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

### Train Classification Models

#### 1. Logistic Regression

In [10]:
# Logistic regression with default settings: fit on the training rows, then
# predict the held-out rows.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

#### 2. Decision Tree Classifier

In [11]:
# Decision tree with default hyperparameters.
# random_state is fixed so that tie-breaking between equally good splits is
# repeatable and a fresh "Restart & Run All" reproduces the same predictions.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)

#### 3. Random Forest Classifier

In [12]:
# Random forest of 100 trees.
# random_state is fixed (same seed as the tuned forest in the grid-search cell)
# so bootstrap sampling and feature sub-sampling are repeatable across runs.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

#### 4. Support Vector Machine (SVM)

In [13]:
# Support vector classifier with default settings: fit on the training rows,
# then predict the held-out rows.
svm_clf = SVC().fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

#### 5. K-Nearest Neighbors (KNN)

In [14]:
# k-nearest-neighbours classifier voting over the 5 closest training rows.
knn_clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn_clf.predict(X_test)

###  Model Evaluation

In [15]:
# Test-set predictions of each fitted classifier, keyed by display name.
models = {
    "Logistic Regression": y_pred_log,
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
    "SVM": y_pred_svm,
    "KNN": y_pred_knn,
}

# Print accuracy, precision, recall and F1 for every model against y_test.
# zero_division=0 reports 0.0 when a metric's denominator is empty (e.g. a
# model predicts no positive rows), which is the same value the default
# setting yields but without emitting UndefinedMetricWarning into the output.
for model_name, y_pred in models.items():
    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"Precision: {precision_score(y_test, y_pred, zero_division=0):.2f}")
    print(f"Recall: {recall_score(y_test, y_pred, zero_division=0):.2f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, zero_division=0):.2f}")
    print("-" * 50)


Logistic Regression Performance:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
--------------------------------------------------

Decision Tree Performance:
Accuracy: 0.50
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
--------------------------------------------------

Random Forest Performance:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
--------------------------------------------------

SVM Performance:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
--------------------------------------------------

KNN Performance:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Hyperparameter Tuning (Random Forest Example)

In [16]:
# Candidate forest sizes and tree depths to search over.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[2, 4, 6],
)

# Exhaustive search with 3-fold cross-validation on the training rows,
# ranking candidates by accuracy; the forest itself is seeded.
rf_base = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_base, param_grid, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_
print("\nBest Parameters from GridSearchCV:", grid_search.best_params_)


Best Parameters from GridSearchCV: {'max_depth': 2, 'n_estimators': 50}


### Feature Selection (SelectKBest)

In [17]:
# Score every predictor with the ANOVA F-test against the label and keep the
# three highest-scoring columns.
# NOTE(review): the selector is fitted on all rows (X, y), not only the
# training split, and X_new is not used by any model cell in this notebook.
selector = SelectKBest(f_classif, k=3)
selector.fit(X, y)
X_new = selector.transform(X)

kept_mask = selector.get_support()
selected_features = X.columns[kept_mask]
print("\nTop 3 Selected Features:", list(selected_features))


Top 3 Selected Features: ['education', 'training_score', 'performance_rating']


### Save Final Model (Best Random Forest)

In [18]:
# Persist the tuned random forest to disk (relative to the working directory).
model_path = "best_promotion_model.pkl"
joblib.dump(best_rf_model, model_path)
print("\n Final Model Saved as 'best_promotion_model.pkl'")


 Final Model Saved as 'best_promotion_model.pkl'
