In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
#import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier


In [9]:
import matplotlib
matplotlib.use('TkAgg')  
import matplotlib.pyplot as plt


In [10]:
#load dataset
df = pd.read_csv('promotion_dataset.csv')
df.head()

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not
0,YAK/S/00001,Commercial Sales and Marketing,MSc MBA and PhD,Female,Direct Internal process,2,1986,12.5,2011,1,0,41,ANAMBRA,No,Married,No,No,0,0
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,ANAMBRA,Yes,Married,No,No,0,0
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,KATSINA,Yes,Married,No,No,0,0
3,YAK/S/00004,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,1982,2.5,2009,0,0,42,NIGER,Yes,Single,No,No,1,0
4,YAK/S/00006,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,1990,7.5,2012,0,0,77,AKWA IBOM,Yes,Married,No,No,1,0


In [11]:
# getting information about dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38312 entries, 0 to 38311
Data columns (total 19 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   EmployeeNo                           38312 non-null  object 
 1   Division                             38312 non-null  object 
 2   Qualification                        36633 non-null  object 
 3   Gender                               38312 non-null  object 
 4   Channel_of_Recruitment               38312 non-null  object 
 5   Trainings_Attended                   38312 non-null  int64  
 6   Year_of_birth                        38312 non-null  int64  
 7   Last_performance_score               38312 non-null  float64
 8   Year_of_recruitment                  38312 non-null  int64  
 9   Targets_met                          38312 non-null  int64  
 10  Previous_Award                       38312 non-null  int64  
 11  Training_score_average      

In [12]:
# more info on rows and collumns
df.shape

(38312, 19)

In [13]:
#information about missing data
round((df.isnull().sum() / df.shape[0]) * 100, 2)

EmployeeNo                             0.00
Division                               0.00
Qualification                          4.38
Gender                                 0.00
Channel_of_Recruitment                 0.00
Trainings_Attended                     0.00
Year_of_birth                          0.00
Last_performance_score                 0.00
Year_of_recruitment                    0.00
Targets_met                            0.00
Previous_Award                         0.00
Training_score_average                 0.00
State_Of_Origin                        0.00
Foreign_schooled                       0.00
Marital_Status                         0.00
Past_Disciplinary_Action               0.00
Previous_IntraDepartmental_Movement    0.00
No_of_previous_employers               0.00
Promoted_or_Not                        0.00
dtype: float64

In [14]:
# handling missing data by filling with mode
df['Qualification'] = df['Qualification'].fillna(df['Qualification'].mode()[0])

In [15]:
#crosscheck missing data filled
round((df.isnull().sum() / df.shape[0]) * 100, 2)

EmployeeNo                             0.0
Division                               0.0
Qualification                          0.0
Gender                                 0.0
Channel_of_Recruitment                 0.0
Trainings_Attended                     0.0
Year_of_birth                          0.0
Last_performance_score                 0.0
Year_of_recruitment                    0.0
Targets_met                            0.0
Previous_Award                         0.0
Training_score_average                 0.0
State_Of_Origin                        0.0
Foreign_schooled                       0.0
Marital_Status                         0.0
Past_Disciplinary_Action               0.0
Previous_IntraDepartmental_Movement    0.0
No_of_previous_employers               0.0
Promoted_or_Not                        0.0
dtype: float64

In [16]:
# identify collumn data types
cat_col = [col for col in df.columns if df[col].dtype == 'object']
num_col = [col for col in df.columns if df[col].dtype != 'object']

print('Categorical columns:', cat_col)
print('Numerical columns:', num_col)

Categorical columns: ['EmployeeNo', 'Division', 'Qualification', 'Gender', 'Channel_of_Recruitment', 'State_Of_Origin', 'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement']
Numerical columns: ['Trainings_Attended', 'Year_of_birth', 'Last_performance_score', 'Year_of_recruitment', 'Targets_met', 'Previous_Award', 'Training_score_average', 'No_of_previous_employers', 'Promoted_or_Not']


In [17]:
#count unique data for categorical data types
df[cat_col].nunique()

EmployeeNo                             38312
Division                                   9
Qualification                              3
Gender                                     2
Channel_of_Recruitment                     3
State_Of_Origin                           37
Foreign_schooled                           2
Marital_Status                             3
Past_Disciplinary_Action                   2
Previous_IntraDepartmental_Movement        2
dtype: int64

In [18]:
#statistical summary of dataset
df.describe()


Unnamed: 0,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,No_of_previous_employers,Promoted_or_Not
count,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0
mean,2.25368,1986.209334,7.698959,2013.139695,0.352996,0.023152,55.366465,1.040953,0.084595
std,0.609443,7.646047,3.744135,4.261451,0.477908,0.150388,13.362741,1.235738,0.278282
min,2.0,1950.0,0.0,1982.0,0.0,0.0,31.0,0.0,0.0
25%,2.0,1982.0,5.0,2012.0,0.0,0.0,43.0,0.0,0.0
50%,2.0,1988.0,7.5,2014.0,0.0,0.0,52.0,1.0,0.0
75%,2.0,1992.0,10.0,2016.0,1.0,0.0,68.0,1.0,0.0
max,11.0,2001.0,12.5,2018.0,1.0,1.0,91.0,6.0,1.0


In [19]:
# generate profiling report 
from ydata_profiling import ProfileReport
df.profile_report()


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 27.90it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [20]:
#correlation analysis
df.corr(numeric_only=True)

Unnamed: 0,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,No_of_previous_employers,Promoted_or_Not
Trainings_Attended,1.0,0.07871,-0.062042,0.056215,-0.044789,-0.007409,0.041065,0.000796,-0.024345
Year_of_birth,0.07871,1.0,-0.175572,0.654666,0.025337,0.013627,0.04839,-0.003117,0.017991
Last_performance_score,-0.062042,-0.175572,1.0,-0.190333,0.27635,0.026587,0.057836,-0.005428,0.11969
Year_of_recruitment,0.056215,0.654666,-0.190333,1.0,0.07691,0.041995,0.037477,-0.00355,0.012287
Targets_met,-0.044789,0.025337,0.27635,0.07691,1.0,0.092934,0.077201,-0.003308,0.224518
Previous_Award,-0.007409,0.013627,0.026587,0.041995,0.092934,1.0,0.07236,0.003887,0.201434
Training_score_average,0.041065,0.04839,0.057836,0.037477,0.077201,0.07236,1.0,0.008194,0.178448
No_of_previous_employers,0.000796,-0.003117,-0.005428,-0.00355,-0.003308,0.003887,0.008194,1.0,0.00169
Promoted_or_Not,-0.024345,0.017991,0.11969,0.012287,0.224518,0.201434,0.178448,0.00169,1.0


In [21]:
sns.countplot(x='Promoted_or_Not', data=df, hue='Promoted_or_Not', palette='Set1')
plt.title('promotion distribution')
plt.show()

  plt.show()


In [22]:
plt.boxplot(df['Training_score_average'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Targets_met')
plt.title('Box Plot')
plt.show()

  plt.show()


In [23]:
plt.boxplot(df['Last_performance_score'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Last_performance_score')
plt.title('Box Plot')
plt.show()

  plt.show()


In [24]:
plt.boxplot(df['Year_of_recruitment'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Year_of_recruitment')
plt.title('corr Plot in search of outliers')
plt.show()

  plt.show()


In [25]:
# feature engineering - creating new feature 'Age' from 'Year_of_birth' to drop redundant data
df["Age"] = 2025 - df["Year_of_birth"]
df.head()



Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not,Age
0,YAK/S/00001,Commercial Sales and Marketing,MSc MBA and PhD,Female,Direct Internal process,2,1986,12.5,2011,1,0,41,ANAMBRA,No,Married,No,No,0,0,39
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,ANAMBRA,Yes,Married,No,No,0,0,34
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,KATSINA,Yes,Married,No,No,0,0,38
3,YAK/S/00004,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,1982,2.5,2009,0,0,42,NIGER,Yes,Single,No,No,1,0,43
4,YAK/S/00006,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,1990,7.5,2012,0,0,77,AKWA IBOM,Yes,Married,No,No,1,0,35


In [None]:
#separating features and target variable
X = df[['Division', 'Qualification', 'Gender', 'Channel_of_Recruitment','Age', 'Trainings_Attended', 'Last_performance_score', 'Year_of_recruitment', 'Targets_met', 'Previous_Award', 'Training_score_average', 'Foreign_schooled', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement', 'No_of_previous_employers']]
Y = df['Promoted_or_Not']


In [27]:
# scaling numerical features
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

x1 = X.copy()

scaler = MinMaxScaler(feature_range=(0, 1))

num_col_ = [col for col in X.columns if X[col].dtype != 'object']

num_cols = x1.select_dtypes(include=['int64', 'float64']).columns
x1[num_col_] = scaler.fit_transform(x1[num_col_])

# encoding categorical features
cat_cols = x1.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_cols:
    x1[col] = le.fit_transform(x1[col])
x1.head()

Unnamed: 0,Division,Qualification,Gender,Channel_of_Recruitment,Age,Trainings_Attended,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,Foreign_schooled,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers
0,1,1,0,1,0.294118,0.0,1.0,0.805556,1.0,0.0,0.166667,0,0,0,0.0
1,2,0,1,0,0.196078,0.0,1.0,0.916667,0.0,0.0,0.35,1,0,0,0.0
2,1,0,1,1,0.27451,0.0,0.6,0.833333,0.0,0.0,0.183333,1,0,0,0.0
3,1,0,1,0,0.372549,0.111111,0.2,0.75,0.0,0.0,0.183333,1,0,0,0.166667
4,4,0,1,1,0.215686,0.111111,0.6,0.833333,0.0,0.0,0.766667,1,0,0,0.166667


In [28]:
# standardizing numerical features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(x1)
print(X_standardized[:5])

[[-0.83158796  1.41479513 -1.53339317  0.99209997  0.02737841 -0.41625517
   1.28229978 -0.5021114   1.35384256 -0.15395043 -1.07512768 -3.24810652
  -0.06290405 -0.32218928 -0.84238477]
 [-0.41929682 -0.61751078  0.65214846 -0.86183301 -0.62656278 -0.41625517
   1.28229978  0.43654831 -0.73863832 -0.15395043 -0.25193251  0.30787168
  -0.06290405 -0.32218928 -0.84238477]
 [-0.83158796 -0.61751078  0.65214846  0.99209997 -0.10340983 -0.41625517
  -0.05313941 -0.26744648 -0.73863832 -0.15395043 -1.00029176  0.30787168
  -0.06290405 -0.32218928 -0.84238477]
 [-0.83158796 -0.61751078  0.65214846 -0.86183301  0.55053137  1.22460994
  -1.3885786  -0.97144126 -0.73863832 -0.15395043 -1.00029176  0.30787168
  -0.06290405 -0.32218928 -0.03314114]
 [ 0.40528546 -0.61751078  0.65214846  0.99209997 -0.49577454  1.22460994
  -0.05313941 -0.26744648 -0.73863832 -0.15395043  1.61896561  0.30787168
  -0.06290405 -0.32218928 -0.03314114]]


In [29]:
# preparing for train-test split
categorical_features_X = X.select_dtypes(include=['object']).columns.tolist()
numerical_features_X = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features_X),
        ("num", StandardScaler(), numerical_features_X)
    ]
)

In [30]:
# train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [31]:
# building random forest model
rf_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        class_weight='balanced'  # handle imbalanced classes
    ))
])
# Fit
rf_model.fit(X_train, Y_train)

In [32]:
# accuracy and classification report
rf_model.fit(X_train, Y_train)

rf_preds = rf_model.predict(X_test)

print("\n=== RANDOM FOREST RESULTS ===")
print("Accuracy:", accuracy_score(Y_test, rf_preds))
print(classification_report(Y_test, rf_preds))


=== RANDOM FOREST RESULTS ===
Accuracy: 0.9323449571935686
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      8768
           1       0.84      0.25      0.38       810

    accuracy                           0.93      9578
   macro avg       0.89      0.62      0.67      9578
weighted avg       0.93      0.93      0.91      9578



In [1]:
import xgboost
from xgboost import XGBClassifier

In [4]:
xgb_model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", XGBClassifier(
        n_estimators=350,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    ))
])

NameError: name 'Pipeline' is not defined

In [5]:
import sys
print(sys.executable)


c:\Users\HP\AppData\Local\Python\pythoncore-3.14-64\python.exe


In [9]:
import sys
print(sys.executable)

c:\Users\HP\AppData\Local\Python\pythoncore-3.14-64\python.exe


In [8]:
!py -m pip install xgboost


