# Preparing Dataset 

In [14]:
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier




In [15]:
# Read the recruitment dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='recruitment_data.csv')
df.head(n=5)

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,HiringDecision
0,26,1,2,0,3,26.783828,48,78,91,1,1
1,39,1,4,12,3,25.862694,35,68,80,2,1
2,48,0,2,3,2,9.920805,20,67,13,2,0
3,34,1,2,5,2,6.407751,36,27,70,3,0
4,30,0,1,6,1,43.105343,23,52,85,2,0


# Check for null values

In [193]:
# Count missing values per column.
df.isna().sum()

Age                    0
Gender                 0
EducationLevel         0
ExperienceYears        0
PreviousCompanies      0
DistanceFromCompany    0
InterviewScore         0
SkillScore             0
PersonalityScore       0
RecruitmentStrategy    0
HiringDecision         0
dtype: int64

# Data Splitting 

In [16]:
# Target: the hiring decision column.
y = df['HiringDecision']

# Features: every remaining column.
x = df.drop(columns='HiringDecision')
x.head()

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy
0,26,1,2,0,3,26.783828,48,78,91,1
1,39,1,4,12,3,25.862694,35,68,80,2
2,48,0,2,3,2,9.920805,20,67,13,2
3,34,1,2,5,2,6.407751,36,27,70,3
4,30,0,1,6,1,43.105343,23,52,85,2


# Train/test split, feature selection, hyperparameter tuning, and modeling


In [17]:
# Hold out 30% of the rows as a test split; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y; the resampling cells later in
# this notebook suggest the classes are imbalanced, so stratify=y may be worth considering.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=123,
)

In [196]:
from sklearn.feature_selection import SelectKBest, f_classif
# Baseline (no resampling): for each feature count k, keep the k best features by
# ANOVA F-score on the training split, grid-search KNN hyperparameters with 5-fold
# CV, then report metrics for the best estimator on the held-out test split.
# NOTE(review): the test split is used to compare the different k values, so the
# best reported score is an optimistic estimate of generalisation.

# Define k values for feature selection
k_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Define parameter grid for KNN
# NOTE(review): 'minkowski' with KNeighborsClassifier's default p=2 is the same
# distance as 'euclidean', so those grid entries duplicate each other.
param_dist_knn = {
    'n_neighbors': [1, 3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

for k in k_values:
    # Fit the selector on the training split only, then apply the same column
    # mask to the test split.
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(x_train, y_train)
    X_test_selected = selector.transform(x_test)

    # Get the selected feature names
    selected_features = x.columns[selector.get_support()].tolist()

    # Tune KNN on the selected training features
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_dist_knn, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_selected, y_train)
    best_model = grid_search.best_estimator_

    # Evaluate the refit best estimator on the test split
    y_pred = best_model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)

    # Output the results
    print(f"Selected Features for k={k}: {selected_features}")
    print(f"Accuracy for k={k}: {accuracy}")
    print(f"Best parameters for k={k}: {grid_search.best_params_}")
    # Print evaluation metrics (accuracy is reused rather than recomputed)
    print('Accuracy: %.3f' % accuracy)
    print('Recall: %.3f' % recall_score(y_test, y_pred, average='weighted'))
    # zero_division=0 keeps the value the default produces when a class is never
    # predicted, but without emitting an UndefinedMetricWarning.
    print('Precision: %.3f' % precision_score(y_test, y_pred, average='weighted', zero_division=0))
    print('F1 Score: %.3f' % f1_score(y_test, y_pred, average='weighted'))
    if hasattr(best_model, "predict_proba"):
        y_proba = best_model.predict_proba(X_test_selected)
        if y_proba.shape[1] == 2:  # Binary classification case: score the positive-class column
            roc_auc = roc_auc_score(y_test, y_proba[:, 1])
        else:  # Multi-class classification case
            roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
        print('ROC AUC Score: %.3f' % roc_auc)
    else:
        print("ROC AUC Score: N/A (best model does not support probability estimates)")

    print()


    

Selected Features for k=1: ['RecruitmentStrategy']
Accuracy for k=1: 0.8333333333333334
Best parameters for k=1: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Accuracy: 0.833
Recall: 0.833
Precision: 0.832
F1 Score: 0.833
ROC AUC Score: 0.793

Selected Features for k=2: ['EducationLevel', 'RecruitmentStrategy']
Accuracy for k=2: 0.8333333333333334
Best parameters for k=2: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
Accuracy: 0.833
Recall: 0.833
Precision: 0.832
F1 Score: 0.833
ROC AUC Score: 0.817

Selected Features for k=3: ['EducationLevel', 'SkillScore', 'RecruitmentStrategy']
Accuracy for k=3: 0.8066666666666666
Best parameters for k=3: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'uniform'}
Accuracy: 0.807
Recall: 0.807
Precision: 0.799
F1 Score: 0.799
ROC AUC Score: 0.788

Selected Features for k=4: ['EducationLevel', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=4: 0.7444444444444445
Best parameters for k=4:

In [25]:

import joblib
import pickle
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# Hard-coded configuration: one selected feature feeding a 3-neighbour,
# uniformly weighted, Euclidean KNN classifier.
best_k = 1
best_model = KNeighborsClassifier(n_neighbors=3, weights='uniform', metric='euclidean')
pipeline = Pipeline(steps=[
    ('feature_selection', SelectKBest(score_func=f_classif, k=best_k)),
    ('KNeighbors_Classifier', best_model),
])

# Train the whole pipeline (selection + classifier) on the training split
pipeline.fit(x_train, y_train)

# Score the fitted pipeline on the held-out test split
y_pred = pipeline.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

# Persist the fitted pipeline with pickle, wrapped in a dict under the 'model' key.
# best_k is not stored as a separate entry; it is only kept as the k of the
# pipeline's SelectKBest step.
model_filename = 'Knn_model_imb_fs.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(dict(model=pipeline), file)


Accuracy: 83.33%


# Apply SMOTE for oversampling

In [197]:

# Oversample the training split with SMOTE (seeded for reproducibility);
# the test split is left untouched.
smote = SMOTE(random_state=123)
x_sm, y_sm = smote.fit_resample(X=x_train, y=y_train)

# Class counts after resampling
y_sm.value_counts()

HiringDecision
1    714
0    714
Name: count, dtype: int64

# Oversampling Stage 

In [198]:

# Candidate feature counts to try with SelectKBest (1 through 10)
k_values = list(range(1, 11))

# Hyperparameter grid explored for the KNN classifier
param_dist_knn = dict(
    n_neighbors=[1, 3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'minkowski'],
)

# NOTE(review): SMOTE was applied before cross-validation, so the CV folds contain
# synthetic samples; CV scores here may be optimistic. The test split is also
# reused to compare the different k values.
for k in k_values:
    # Rank features on the SMOTE-resampled training data and keep the top k
    selector = SelectKBest(score_func=f_classif, k=k).fit(x_sm, y_sm)
    X_train_selected = selector.transform(x_sm)
    X_test_selected = selector.transform(x_test)

    # Column names that survived selection
    selected_features = x.columns[selector.get_support()].tolist()

    # 5-fold grid search over the KNN hyperparameters, scored by accuracy
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(
        estimator=knn,
        param_grid=param_dist_knn,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train_selected, y_sm)

    # Best refit estimator from the search
    best_model = grid_search.best_estimator_

    # Test-split predictions and accuracy
    y_pred = best_model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)

    # Report the outcome for this k
    print(
        f"Selected Features for k={k}: {selected_features}",
        f"Accuracy for k={k}: {accuracy}",
        f"Best parameters for k={k}: {grid_search.best_params_}",
        sep="\n",
    )



Selected Features for k=1: ['RecruitmentStrategy']
Accuracy for k=1: 0.7133333333333334
Best parameters for k=1: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Selected Features for k=2: ['SkillScore', 'RecruitmentStrategy']
Accuracy for k=2: 0.7911111111111111
Best parameters for k=2: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
Selected Features for k=3: ['SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=3: 0.6822222222222222
Best parameters for k=3: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Selected Features for k=4: ['EducationLevel', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=4: 0.68
Best parameters for k=4: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Selected Features for k=5: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=5: 0.6266666666666667
Best parameters for k=5: {'metric': 'manhatt

In [199]:
from sklearn.feature_selection import SelectKBest, f_classif
# SMOTE-oversampled training data: for each feature count k, keep the k best
# features by ANOVA F-score, grid-search KNN hyperparameters with 5-fold CV, then
# report metrics for the best estimator on the (non-resampled) test split.
# NOTE(review): SMOTE was applied before cross-validation, so the CV folds contain
# synthetic samples; CV scores here may be optimistic. The test split is also
# reused to compare the different k values.

# Define k values for feature selection
k_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Define parameter grid for KNN
# NOTE(review): 'minkowski' with KNeighborsClassifier's default p=2 is the same
# distance as 'euclidean', so those grid entries duplicate each other.
param_dist_knn = {
    'n_neighbors': [1, 3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

for k in k_values:
    # Fit the selector on the resampled training data, then apply the same
    # column mask to the test split.
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(x_sm, y_sm)
    X_test_selected = selector.transform(x_test)

    # Get the selected feature names
    selected_features = x.columns[selector.get_support()].tolist()

    # Tune KNN on the selected, resampled training features
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_dist_knn, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_selected, y_sm)
    best_model = grid_search.best_estimator_

    # Evaluate the refit best estimator on the test split
    y_pred = best_model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)

    # Output the results
    print(f"Selected Features for k={k}: {selected_features}")
    print(f"Accuracy for k={k}: {accuracy}")
    print(f"Best parameters for k={k}: {grid_search.best_params_}")
    # Print evaluation metrics (accuracy is reused rather than recomputed)
    print('Accuracy: %.3f' % accuracy)
    print('Recall: %.3f' % recall_score(y_test, y_pred, average='weighted'))
    # zero_division=0 keeps the value the default produces when a class is never
    # predicted, but without emitting an UndefinedMetricWarning.
    print('Precision: %.3f' % precision_score(y_test, y_pred, average='weighted', zero_division=0))
    print('F1 Score: %.3f' % f1_score(y_test, y_pred, average='weighted'))
    if hasattr(best_model, "predict_proba"):
        y_proba = best_model.predict_proba(X_test_selected)
        if y_proba.shape[1] == 2:  # Binary classification case: score the positive-class column
            roc_auc = roc_auc_score(y_test, y_proba[:, 1])
        else:  # Multi-class classification case
            roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
        print('ROC AUC Score: %.3f' % roc_auc)
    else:
        print("ROC AUC Score: N/A (best model does not support probability estimates)")

    print()
    

Selected Features for k=1: ['RecruitmentStrategy']
Accuracy for k=1: 0.7133333333333334
Best parameters for k=1: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Accuracy: 0.713
Recall: 0.713
Precision: 0.509
F1 Score: 0.594
ROC AUC Score: 0.793



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Selected Features for k=2: ['SkillScore', 'RecruitmentStrategy']
Accuracy for k=2: 0.7911111111111111
Best parameters for k=2: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
Accuracy: 0.791
Recall: 0.791
Precision: 0.799
F1 Score: 0.794
ROC AUC Score: 0.811

Selected Features for k=3: ['SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=3: 0.6822222222222222
Best parameters for k=3: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Accuracy: 0.682
Recall: 0.682
Precision: 0.710
F1 Score: 0.692
ROC AUC Score: 0.712

Selected Features for k=4: ['EducationLevel', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=4: 0.68
Best parameters for k=4: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Accuracy: 0.680
Recall: 0.680
Precision: 0.712
F1 Score: 0.691
ROC AUC Score: 0.710

Selected Features for k=5: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Acc

# Apply RandomUnderSampler for undersampling

In [200]:
# Randomly undersample the training split (seeded for reproducibility);
# the test split is left untouched.
undersample = RandomUnderSampler(random_state=123)
X_resampled, y_resampled = undersample.fit_resample(X=x_train, y=y_train)

# Class counts after resampling
y_resampled.value_counts()


HiringDecision
0    336
1    336
Name: count, dtype: int64

# Undersampling Stage 


In [201]:
# Candidate feature counts to try with SelectKBest (1 through 10)
k_values = list(range(1, 11))

# Hyperparameter grid explored for the KNN classifier
param_dist_knn = dict(
    n_neighbors=[1, 3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'minkowski'],
)

# NOTE(review): the test split is reused to compare the different k values, so
# the best reported accuracy is an optimistic estimate.
for k in k_values:
    # Rank features on the undersampled training data and keep the top k
    selector = SelectKBest(score_func=f_classif, k=k).fit(X_resampled, y_resampled)
    X_train_selected = selector.transform(X_resampled)
    X_test_selected = selector.transform(x_test)

    # Column names that survived selection
    selected_features = x.columns[selector.get_support()].tolist()

    # 5-fold grid search over the KNN hyperparameters, scored by accuracy
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(
        estimator=knn,
        param_grid=param_dist_knn,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train_selected, y_resampled)

    # Best refit estimator from the search
    best_model = grid_search.best_estimator_

    # Test-split predictions and accuracy
    y_pred = best_model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)

    # Report the outcome for this k
    print(
        f"Selected Features for k={k}: {selected_features}",
        f"Accuracy for k={k}: {accuracy}",
        f"Best parameters for k={k}: {grid_search.best_params_}",
        sep="\n",
    )



Selected Features for k=1: ['RecruitmentStrategy']
Accuracy for k=1: 0.7133333333333334
Best parameters for k=1: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
Selected Features for k=2: ['EducationLevel', 'RecruitmentStrategy']
Accuracy for k=2: 0.7822222222222223
Best parameters for k=2: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
Selected Features for k=3: ['EducationLevel', 'SkillScore', 'RecruitmentStrategy']
Accuracy for k=3: 0.7644444444444445
Best parameters for k=3: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Selected Features for k=4: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'RecruitmentStrategy']
Accuracy for k=4: 0.7222222222222222
Best parameters for k=4: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
Selected Features for k=5: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=5: 0.5933333333333334
Best parameters for k=5: {'met

# Evaluating the undersampled model with full metrics


In [202]:
from sklearn.feature_selection import SelectKBest, f_classif
# Randomly undersampled training data (X_resampled / y_resampled): for each
# feature count k, keep the k best features by ANOVA F-score, grid-search KNN
# hyperparameters with 5-fold CV, then report metrics for the best estimator on
# the (non-resampled) test split.
# NOTE(review): the test split is reused to compare the different k values, so
# the best reported score is an optimistic estimate of generalisation.

# Define k values for feature selection
k_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Define parameter grid for KNN
# NOTE(review): 'minkowski' with KNeighborsClassifier's default p=2 is the same
# distance as 'euclidean', so those grid entries duplicate each other.
param_dist_knn = {
    'n_neighbors': [1, 3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

for k in k_values:
    # Fit the selector on the undersampled training data, then apply the same
    # column mask to the test split.
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(X_resampled, y_resampled)
    X_test_selected = selector.transform(x_test)

    # Get the selected feature names
    selected_features = x.columns[selector.get_support()].tolist()

    # Tune KNN on the selected, undersampled training features
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_dist_knn, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_selected, y_resampled)
    best_model = grid_search.best_estimator_

    # Evaluate the refit best estimator on the test split
    y_pred = best_model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)

    # Output the results
    print(f"Selected Features for k={k}: {selected_features}")
    print(f"Accuracy for k={k}: {accuracy}")
    print(f"Best parameters for k={k}: {grid_search.best_params_}")
    # Print evaluation metrics (accuracy is reused rather than recomputed)
    print('Accuracy: %.3f' % accuracy)
    print('Recall: %.3f' % recall_score(y_test, y_pred, average='weighted'))
    # zero_division=0 keeps the value the default produces when a class is never
    # predicted, but without emitting an UndefinedMetricWarning.
    print('Precision: %.3f' % precision_score(y_test, y_pred, average='weighted', zero_division=0))
    print('F1 Score: %.3f' % f1_score(y_test, y_pred, average='weighted'))
    if hasattr(best_model, "predict_proba"):
        y_proba = best_model.predict_proba(X_test_selected)
        if y_proba.shape[1] == 2:  # Binary classification case: score the positive-class column
            roc_auc = roc_auc_score(y_test, y_proba[:, 1])
        else:  # Multi-class classification case
            roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
        print('ROC AUC Score: %.3f' % roc_auc)
    else:
        print("ROC AUC Score: N/A (best model does not support probability estimates)")

    print()
    

Selected Features for k=1: ['RecruitmentStrategy']
Accuracy for k=1: 0.7133333333333334
Best parameters for k=1: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
Accuracy: 0.713
Recall: 0.713
Precision: 0.509
F1 Score: 0.594
ROC AUC Score: 0.500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Selected Features for k=2: ['EducationLevel', 'RecruitmentStrategy']
Accuracy for k=2: 0.7822222222222223
Best parameters for k=2: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
Accuracy: 0.782
Recall: 0.782
Precision: 0.807
F1 Score: 0.737
ROC AUC Score: 0.663

Selected Features for k=3: ['EducationLevel', 'SkillScore', 'RecruitmentStrategy']
Accuracy for k=3: 0.7644444444444445
Best parameters for k=3: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Accuracy: 0.764
Recall: 0.764
Precision: 0.787
F1 Score: 0.772
ROC AUC Score: 0.778

Selected Features for k=4: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'RecruitmentStrategy']
Accuracy for k=4: 0.7222222222222222
Best parameters for k=4: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
Accuracy: 0.722
Recall: 0.722
Precision: 0.760
F1 Score: 0.733
ROC AUC Score: 0.773

Selected Features for k=5: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'PersonalityScore', 'Recruitment