In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
import mlflow
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import re

In [4]:
# initialize mlflow autologging
mlflow.set_experiment('Job Recommender 2.0')
#mlflow.sklearn.autolog()

  return FileStore(store_uri, store_uri)


<Experiment: artifact_location='file:///c:/Users/ebina/Documents/GitHub/SkillGapPipeline/notebooks/mlruns/182739913865806057', creation_time=1763377093387, experiment_id='182739913865806057', last_update_time=1763377093387, lifecycle_stage='active', name='Job Recommender 2.0', tags={}>

In [5]:
# load dataset
df = pd.read_csv('C:/Users/ebina/Documents/GitHub/SkillGapPipeline/data/all_job_post.csv')
df.head()

Unnamed: 0,job_id,category,job_title,job_description,job_skill_set
0,3902668440,HR,Sr Human Resource Generalist,SUMMARY\nTHE SR. HR GENERALIST PROVIDES HR EXP...,"['employee relations', 'talent acquisition', '..."
1,3905823748,HR,Human Resources Manager,BE PART OF A STELLAR TEAM AT YSB AS THE MANAGE...,"['Talent Acquisition', 'Employee Performance M..."
2,3905854799,HR,Director of Human Resources,OUR CLIENT IS A THRIVING ORGANIZATION OFFERING...,"['Human Resources Management', 'Recruitment', ..."
3,3905834061,HR,Chief Human Resources Officer,JOB TITLE: CHIEF HUMAN RESOURCES OFFICER (CHRO...,"['talent management', 'organizational developm..."
4,3906250451,HR,Human Resources Generalist (Hybrid Role),DESCRIPTION\n\n WHO WE ARE \n\nAVI-SPL IS A DI...,"['Microsoft Office', 'Data analysis', 'Employe..."


In [6]:
# look for missing 
df.isnull().sum()

print('Dataset Size: ', df.shape)

Dataset Size:  (1167, 5)


In [None]:
# basic cleaning
def clean_skills(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9,]', '', text)
    text = text.replace('js', 'javascript')
    job_skill_set = sorted(list((set(text.split(',')))))
    return ','.join([s.strip() for s in job_skill_set if s.strip()])    

In [None]:
# run clean_skills function

df['job_skill_set'] = df['job_skill_set'].apply(clean_skills)   

In [12]:
# validate job_titles, any job title appearing <= 1 is removed. Reomved because it is essentially noise for out XGBoost model

counts = df['job_title'].value_counts()
valid_classes = counts[counts > 1].index

data = df[df['job_title'].isin(valid_classes)]

In [13]:
data

Unnamed: 0,job_id,category,job_title,job_description,job_skill_set
1,3905823748,HR,Human Resources Manager,BE PART OF A STELLAR TEAM AT YSB AS THE MANAGE...,"adaptability,attentiontodetail,backgroundcheck..."
2,3905854799,HR,Director of Human Resources,OUR CLIENT IS A THRIVING ORGANIZATION OFFERING...,"adaptability,attentiontodetail,collaboration,c..."
5,3901389277,HR,Human Resources Manager,JOB DESCRIPTION: · THE HR MANAGER WILL SUPPORT...,"attentiontodetail,benefitsadministration,commu..."
6,3902348043,HR,Human Resources Generalist,DRIVE YOUR FUTURE WITH TURN 14 DISTRIBUTION! N...,"attentiontodetail,benefitsadministration,colla..."
7,3906258424,HR,Human Resources Information System Specialist,THE ROLE OF AN HR INFORMATION SYSTEMS SPECIALI...,"adphris,attentiontodetail,businessrequirements..."
...,...,...,...,...,...
1156,3899524220,BUSINESS-DEVELOPMENT,Business Development Manager,BUSINESS DEVELOPMENT MANAGER – COMMERCIAL STAF...,"b2bsales,collaboration,communication,consultat..."
1158,3885831622,BUSINESS-DEVELOPMENT,Business Development Representative,IT'S FUN TO WORK IN A COMPANY WHERE PEOPLE TRU...,"adaptability,collaboration,communication,creat..."
1159,3902951563,BUSINESS-DEVELOPMENT,Business Development Manager,IF YOU WANT TO RESHAPE THE WORLD AND DISCOVER ...,"adaptability,analyticcapability,b2bsales,bluep..."
1161,3902782563,BUSINESS-DEVELOPMENT,Business Development Representative,COMPANY DESCRIPTIONTRU LEAF INTERNATIONAL IS A...,"businessdevelopment,collaborativework,communic..."


In [14]:
# encode job titles
label_encoder = LabelEncoder()
data['job_title_encoded'] = label_encoder.fit_transform(data['job_title'])
data[['job_title', 'job_title_encoded']].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['job_title_encoded'] = label_encoder.fit_transform(data['job_title'])


Unnamed: 0,job_title,job_title_encoded
1,Human Resources Manager,35
2,Director of Human Resources,13
5,Human Resources Manager,35
6,Human Resources Generalist,31
7,Human Resources Information System Specialist,33
8,Human Resources Generalist,31
9,Human Resources Project Manager,36
10,Human Resources Information System Partner III,32
11,Human Resources Specialist,37
12,Human Resources Generalist,31


In [15]:
# split into train/test
X = data['job_skill_set']
y = data['job_title_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape


((433,), (109,))

In [16]:
## Vectorize skills

vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=2, max_features=9000)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_vec.shape, X_test_vec.shape

((433, 1919), (109, 1919))

In [17]:
# enable autologging with mlfow
mlflow.sklearn.autolog()

In [None]:
# train model
num_classes = len(np.unique(y_train))

model = XGBClassifier(
    objective='multi:softprob',
    num_classes=num_classes,
    eval_metric='mlogloss',
    #learning_rate=0.01,
    #max_depth=8,
    #n_estimators=1000,
    #subsample=0.8,
    #colsample_bytree=0.8,
    random_state=42,
    #tree_method='gpu_hist'
    tree_method='hist'               
)

# hyperparameter tuning using random search

params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'n_estimators': [100, 300, 400, 500],
    'max_depth': [5, 7, 9],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8]
}

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=30,
    scoring='accuracy',
    cv=2,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# execute hyperparameter tuning
random_search.fit(X_train_vec, y_train)

best_model = random_search.best_estimator_

# log best model with mlflow
with mlflow.start_run(run_name='Hyperparameter tuning for models'):
    mlflow.log_params(random_search.best_params_)
    mlflow.log_metric("best_cv_accuracy", random_search.best_score_)
    mlflow.sklearn.log_model(best_model, 'tuned-xgb_model')


2025/11/17 12:55:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '87f9592ac0c9467e8b8735286b66352d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 5 folds for each of 30 candidates, totalling 150 fits


KeyboardInterrupt: 

In [20]:
# train the best model
best_model.fit(X_train_vec, y_train)

Parameters: { "num_classes" } are not used.



In [22]:
# model evaluation
y_pred = best_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print('accuracy: ', accuracy)

labels = np.unique(y_test)
print('\nClassification Report: ')
classification_report = classification_report(y_test, y_pred, labels=labels, target_names=label_encoder.classes_)
print('\n', classification_report)

accuracy:  0.26605504587155965

Classification Report: 

                                                                                          precision    recall  f1-score   support

                                                   Business Development (Sales) Manager       0.00      0.00      0.00         1
                                                         Business Development Associate       0.00      0.00      0.00         1
                                             Business Development Center Representative       0.00      0.00      0.00         1
                                                          Business Development Director       0.00      0.00      0.00         1
                                                         Business Development Executive       0.36      0.73      0.48        11
                                                            Business Development Intern       0.18      0.43      0.25         7
                                      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# save model and encoder
joblib.dump(model, 'xgb_job_title_recommender.pkl')
joblib.dump(vectorizer, 'skills_tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'job_title_label_encoder.pkl')

['job_title_label_encoder.pkl']

In [None]:
# load model for use sample code
model = joblib.load('C:/Users/ebina/Documents/GitHub/SkillGapPipeline/models/xgb_job_title_recommender.pkl')
vectorizer = joblib.load('C:/Users/ebina/Documents/GitHub/SkillGapPipeline/models/skills_tfidf_vectorizer.pkl')
label_encoder = joblib.load('C:/Users/ebina/Documents/GitHub/SkillGapPipeline/models/job_title_label_encoder.pkl')

In [24]:
# function to test preds

def predict_titles(user_skill_text, top_k=3):
    '''
    Given a skillset, return the top k most likely job_titles
    '''  
    vec = vectorizer.transform([user_skill_text.lower()])
    probs = model.predict_proba(vec)[0]
    
    top_indices = np.argsort(probs)[::-1][:top_k]
    top_titles = label_encoder.inverse_transform(top_indices)
    top_scores = probs[top_indices]  
    
    return list(zip(top_titles, top_scores))

In [25]:
# log accuracy
mlflow.log_metric('test_accuracy', accuracy)

#log classification report
mlflow.log_text(classification_report, 'classification_report.txt') 
mlflow.log_dict(classification_report, 'classification_report.json')

In [27]:
#log model on mlflow
mlflow.sklearn.log_model(best_model, 'xgb_job_title_recommender_model')



<mlflow.models.model.ModelInfo at 0x24faede3cb0>

In [28]:
# run a prediction
user_input = 'python, java, css, html'
results = predict_titles(user_input)

print('Recommended job titles: ')
for titles, score in results:
    print(f'- {titles} (confidence: {score:.3f})')

NotFittedError: need to call fit or load_model beforehand