In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import mlflow
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import re

In [2]:
# initialize mlflow autologging
mlflow.set_experiment('Job Recommender')
mlflow.sklearn.autolog()

  return FileStore(store_uri, store_uri)


In [52]:
# load dataset
df = pd.read_csv('C:/Users/ebina/Documents/GitHub/SkillGapPipeline/data/all_job_post.csv')
df.head()

Unnamed: 0,job_id,category,job_title,job_description,job_skill_set
0,3902668440,HR,Sr Human Resource Generalist,SUMMARY\nTHE SR. HR GENERALIST PROVIDES HR EXP...,"['employee relations', 'talent acquisition', '..."
1,3905823748,HR,Human Resources Manager,BE PART OF A STELLAR TEAM AT YSB AS THE MANAGE...,"['Talent Acquisition', 'Employee Performance M..."
2,3905854799,HR,Director of Human Resources,OUR CLIENT IS A THRIVING ORGANIZATION OFFERING...,"['Human Resources Management', 'Recruitment', ..."
3,3905834061,HR,Chief Human Resources Officer,JOB TITLE: CHIEF HUMAN RESOURCES OFFICER (CHRO...,"['talent management', 'organizational developm..."
4,3906250451,HR,Human Resources Generalist (Hybrid Role),DESCRIPTION\n\n WHO WE ARE \n\nAVI-SPL IS A DI...,"['Microsoft Office', 'Data analysis', 'Employe..."


In [38]:
# look for missing 
df.isnull().sum()

print('Dataset Size: ', df.shape)

Dataset Size:  (1167, 5)


In [42]:
# basic cleaning
df = df.dropna(subset=['job_skill_set', 'job_title'])

In [50]:
# basic cleaning
def clean_skills(text):
    text = text.lower()
    #text = re.sub(r'[^a-z0-9,]',  '', text)
    #text = re.sub(r'[a-z]', '', text)
    text = text.replace('js', 'javascript')
    job_skill_set = sorted(list((set(text.split(',')))))
    return ','.join([s.strip() for s in job_skill_set if s.strip()])    

In [51]:
# run clean_skills function

df['job_skill_set'] = df['job_skill_set'].apply(clean_skills)   

In [49]:
df

Unnamed: 0,job_id,category,job_title,job_description,job_skill_set
0,3902668440,HR,Sr Human Resource Generalist,SUMMARY\nTHE SR. HR GENERALIST PROVIDES HR EXP...,"' ',' ',' ',' '],'',[' '"
1,3905823748,HR,Human Resources Manager,BE PART OF A STELLAR TEAM AT YSB AS THE MANAGE...,"' ',' ','','- '],[' '"
2,3905854799,HR,Director of Human Resources,OUR CLIENT IS A THRIVING ORGANIZATION OFFERING...,"' ',' ',' & ',' ','',''],[' '"
3,3905834061,HR,Chief Human Resources Officer,JOB TITLE: CHIEF HUMAN RESOURCES OFFICER (CHRO...,"' ',' ',' ',' '],'','- ',[' '"
4,3906250451,HR,Human Resources Generalist (Hybrid Role),DESCRIPTION\n\n WHO WE ARE \n\nAVI-SPL IS A DI...,"' ',' ','',''],[' '"
...,...,...,...,...,...
1162,3905299905,BUSINESS-DEVELOPMENT,Intern - Business Development,REQUIREMENTS\n\n DESCRIPTION & REQUIREMENTS \n...,"' ',' '],' ','',[' '"
1163,3885829894,BUSINESS-DEVELOPMENT,Business Development Representative,IT'S FUN TO WORK IN A COMPANY WHERE PEOPLE TRU...,"' ',' ','',''],'.',[''"
1164,3901649881,BUSINESS-DEVELOPMENT,Enterprise Business Development Representative...,JOIN OUR DYNAMIC AI TEAM AS AN ENTERPRISE BUSI...,"' '],' ','','- ','-',[''"
1165,3904049863,BUSINESS-DEVELOPMENT,Senior Director Business Development,ROOM 8 GROUP IS THE WORLD’S FASTEST GROWING ST...,"' ',' '],' ','',[' '"


In [13]:
# validate job_titles, any job title appearing <= 1 is removed. Reomved because it is essentially noise for out XGBoost model

counts = df['job_title'].value_counts()
valid_classes = counts[counts > 1].index

data = df[df['job_title'].isin(valid_classes)]

In [14]:
# encode job titles
label_encoder = LabelEncoder()
data['job_title_encoded'] = label_encoder.fit_transform(data['job_title'])
data[['job_title', 'job_title_encoded']].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['job_title_encoded'] = label_encoder.fit_transform(data['job_title'])


Unnamed: 0,job_title,job_title_encoded
1,Human Resources Manager,35
2,Director of Human Resources,13
5,Human Resources Manager,35
6,Human Resources Generalist,31
7,Human Resources Information System Specialist,33
8,Human Resources Generalist,31
9,Human Resources Project Manager,36
10,Human Resources Information System Partner III,32
11,Human Resources Specialist,37
12,Human Resources Generalist,31


In [15]:
# split into train/test
X = data['job_skill_set']
y = data['job_title_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape


((433,), (109,))

In [21]:
## Vectorize skills

vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=2, max_features=4000)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_vec.shape, X_test_vec.shape

((433, 1480), (109, 1480))

In [22]:
# train model
num_classes = len(np.unique(y_train))

model = XGBClassifier(
    objective='multi:softprob',
    num_classes=num_classes,
    eval_metric='mlogloss',
    learning_rate=0.05,
    max_depth=8,
    n_estimators=400,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='hist',
    device='cuda'               
)

model.fit(X_train_vec, y_train)



Parameters: { "num_classes" } are not used.



In [28]:
import inspect

print("--- Type Diagnostic ---")
print(f"Type of TfidfVectorizer class: {type(TfidfVectorizer)}")
print(f"Type of XGBClassifier class: {type(XGBClassifier)}")
print(f"Type of LabelEncoder class: {type(LabelEncoder)}")
print(f"Type of train_test_split function: {type(train_test_split)}")
print("-" * 25)

# Assuming you've already instantiated these
try:
    print(f"Type of 'model' instance: {type(model)}")
    print(f"Type of 'vectorizer' instance: {type(vectorizer)}")
    print(f"Type of 'label_encoder' instance: {type(label_encoder)}")
except NameError:
    print("Instance variables (model, vectorizer, etc.) not yet defined.")

--- Type Diagnostic ---
Type of TfidfVectorizer class: <class 'type'>
Type of XGBClassifier class: <class 'type'>
Type of LabelEncoder class: <class 'type'>
Type of train_test_split function: <class 'function'>
-------------------------
Type of 'model' instance: <class 'xgboost.sklearn.XGBClassifier'>
Type of 'vectorizer' instance: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Type of 'label_encoder' instance: <class 'sklearn.preprocessing._label.LabelEncoder'>


In [None]:
# model evaluation
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print('accuracy: ', accuracy)

labels = np.unique(y_test)
print('\nClassification Report: ')
#classification_report = classification_report(y_test, y_pred, labels=labels, target_names=label_encoder.classes_)
#print('\n', classification_report)

accuracy:  0.13761467889908258

Classification Report: 


TypeError: 'str' object is not callable

In [13]:
# save model and encoder
joblib.dump(model, 'xgb_job_title_recommender.pkl')
joblib.dump(vectorizer, 'skills_tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'job_title_label_encoder.pkl')

['job_title_label_encoder.pkl']

In [14]:
# load model for use sample code
model = joblib.load('C:/Users/ebina/Documents/GitHub/SkillGapPipeline/models/xgb_job_title_recommender.pkl')
vectorizer = joblib.load('C:/Users/ebina/Documents/GitHub/SkillGapPipeline/models/skills_tfidf_vectorizer.pkl')
label_encoder = joblib.load('C:/Users/ebina/Documents/GitHub/SkillGapPipeline/models/job_title_label_encoder.pkl')

In [15]:
# function to test preds

def predict_titles(user_skill_text, top_k=3):
    '''
    Given a skillset, return the top k most likely job_titles
    '''  
    vec = vectorizer.transform([user_skill_text.lower()])
    probs = model.predict_proba(vec)[0]
    
    top_indices = np.argsort(probs)[::-1][:top_k]
    top_titles = label_encoder.inverse_transform(top_indices)
    top_scores = probs[top_indices]  
    
    return list(zip(top_titles, top_scores))

In [16]:
# log accuracy
mlflow.log_metric('test_accuracy', accuracy)

#log classification report
mlflow.log_text(classification_report, 'classification_report.txt') 
mlflow.log_dict(classification_report, 'classification_report.json')

In [17]:
#log model on mlflow
mlflow.sklearn.log_model(model, 'xgb_job_title_recommender_model')



<mlflow.models.model.ModelInfo at 0x24d7549ae70>

In [18]:
# run a prediction
user_input = 'python, java, css, html'
results = predict_titles(user_input)

print('Recommended job titles: ')
for titles, score in results:
    print(f'- {titles} (confidence: {score:.3f})')

Recommended job titles: 
- Business Development Specialist (confidence: 0.051)
- Business Development Center Representative (confidence: 0.044)
- Information Technology Security Administrator (confidence: 0.035)
