In [1]:
import random
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

np.random.seed(42)
random.seed(42)
data_path = Path('..') / 'data' / 'synthetic_career_data.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,id,skills,education,experience_years,role
0,1,"flask, gcp, mongodb_atlas, postgresql, rails, ...",Bachelors,2,Backend Developer
1,2,"aws, bash, cpp, figma, powershell, r",Bootcamp,8,Systems Administrator
2,3,"css, cybersecurity, figma, firebase, hadoop, n...",Bootcamp,4,Cybersecurity Analyst
3,4,"aws, devops, pytest, supabase",Bootcamp,8,DevOps Engineer
4,5,"apache, html, kubernetes, linux, python, sql, ...",Bachelors,1,Data Analyst


In [2]:
# Basic sanity checks
required_cols = {'id','skills','education','experience_years','role'}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f'Missing columns: {missing}')
df['skills'] = df['skills'].fillna('')
df['role'] = df['role'].fillna('')
len(df), df['role'].nunique()

(2500, 20)

In [4]:
# Train/val split
X_train, X_val, y_train, y_val = train_test_split(
    df['skills'], df['role'], test_size=0.2, stratify=df['role'], random_state=42
)

def clean_text(text: str) -> str:
    return str(text).lower().replace(',', ' ')

labeler = LabelEncoder()
Y_train_enc = labeler.fit_transform(y_train)
Y_val_enc = labeler.transform(y_val)

pipeline = make_pipeline(
    CountVectorizer(
        preprocessor=clean_text,
        token_pattern=r'[a-zA-Z0-9_\+#\.]+',
        min_df=1
    ),
    LogisticRegression(max_iter=400, multi_class='multinomial')
)
pipeline.fit(X_train, Y_train_enc)



0,1,2
,steps,"[('countvectorizer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,<function cle...001D206015A20>
,tokenizer,
,stop_words,
,token_pattern,'[a-zA-Z0-9_\\+#\\.]+'
,ngram_range,"(1, ...)"

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,400


In [5]:
# Evaluate top-5 accuracy
probs = pipeline.predict_proba(X_val)
top5_idx = np.argsort(probs, axis=1)[:, -5:]
hits = sum(1 for i, true_lab in enumerate(Y_val_enc) if true_lab in top5_idx[i])
top5_acc = hits / len(Y_val_enc)
top1_acc = (pipeline.predict(X_val) == Y_val_enc).mean()
top1_acc, top5_acc

(np.float64(0.494), 0.902)

In [6]:
# Persist artifacts
models_dir = Path('..') / 'models'
models_dir.mkdir(exist_ok=True)
artifacts = {
    'pipeline': pipeline,
    'label_encoder': labeler,
    'top1_acc': float(top1_acc),
    'top5_acc': float(top5_acc)
}
model_path = models_dir / 'role_matcher.joblib'
joblib.dump(artifacts, model_path)
model_path

WindowsPath('../models/role_matcher.joblib')

In [7]:
# Inference helper
def predict_roles(skills_list, top_k=5):
    text = ', '.join(skills_list)
    probs = pipeline.predict_proba([text])[0]
    top_idx = np.argsort(probs)[-top_k:][::-1]
    roles = labeler.inverse_transform(top_idx)
    scores = probs[top_idx]
    return list(zip(roles, scores))

predict_roles(['python','django','postgresql','docker','aws'])

[('Backend Developer', np.float64(0.43362263669992357)),
 ('Full Stack Developer', np.float64(0.255069897081921)),
 ('Machine Learning Engineer', np.float64(0.10152030602481787)),
 ('DevOps Engineer', np.float64(0.07679932148333651)),
 ('Software Developer', np.float64(0.057746773494223226))]