In [1]:
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import joblib

In [2]:
# Artifact locations. Both paths are relative, so they resolve against the
# kernel's current working directory.
# DATA_FILE: CSV holding the career/skills dataset (created from the built-in
# sample when it does not exist yet).
DATA_FILE = Path("data/sample.csv")
# MODEL_FILE: destination of the joblib-serialized recommender bundle.
MODEL_FILE = Path("model/recommender.pkl")

In [4]:
# Built-in sample catalog used to seed DATA_FILE when no CSV exists yet.
# Each record is a (career, skills) pair; `skills` is one comma-separated
# string, and multi-word skills are written with spaces (e.g. "machine learning").
DEFAULT_DATA = [
    ("Software Engineer", "python, algorithms, data structures, git, sql, problem solving"),
    ("Data Scientist", "python, pandas, numpy, machine learning, statistics, sklearn, sql"),
    ("Web Developer", "html, css, javascript, react, git, web"),
    ("ML Engineer", "python, sklearn, tensorflow, mlops, docker, model deployment"),
    ("Business Analyst", "excel, sql, data visualization, tableau, communication, statistics"),
    ("DevOps Engineer", "linux, docker, ci cd, kubernetes, aws, bash, git"),
    ("Cybersecurity Analyst", "networking, linux, security, risk, monitoring, scripting"),
    ("UI/UX Designer", "figma, design, prototyping, user research, wireframing, css"),
    ("Cloud Architect", "aws, azure, gcp, cloud, networking, security, terraform"),
    ("Database Admin", "sql, postgres, backup, performance tuning, normalization")
]

In [6]:
# Make sure the folders that will hold the dataset and the model exist.
# `exist_ok=True` keeps this cell safe to re-run.
for artifact_path in (DATA_FILE, MODEL_FILE):
    artifact_path.parent.mkdir(parents=True, exist_ok=True)

In [7]:
# Load the career/skills dataset, seeding DATA_FILE with the built-in sample
# on the first run. The later cells read exactly these two columns.
REQUIRED_COLUMNS = ["career", "skills"]

if not DATA_FILE.exists():
    df = pd.DataFrame(DEFAULT_DATA, columns=REQUIRED_COLUMNS)
    df.to_csv(DATA_FILE, index=False)
    print(f"Saved sample dataset → {DATA_FILE}")
else:
    df = pd.read_csv(DATA_FILE)
    print(f"Loaded dataset → {DATA_FILE}")

    # An existing CSV may have been edited or replaced by hand. Fail here with
    # a clear message instead of with a KeyError in a later cell.
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_columns:
        raise ValueError(
            f"{DATA_FILE} is missing required column(s): {missing_columns}"
        )

    # Blank cells are read as NaN; converting them to text later would inject
    # the literal string "nan" as a skill/career, so drop incomplete rows and
    # renumber so positional lookups stay aligned with the fitted model.
    n_rows_loaded = len(df)
    df = df.dropna(subset=REQUIRED_COLUMNS).reset_index(drop=True)
    n_rows_dropped = n_rows_loaded - len(df)
    if n_rows_dropped:
        print(f"Dropped {n_rows_dropped} row(s) with a missing career or skills value")

    if df.empty:
        raise ValueError(f"{DATA_FILE} contains no usable rows")

df.head()

Saved sample dataset → data\sample.csv


Unnamed: 0,career,skills
0,Software Engineer,"python, algorithms, data structures, git, sql,..."
1,Data Scientist,"python, pandas, numpy, machine learning, stati..."
2,Web Developer,"html, css, javascript, react, git, web"
3,ML Engineer,"python, sklearn, tensorflow, mlops, docker, mo..."
4,Business Analyst,"excel, sql, data visualization, tableau, commu..."


In [8]:
# 🔹 Turn each career's skills string into a TF-IDF vector
# Tokens are runs of letters, '+' and '#', so names such as "c++" or "c#"
# stay intact while commas and spaces act as separators.
skill_documents = df["skills"].astype(str)
vectorizer = TfidfVectorizer(token_pattern=r"[a-zA-Z\+\#]+")
X = vectorizer.fit_transform(skill_documents)

print("TF-IDF features shape:", X.shape)


TF-IDF features shape: (10, 54)


In [9]:
# 🔹 Build the cosine-distance nearest-neighbour index over the TF-IDF matrix
# `fit` returns the estimator itself, so construction and fitting can be chained.
nn = NearestNeighbors(metric="cosine").fit(X)
print("NearestNeighbors model trained successfully!")


NearestNeighbors model trained successfully!


In [10]:
# 🔹 Assemble everything needed to serve recommendations into one dict
# Split every skills string on commas and trim the pieces ...
skill_pieces = (
    piece.strip()
    for skills_entry in df["skills"]
    for piece in str(skills_entry).split(",")
)
# ... then keep the distinct, non-empty ones in alphabetical order.
unique_skills = {piece for piece in skill_pieces if piece}

model_bundle = {
    "vectorizer": vectorizer,
    "nn": nn,
    "careers": df["career"].tolist(),
    "catalog_skills": sorted(unique_skills),
}

print("Model bundle prepared!")


Model bundle prepared!


In [11]:
# 🔹 Save model to disk using joblib
# Serializes the whole bundle (vectorizer, neighbour index, career list and
# skill catalog) into the single file at MODEL_FILE.
# NOTE(review): the bundle holds fitted scikit-learn objects, so loading it
# elsewhere presumably needs a compatible scikit-learn version — confirm.
joblib.dump(model_bundle, MODEL_FILE)
print(f"Saved trained model → {MODEL_FILE}")


Saved trained model → model\recommender.pkl


In [12]:
# 🔹 Optional: Test the model with sample skills
# Build a query in the same comma-separated format as the training data and
# look up the closest careers by cosine distance.
sample_skills = ["python", "pandas", "sql"]
skill_text = ", ".join(sample_skills).lower()
vec = vectorizer.transform([skill_text])

# kneighbors raises ValueError when asked for more neighbours than there are
# fitted rows, so cap the request at the dataset size (X has one row per career).
top_k = min(3, X.shape[0])
distances, idx = nn.kneighbors(vec, n_neighbors=top_k)

# idx[0] holds row positions into the fitted matrix, nearest first.
recommendations = [df["career"].iloc[i] for i in idx[0]]
print("Sample Skills:", sample_skills)
print("Recommended Careers:", recommendations)


Sample Skills: ['python', 'pandas', 'sql']
Recommended Careers: ['Data Scientist', 'Software Engineer', 'ML Engineer']
