# Smart Job Role Advisor – Baseline Notebook

In [1]:
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Dict, Any

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option("display.max_colwidth", 200)

def norm_col(name: str) -> str:
    """Normalize a column name to lowercase snake_case.

    The value is stringified, trimmed and lowercased; every run of
    whitespace collapses into a single underscore and each hyphen is
    replaced by an underscore.
    """
    cleaned = str(name).strip().lower()
    return re.sub(r"\s+", "_", cleaned).replace("-", "_")


## 1. Data inladen & verkennen


In [2]:
# Path to your own dataset (adjust if needed)
DATA_PATH = Path("job_postings_dataset.csv")

# sep=None with the python engine lets pandas sniff the delimiter (";" or ",")
df_raw = pd.read_csv(DATA_PATH, sep=None, engine="python")
# Work on a copy so the untouched raw frame stays available for re-runs
df = df_raw.copy()

# Normalize column names
df.columns = [norm_col(c) for c in df.columns]

print("Vorm van de data:", df.shape)
display(df.head())
print("\nKolominformatie:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the report.
df.info()
print("\nAantal missende waarden per kolom:")
print(df.isna().sum())


Vorm van de data: (5, 10)


Unnamed: 0,job_id,job_title,short_description,job_description,required_skills,role_tags,location,salary_range,difficulty,popularity_score
0,1,Data Analyst,Analyse business data and create dashboards.,"You will work with Python, SQL, BI tools and create insights.","Python, SQL, data visualization, analytics",data analytics,Amsterdam,€3000-€3800,2,80
1,2,Junior Software Engineer,Develop software features and fix bugs.,"You will work with Java, APIs, backend services and testing.","Java, Git, debugging, APIs",software engineering,Rotterdam,€2800-€3500,3,70
2,3,Marketing Specialist,Support campaigns and content creation.,"You will analyze market trends, create marketing content and ads.","SEO, social media, content writing",marketing,Utrecht,€2600-€3300,2,60
3,4,AI Research Intern,Assist in machine learning experiments.,"You will build ML models, preprocess data, and evaluate experiments.","Machine learning, Python, data preprocessing",artificial intelligence,Amsterdam,€500 allowance,4,90
4,5,UX Designer,Design user-friendly digital interfaces.,"You will create wireframes, prototypes, and improve usability.","Figma, UX research, prototyping",ux design,Eindhoven,€3200-€4000,3,75



Kolominformatie:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   job_id             5 non-null      int64 
 1   job_title          5 non-null      object
 2   short_description  5 non-null      object
 3   job_description    5 non-null      object
 4   required_skills    5 non-null      object
 5   role_tags          5 non-null      object
 6   location           5 non-null      object
 7   salary_range       5 non-null      object
 8   difficulty         5 non-null      int64 
 9   popularity_score   5 non-null      int64 
dtypes: int64(3), object(7)
memory usage: 532.0+ bytes
None

Aantal missende waarden per kolom:
job_id               0
job_title            0
short_description    0
job_description      0
required_skills      0
role_tags            0
location             0
salary_range         0
difficulty           0
popularity_score     0
dtype: int64

## 2. Belangrijke kolommen selecteren

We willen de code robuust maken voor kleine verschillen in kolomnamen.
Daarom werken we met **alias-lijsten** per soort kolom (titel, beschrijving, locatie, enz.).

In [3]:
# Maps each logical field name used by this notebook to the column names it
# may appear under in a dataset (English and Dutch variants). pick_col()
# normalizes both sides with norm_col() and tries the aliases in list order,
# so earlier aliases take precedence.
ALIASES: Dict[str, List[str]] = {
    "job_id": ["job_id", "id", "vacature_id", "functie_id"],
    "title": ["job_title", "title", "functie", "functie_titel"],
    "short_description": ["short_description", "samenvatting", "kort_omschrijving"],
    "description": ["job_description", "description", "omschrijving", "vacaturetekst"],
    "responsibilities": ["responsibilities", "taken", "verantwoordelijkheden"],
    "required_skills": ["required_skills", "skills_required", "vereiste_vaardigheden", "vereisten"],
    "nice_to_have_skills": ["nice_to_have_skills", "nice_to_have", "pre", "pre_vaardigheden"],
    "role_tags": ["role_tags", "tags", "categorie", "categorieen", "domain"],
    "location": ["location", "locatie", "standplaats"],
    "job_type": ["job_type", "contract_type", "employment_type"],
    "salary": ["salary_range", "salary", "salaris", "salaris_range"],
    "difficulty": ["difficulty", "estimated_difficulty"],
    "popularity": ["popularity_score", "popularity", "num_applicants"],
}

def pick_col(df: pd.DataFrame, aliases: List[str]) -> Optional[str]:
    """Return the first column of ``df`` matching one of ``aliases``.

    Aliases are tried in the order given; both the alias and every column
    name are normalized with ``norm_col`` before comparison. For a given
    alias the leftmost matching column wins. Returns ``None`` when no alias
    matches any column.
    """
    columns = list(df.columns)
    for candidate in aliases:
        target = norm_col(candidate)
        matches = [c for c in columns if norm_col(c) == target]
        if matches:
            return matches[0]
    return None

# Resolve every logical field to the actual column name in df; fields with
# no matching column map to None.
COLS: Dict[str, Optional[str]] = dict(
    (field, pick_col(df, names)) for field, names in ALIASES.items()
)
COLS


{'job_id': 'job_id',
 'title': 'job_title',
 'short_description': 'short_description',
 'description': 'job_description',
 'responsibilities': None,
 'required_skills': 'required_skills',
 'nice_to_have_skills': None,
 'role_tags': 'role_tags',
 'location': 'location',
 'job_type': None,
 'salary': 'salary_range',
 'difficulty': 'difficulty',
 'popularity': 'popularity_score'}

## 3. Inhoudsprofiel van vacatures (`job_text`)

We combineren verschillende tekstvelden (beschrijving, verantwoordelijkheden, skills, tags)
tot één lange tekst per vacature. Dit gebruiken we later voor TF–IDF.

In [4]:
# Collect every available text column (in this fixed order) as a string
# Series with missing values replaced by the empty string.
text_parts = [
    df[COLS[key]].fillna("").astype(str)
    for key in ["short_description", "description", "responsibilities", "required_skills", "nice_to_have_skills", "role_tags"]
    if COLS.get(key) and COLS[key] in df.columns
]

if len(text_parts) == 0:
    raise ValueError("Geen geschikte tekstkolommen gevonden om 'job_text' te maken. Controleer je data.")

# Join the parts row-wise, separated by a single space.
job_text = text_parts[0]
for extra in text_parts[1:]:
    job_text = job_text + " " + extra

# Collapse repeated whitespace left behind by empty parts and trim the ends.
df["job_text"] = job_text.str.replace(r"\s+", " ", regex=True).str.strip()
df[[COLS.get("title") or df.columns[0], "job_text"]].head()


Unnamed: 0,job_title,job_text
0,Data Analyst,"Analyse business data and create dashboards. You will work with Python, SQL, BI tools and create insights. Python, SQL, data visualization, analytics data analytics"
1,Junior Software Engineer,"Develop software features and fix bugs. You will work with Java, APIs, backend services and testing. Java, Git, debugging, APIs software engineering"
2,Marketing Specialist,"Support campaigns and content creation. You will analyze market trends, create marketing content and ads. SEO, social media, content writing marketing"
3,AI Research Intern,"Assist in machine learning experiments. You will build ML models, preprocess data, and evaluate experiments. Machine learning, Python, data preprocessing artificial intelligence"
4,UX Designer,"Design user-friendly digital interfaces. You will create wireframes, prototypes, and improve usability. Figma, UX research, prototyping ux design"


## 4. TF–IDF representatie van vacatureteksten

We zetten `job_text` om naar numerieke vectoren met **TF–IDF** en gebruiken later
**cosine similarity** om overeenkomsten te bepalen tussen kandidaat en vacatures.

In [5]:
from nltk.corpus import stopwords

# The NLTK stop-word corpus is not bundled with the package: on a fresh
# environment the first access raises LookupError until the corpus has been
# downloaded. Fetch it once so the cell survives Restart & Run All.
try:
    dutch_stop = stopwords.words("dutch")
except LookupError:
    import nltk

    nltk.download("stopwords", quiet=True)
    dutch_stop = stopwords.words("dutch")

# NOTE(review): the demo postings shown in this notebook are written in
# English while Dutch stop words are removed here - confirm the language of
# the real dataset and adjust the stop-word list if needed.
tfidf = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 2),  # unigrams and bigrams
    stop_words=dutch_stop
)

# One TF-IDF row per posting, in the same row order as df.
X = tfidf.fit_transform(df["job_text"].fillna(""))
X.shape


(5, 160)

## 5. Kandidatenprofiel definiëren

We houden het profiel bewust eenvoudig: een vrij tekstveld met interesses / ervaring,
en optioneel wat voorkeuren (locatie, minimumsalaris, maximale moeilijkheid, domein-tags).

In [6]:
@dataclass
class CandidateProfile:
    """Candidate input for the recommender: free text plus optional constraints.

    Only ``interests_text`` is required. Every other field defaults to
    ``None``, in which case constraint_score() skips that constraint.
    """
    # Free-text interests / experience; vectorized by profile_vector()
    interests_text: str
    # Compared case-insensitively as a substring of the posting's location
    preferred_location: Optional[str] = None
    # Compared against the first number found in the posting's salary field
    min_salary: Optional[float] = None
    # Upper bound for the posting's difficulty value
    max_difficulty: Optional[float] = None
    role_include: Optional[List[str]] = None  # e.g. ['data', 'analytics']

def profile_vector(profile: CandidateProfile):
    """Vectorize the profile's free text with the fitted ``tfidf`` vectorizer.

    Returns whatever ``tfidf.transform`` yields for a single-document input
    built from ``profile.interests_text``.
    """
    documents = [profile.interests_text]
    return tfidf.transform(documents)


## 6. Constraint-score

Naast inhoudelijke overeenkomst voegen we een simpele **constraint-score** toe op basis van:

- locatie (match / geen match),
- minimum salaris,
- maximale moeilijkheid,
- domein / rol-tags.

In [7]:
# First number-like token: digits, optionally followed by "."/","-separated
# digit groups (e.g. "3000", "3,5", "1.234,56").
_NUMBER_TOKEN = re.compile(r"[0-9]+(?:[.,][0-9]+)*")
# 1-3 leading digits followed by one or more 3-digit groups that all use the
# same separator, optionally followed by a decimal part.
_GROUPED_NUMBER = re.compile(
    r"^([1-9][0-9]{0,2})((?:\.[0-9]{3})+|(?:,[0-9]{3})+)(?:([.,])([0-9]+))?$"
)
_PLAIN_NUMBER = re.compile(r"[0-9]+\.?[0-9]*")


def _number_text(value):
    """Turn one cell into something ``pd.to_numeric`` parses, or ``None``."""
    if isinstance(value, bool):
        return None  # booleans carry no numeric text
    if isinstance(value, (int, float, np.integer, np.floating)):
        return value  # real numbers keep their sign, exponent and decimals
    found = _NUMBER_TOKEN.search(str(value))
    if found is None:
        return None
    token = found.group(0)
    grouped = _GROUPED_NUMBER.match(token)
    if grouped:
        head, groups, dec_sep, decimals = grouped.groups()
        group_sep = groups[0]
        # Only treat the groups as thousands when the decimal separator (if
        # any) differs from the grouping separator; "1.234.5" stays ambiguous.
        if dec_sep is None or dec_sep != group_sep:
            digits = head + groups.replace(group_sep, "")
            return digits if decimals is None else f"{digits}.{decimals}"
    # Otherwise read a comma as a decimal separator ("3,5" -> 3.5).
    plain = _PLAIN_NUMBER.search(token.replace(",", "."))
    return plain.group(0) if plain else None


def extract_numeric_series(series: pd.Series) -> pd.Series:
    """Extract the first number from every element of ``series``.

    Numeric (non-boolean) input is returned through ``pd.to_numeric``
    unchanged, so signs, exponents and decimals survive. For text, the first
    number-like token is used: "€3000-€3800" yields 3000 (the lower bound of
    the range) and "€500 allowance" yields 500. Thousands separators are
    recognised ("€3,000" and "€3.000" yield 3000, "1.234,56" yields 1234.56);
    any other comma is read as a decimal separator ("3,5" yields 3.5).
    Elements without digits become NaN.

    Note: a *text* value such as "3.000" is read as three thousand, because
    salary strings use that grouping; store true decimals as numbers.
    """
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        return pd.to_numeric(series, errors="coerce")
    return pd.to_numeric(series.map(_number_text), errors="coerce")

def constraint_score(row: pd.Series, profile: CandidateProfile) -> Tuple[float, Dict[str, str]]:
    """Score how well one posting satisfies the candidate's constraints.

    Each constraint that is set on ``profile`` and whose column exists in
    ``row`` contributes one value in [0, 1]: 1.0 for a match, 0.0 for a
    mismatch and 0.5 (neutral) when the posting's value is unknown. The
    role constraint contributes the fraction of wanted terms found.

    Parameters:
        row: one posting, indexed by the column names of the dataset.
        profile: the candidate's preferences.

    Returns:
        ``(score, reasons)`` where ``score`` is the mean of the contributed
        values (0.0 when no constraint applied) and ``reasons`` holds one
        explanatory message for each of the keys ``location``, ``salary``,
        ``difficulty`` and ``role``.
    """
    scores: List[float] = []
    reasons: Dict[str, str] = {}

    # Location: case-insensitive substring match
    loc_col = COLS.get("location")
    if profile.preferred_location and loc_col and loc_col in row.index:
        if pd.isna(row[loc_col]):
            # Unknown values are neutral, just like unknown salary/difficulty;
            # str(NaN) would otherwise be compared as the text "nan".
            scores.append(0.5)
            reasons["location"] = "Locatie van de vacature onbekend; neutrale score."
        else:
            job_loc = str(row[loc_col]).lower()
            pref = profile.preferred_location.lower()
            match = pref in job_loc
            scores.append(1.0 if match else 0.0)
            reasons["location"] = f"Locatie {'matcht' if match else 'matcht niet'}: kandidaat={profile.preferred_location}, vacature={row[loc_col]}"
    elif profile.preferred_location:
        # A preference was given, so the missing piece is the column.
        reasons["location"] = "Geen locatie-kolom in de data."
    else:
        reasons["location"] = "Geen locatie-voorkeur opgegeven."

    # Minimum salary: compared against the first number in the salary field
    sal_col = COLS.get("salary")
    if profile.min_salary is not None and sal_col and sal_col in row.index:
        salary_val = extract_numeric_series(pd.Series([row[sal_col]])).iloc[0]
        if pd.isna(salary_val):
            scores.append(0.5)  # unknown salary -> neutral
            reasons["salary"] = "Salaris onbekend; neutrale score."
        else:
            match = salary_val >= profile.min_salary
            scores.append(1.0 if match else 0.0)
            reasons["salary"] = f"Indicatie salaris {'voldoende' if match else 'lager dan voorkeur'} (≈ {salary_val})."
    else:
        reasons["salary"] = "Geen salaris-constraint of kolom."

    # Difficulty: must not exceed the candidate's maximum
    diff_col = COLS.get("difficulty")
    if profile.max_difficulty is not None and diff_col and diff_col in row.index:
        diff_val = extract_numeric_series(pd.Series([row[diff_col]])).iloc[0]
        if pd.isna(diff_val):
            scores.append(0.5)  # unknown difficulty -> neutral
            reasons["difficulty"] = "Moeilijkheid onbekend; neutrale score."
        else:
            match = diff_val <= profile.max_difficulty
            scores.append(1.0 if match else 0.0)
            reasons["difficulty"] = f"Moeilijkheid {'binnen' if match else 'boven'} voorkeursniveau (≈ {diff_val})."
    else:
        reasons["difficulty"] = "Geen moeilijkheids-constraint of kolom."

    # Domain / role tags: fraction of wanted terms found as substrings
    role_col = COLS.get("role_tags")
    if profile.role_include and role_col and role_col in row.index:
        if pd.isna(row[role_col]):
            scores.append(0.5)  # unknown tags -> neutral
            reasons["role"] = "Rol-tags onbekend; neutrale score."
        else:
            tags = str(row[role_col]).lower()
            wanted = [w.lower() for w in profile.role_include]
            match_count = sum(1 for w in wanted if w in tags)
            sc = min(1.0, match_count / max(1, len(wanted)))
            scores.append(sc)
            reasons["role"] = f"Domeinmatch: {match_count} van {len(wanted)} gewenste termen gevonden."
    elif profile.role_include:
        # Terms were given, so the missing piece is the column.
        reasons["role"] = "Geen rol-tag-kolom in de data."
    else:
        reasons["role"] = "Geen domein- / tag-constraint opgegeven."

    if not scores:
        return 0.0, reasons

    return float(np.mean(scores)), reasons


## 7. Aanbevelingsfunctie

We combineren drie componenten in één eindscore:

- **content_sim**: cosine similarity tussen profieltekst en `job_text`;
- **constraint_score**: gemiddelde van de bovenstaande constraints;
- **popularity**: optioneel, als de dataset een populariteitsmaat bevat.

De gewichten `alpha`, `beta`, `gamma` moeten optellen tot 1.

In [8]:
def recommend(
    profile: CandidateProfile,
    k: int = 10,
    alpha: float = 0.7,   # weight of the content similarity
    beta: float = 0.2,    # weight of the constraint score
    gamma: float = 0.1    # weight of the popularity
) -> pd.DataFrame:
    """Rank the postings in ``df`` for ``profile`` and return the top ``k``.

    The final score of a posting is
    ``alpha * content_sim + beta * constraint_score + gamma * popularity``,
    where popularity is min-max scaled over the dataset (all zeros when the
    dataset has no popularity column).

    Returns:
        A DataFrame sorted by ``final_score`` (descending) with the columns
        ``index`` (the posting's label in ``df``), ``job_title``,
        ``location``, ``final_score``, ``content_sim``, ``constraint_score``,
        ``popularity_score`` and ``constraint_reasons``.

    Raises:
        ValueError: when the three weights do not sum to 1.0.
    """
    if not np.isclose(alpha + beta + gamma, 1.0):
        raise ValueError("alpha + beta + gamma moeten samen 1.0 zijn.")

    similarity = cosine_similarity(profile_vector(profile), X).flatten()

    # Popularity is optional; without the column every posting scores 0.
    popularity_col = COLS.get("popularity")
    if not (popularity_col and popularity_col in df.columns):
        popularity = pd.Series(0.0, index=df.index)
    else:
        raw = extract_numeric_series(df[popularity_col]).fillna(0.0)
        lowest, highest = raw.min(), raw.max()
        # The epsilon avoids division by zero when all values are equal.
        popularity = (raw - lowest) / (highest - lowest + 1e-9)

    title_col = COLS.get("title") or df.columns[0]
    loc_col = COLS.get("location")

    records = []
    for label, posting in df.iterrows():
        constraint_value, why = constraint_score(posting, profile)
        # The df label doubles as the row position in X.
        content_value = float(similarity[label])
        popularity_value = float(popularity.loc[label])

        records.append(
            dict(
                index=label,
                job_title=posting.get(title_col, ""),
                location=posting.get(loc_col, ""),
                final_score=alpha * content_value + beta * constraint_value + gamma * popularity_value,
                content_sim=content_value,
                constraint_score=constraint_value,
                popularity_score=popularity_value,
                constraint_reasons=why,
            )
        )

    ranked = pd.DataFrame(records).sort_values("final_score", ascending=False)
    return ranked.head(k).reset_index(drop=True)


## 8. Diversiteit van aanbevelingen & uitleg

We voegen twee eenvoudige extra functies toe:

- `diversity_score(indices)`: 1 – gemiddelde onderlinge cosine similarity.
- `explain_overlap(...)`: overlappende trefwoorden tussen profiel en vacaturetekst.

In [9]:
def diversity_score(indices: List[int]) -> float:
    """Return 1 minus the mean pairwise cosine similarity of the given rows of ``X``.

    Only distinct pairs (the strict upper triangle of the similarity matrix)
    are averaged. Fewer than two indices yield 0.0.
    """
    if len(indices) < 2:
        return 0.0
    pairwise = cosine_similarity(X[indices])
    # Row/column positions of the strict upper triangle, in row-major order
    upper = np.triu_indices(pairwise.shape[0], k=1)
    values = pairwise[upper]
    if values.size == 0:
        return 0.0
    return float(1.0 - values.mean())

def explain_overlap(profile_text: str, job_text: str, top_k: int = 8) -> List[str]:
    """List the words the profile and the job text have in common.

    Both texts are lowercased and split into ``\\w+`` tokens. Shared tokens
    are ranked by how often they occur in ``job_text`` (descending); ties
    keep the order of first appearance in ``job_text``.

    Parameters:
        profile_text: the candidate's free text.
        job_text: the posting's combined text.
        top_k: maximum number of terms to return; values below 1 yield [].

    Returns:
        Up to ``top_k`` shared terms, most frequent first.
    """
    def tokenize(s: str) -> List[str]:
        return re.findall(r"\w+", s.lower())

    # Build the lookup set once instead of once per job token.
    profile_vocab = set(tokenize(profile_text))

    counts: Dict[str, int] = {}
    for token in tokenize(job_text):
        if token in profile_vocab:
            counts[token] = counts.get(token, 0) + 1

    # sorted() is stable, so equal counts stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Clamp so a negative top_k cannot slice from the end of the list.
    return [term for term, _ in ranked[:max(top_k, 0)]]


## 9. Demo: aanbevelingen voor een voorbeeldprofiel

In [10]:
# Example candidate: data/ML interests, based in Amsterdam, with a salary
# floor, a difficulty ceiling and two wanted domain terms.
demo_profile = CandidateProfile(
    "data analytics machine learning python",
    preferred_location="amsterdam",
    min_salary=3000,
    max_difficulty=3.0,
    role_include=["data", "analytics"],
)

# Weights are spelled out (content, constraints, popularity); they sum to 1.
demo_recs = recommend(profile=demo_profile, k=5, alpha=0.7, beta=0.2, gamma=0.1)
demo_recs


Unnamed: 0,index,job_title,location,final_score,content_sim,constraint_score,popularity_score,constraint_reasons
0,3,AI Research Intern,Amsterdam,0.492642,0.489489,0.25,1.0,"{'location': 'Locatie matcht: kandidaat=amsterdam, vacature=Amsterdam', 'salary': 'Indicatie salaris lager dan voorkeur (≈ 500).', 'difficulty': 'Moeilijkheid boven voorkeursniveau (≈ 4).', 'role'..."
1,0,Data Analyst,Amsterdam,0.489015,0.31764,1.0,0.666667,"{'location': 'Locatie matcht: kandidaat=amsterdam, vacature=Amsterdam', 'salary': 'Indicatie salaris voldoende (≈ 3000).', 'difficulty': 'Moeilijkheid binnen voorkeursniveau (≈ 2).', 'role': 'Dome..."
2,4,UX Designer,Eindhoven,0.15,0.0,0.5,0.5,"{'location': 'Locatie matcht niet: kandidaat=amsterdam, vacature=Eindhoven', 'salary': 'Indicatie salaris voldoende (≈ 3200).', 'difficulty': 'Moeilijkheid binnen voorkeursniveau (≈ 3).', 'role': ..."
3,1,Junior Software Engineer,Rotterdam,0.083333,0.0,0.25,0.333333,"{'location': 'Locatie matcht niet: kandidaat=amsterdam, vacature=Rotterdam', 'salary': 'Indicatie salaris lager dan voorkeur (≈ 2800).', 'difficulty': 'Moeilijkheid binnen voorkeursniveau (≈ 3).',..."
4,2,Marketing Specialist,Utrecht,0.05,0.0,0.25,0.0,"{'location': 'Locatie matcht niet: kandidaat=amsterdam, vacature=Utrecht', 'salary': 'Indicatie salaris lager dan voorkeur (≈ 2600).', 'difficulty': 'Moeilijkheid binnen voorkeursniveau (≈ 2).', '..."


In [11]:
# Diversity score of the recommendations
indices = demo_recs["index"].tolist()
print("Diversiteitsscore (0 = zeer gelijkend, 1 = zeer divers):", diversity_score(indices))

# Show the overlapping terms for the first 3 recommendations
title_col = "job_title"
for _, row in demo_recs.head(3).iterrows():
    idx = row["index"]
    # Use a dedicated name: assigning to `job_text` here would replace the
    # module-level Series of combined posting texts with a single string.
    posting_text = df.loc[idx, "job_text"]
    overlap = explain_overlap(demo_profile.interests_text, posting_text, top_k=8)
    print(f"\nVacature: {row[title_col]}")
    print("Overlappende trefwoorden:", ", ".join(overlap) if overlap else "(geen expliciete overlap gevonden)")


Diversiteitsscore (0 = zeer gelijkend, 1 = zeer divers): 0.9500817653445062

Vacature: AI Research Intern
Overlappende trefwoorden: machine, learning, data, python

Vacature: Data Analyst
Overlappende trefwoorden: data, python, analytics

Vacature: UX Designer
Overlappende trefwoorden: (geen expliciete overlap gevonden)
