## Approach 6 (Baseline) **and then** Approach 6 + 5 (Hybrid)
Order:
1) **Approach 6 only**: TF–IDF (char n-grams) + Logistic Regression aus CSVs → Evaluation on `annotated.json` (Original)
2) **Approach 6 + 5**: RuleLabeler + Agreement Pseudo-Labels auf `not_annotated.json` → RF (Optuna) + Hybrid → Evaluation on `annotated.json` (Original + Fair)


In [17]:
import json
import re
from dataclasses import dataclass
from collections import Counter, defaultdict
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import GroupShuffleSplit

import optuna


### 1) Config / Paths

In [None]:
@dataclass
class Paths:
    seniority_csv: str = "seniority-v2.csv"
    department_csv: str = "department-v2.csv"
    not_annotated_json: str = "linkedin-cvs-not-annotated.json"
    annotated_json: str = "linkedin-cvs-annotated.json"

@dataclass
class PseudoConfig:
    # Agreement pseudo-labels without calibration:
    base_hi: float = 0.85          # baseline very confident -> accept
    base_agree_min: float = 0.60   # if baseline == rule -> accept from here

    # Person Filter:
    min_keep_ratio: float = 0.80
    min_conf_for_person: float = 0.60

    only_active: bool = True

@dataclass
class HybridConfig:
    base_hi: float = 0.85
    rf_hi: float = 0.70
    dept_fallback: bool = True

@dataclass
class TrainConfig:
    random_state: int = 42

paths = Paths()
pseudo_cfg = PseudoConfig()
hybrid_cfg = HybridConfig()
train_cfg = TrainConfig()

paths


Paths(seniority_csv='seniority-v2.csv', department_csv='department-v2.csv', not_annotated_json='linkedin-cvs-not-annotated.json', annotated_json='linkedin-cvs-annotated.json')

### 2) Helpers (Load + Normalize)

In [19]:
def load_json(path: str) -> List[List[Dict[str, Any]]]:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def load_label_csv(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    if "text" not in df.columns or "label" not in df.columns:
        raise ValueError(f"{path}: Erwartet Spalten 'text' und 'label'")
    return df

def normalize(text: Any) -> str:
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""
    text = str(text).lower()
    text = (text.replace("ä","ae").replace("ö","oe").replace("ü","ue").replace("ß","ss"))
    text = re.sub(r"(innen|in)\b", "", text)
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


## Part A — Approach 6 (Baseline)

### 3) Baseline Training (TF-IDF char + LR)

In [20]:
def train_baseline_text_model(df: pd.DataFrame, use_char_ngrams: bool = True) -> Pipeline:
    if use_char_ngrams:
        vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), lowercase=True)
    else:
        vec = TfidfVectorizer(ngram_range=(1,2), lowercase=True)  # word n-grams

    clf = LogisticRegression(max_iter=3000, class_weight="balanced")
    pipe = Pipeline([("tfidf", vec), ("clf", clf)])

    X = df["text"].astype(str).map(normalize)
    y = df["label"].astype(str)
    pipe.fit(X, y)
    return pipe

def baseline_predict_label_conf(model: Pipeline, text: str) -> Tuple[str, float]:
    x = normalize(text)
    proba = model.predict_proba([x])[0]
    i = int(np.argmax(proba))
    return str(model.classes_[i]), float(proba[i])

def baseline_predict_label(model: Pipeline, text: str) -> str:
    return baseline_predict_label_conf(model, text)[0]


In [21]:
df_sen = load_label_csv(paths.seniority_csv)
df_dept = load_label_csv(paths.department_csv)

sen_baseline = train_baseline_text_model(df_sen)
dept_baseline = train_baseline_text_model(df_dept)

print("Sen classes:", sen_baseline.classes_)
print("Dept classes:", dept_baseline.classes_)


Sen classes: ['Director' 'Junior' 'Lead' 'Management' 'Senior']
Dept classes: ['Administrative' 'Business Development' 'Consulting' 'Customer Support'
 'Human Resources' 'Information Technology' 'Marketing' 'Other'
 'Project Management' 'Purchasing' 'Sales']


### 4) Baseline Evaluation auf annotated (Original + Fair)

In [None]:
def collect_predictions_on_annotated_with_model(annotated, sen_model: Pipeline, dept_model: Pipeline):
    true_sen, pred_sen_list = [], []
    true_dept, pred_dept_list = [], []

    for person_jobs in annotated:
        for idx, job in enumerate(person_jobs):
            if job.get("status") != "ACTIVE":
                continue
            if job.get("seniority") is None or job.get("department") is None:
                continue

            pos = str(job.get("position", ""))

            true_sen.append(str(job["seniority"]))
            true_dept.append(str(job["department"]))

            pred_sen_list.append(baseline_predict_label(sen_model, pos))
            pred_dept_list.append(baseline_predict_label(dept_model, pos))

    return true_sen, pred_sen_list, true_dept, pred_dept_list

def evaluate_original(true_sen, pred_sen_list, true_dept, pred_dept_list, title="EVALUATION"):
    print("\n" + "="*70)
    print(title)
    print("="*70)

    print("\nSENIORITY (original)")
    print("Accuracy:", accuracy_score(true_sen, pred_sen_list))
    print(classification_report(true_sen, pred_sen_list))

    print("\nDEPARTMENT (original)")
    print("Accuracy:", accuracy_score(true_dept, pred_dept_list))
    print(classification_report(true_dept, pred_dept_list))

def fair_evaluation(true_sen, pred_sen_list, true_dept, pred_dept_list, title="FAIRE EVALUATION"):
    # Seniority fair: 
    true_sen_adjusted, pred_sen_adjusted = [], []
    for t, p in zip(true_sen, pred_sen_list):
        if t == "Professional" and p in ["Junior", "Senior"]:
            true_sen_adjusted.append("Professional")
            pred_sen_adjusted.append("Professional")
        else:
            true_sen_adjusted.append(t)
            pred_sen_adjusted.append(p)

    acc_sen_fair = accuracy_score(true_sen_adjusted, pred_sen_adjusted)

    # Department fair: Other rausfiltern
    true_dept_filtered, pred_dept_filtered = [], []
    for t, p in zip(true_dept, pred_dept_list):
        if t != "Other":
            true_dept_filtered.append(t)
            pred_dept_filtered.append(p)

    acc_dept_fair = accuracy_score(true_dept_filtered, pred_dept_filtered)

    print("\n" + "="*70)
    print(title)
    print("="*70)

    print("\nSENIORITY (fair): Professional korrekt, wenn Junior oder Senior vorhergesagt")
    print(f"Accuracy: {acc_sen_fair:.4f}")
    print(classification_report(true_sen_adjusted, pred_sen_adjusted))

    print("\nDEPARTMENT (fair): 'Other' ausgeschlossen")
    print(f"Evaluiert auf {len(true_dept_filtered)}/{len(true_dept)} Jobs")
    print(f"Accuracy: {acc_dept_fair:.4f}")
    print(classification_report(true_dept_filtered, pred_dept_filtered))

    comparison = pd.DataFrame({
        "Metric": ["Seniority Accuracy", "Department Accuracy"],
        "Original": [accuracy_score(true_sen, pred_sen_list), accuracy_score(true_dept, pred_dept_list)],
        "Fair": [acc_sen_fair, acc_dept_fair],
    })

    print("\n" + "="*70)
    print("VERGLEICH: Original vs Fair")
    print("="*70)
    print(comparison.to_string(index=False))

    return acc_sen_fair, acc_dept_fair

annotated = load_json(paths.annotated_json)

true_sen_6, pred_sen_6, true_dept_6, pred_dept_6 = collect_predictions_on_annotated_with_model(
    annotated, sen_baseline, dept_baseline
)

evaluate_original(true_sen_6, pred_sen_6, true_dept_6, pred_dept_6, title="ANSATZ 6 (Baseline) — ORIGINAL")
fair_evaluation(true_sen_6, pred_sen_6, true_dept_6, pred_dept_6, title="ANSATZ 6 (Baseline) — FAIR")



ANSATZ 6 (Baseline) — ORIGINAL

SENIORITY (original)
Accuracy: 0.478330658105939
              precision    recall  f1-score   support

    Director       0.56      0.97      0.71        34
      Junior       0.20      0.33      0.25        12
        Lead       0.49      0.73      0.59       125
  Management       0.71      0.70      0.70       192
Professional       0.00      0.00      0.00       216
      Senior       0.21      0.80      0.33        44

    accuracy                           0.48       623
   macro avg       0.36      0.59      0.43       623
weighted avg       0.37      0.48      0.40       623


DEPARTMENT (original)
Accuracy: 0.2696629213483146
                        precision    recall  f1-score   support

        Administrative       0.07      0.21      0.11        14
  Business Development       0.38      0.30      0.33        20
            Consulting       0.87      0.51      0.65        39
      Customer Support       0.33      0.17      0.22         6
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


(0.6324237560192616, 0.5913978494623656)

## Part B — Approach 6 + 5 (Hybrid)

### 5) RuleLabeler (Approach 5-Component)

In [23]:
STOPWORDS = set(["and","of","for","und","der","die","das","in","mit","to","de","la","le","des","et","en","as"])

SENIORITY_ABBR = {"jr":"Junior","sr":"Senior","lead":"Lead","chief":"Lead","dir":"Director","vp":"Director","mgr":"Management"}
DEPARTMENT_ABBR = {"it":"Information Technology","hr":"Human Resources","bd":"Business Development","ops":"Operations"}

C_LEVEL_ABBR = {
    "CEO":"Chief Executive Officer","CFO":"Chief Financial Officer","COO":"Chief Operating Officer",
    "CTO":"Chief Technology Officer","CMO":"Chief Marketing Officer","CIO":"Chief Information Officer",
    "CHRO":"Chief Human Resources Officer","EVP":"Executive Vice President","SVP":"Senior Vice President",
    "VP":"Vice President","AVP":"Assistant / Associate Vice President",
}

def build_keyword_dict(df: pd.DataFrame) -> Dict[str, Dict[str, int]]:
    label_words: Dict[str, List[str]] = defaultdict(list)
    for _, row in df.iterrows():
        lab = str(row["label"])
        txt = normalize(row["text"])
        for w in txt.split():
            if len(w) >= 3 and w not in STOPWORDS:
                label_words[lab].append(w)
    return {lab: dict(Counter(words).most_common(30)) for lab, words in label_words.items()}

def vote(labels: List[Optional[str]]) -> Optional[str]:
    labels = [l for l in labels if l is not None]
    return Counter(labels).most_common(1)[0][0] if labels else None

def length_based_seniority(pos_norm: str) -> Optional[str]:
    n = len(pos_norm.split())
    if n <= 3: return "Junior"
    if n >= 6: return "Senior"
    return None

class RuleLabeler:
    def __init__(self, df_sen: pd.DataFrame, df_dept: pd.DataFrame):
        ds = df_sen.copy()
        dd = df_dept.copy()
        ds["text_clean"] = ds["text"].map(normalize)
        dd["text_clean"] = dd["text"].map(normalize)
        self.sen_lookup = dict(zip(ds["text_clean"], ds["label"]))
        self.dept_lookup = dict(zip(dd["text_clean"], dd["label"]))
        self.sen_keywords = build_keyword_dict(df_sen)
        self.dept_keywords = build_keyword_dict(df_dept)

    def label_position(self, position: str) -> Tuple[Optional[str], Optional[str]]:
        pos = normalize(position)
        sen_votes: List[Optional[str]] = [self.sen_lookup.get(pos)]
        dept_votes: List[Optional[str]] = [self.dept_lookup.get(pos)]

        for lab, kws in self.sen_keywords.items():
            for kw in kws.keys():
                if kw in pos: sen_votes.append(lab)
        for lab, kws in self.dept_keywords.items():
            for kw in kws.keys():
                if kw in pos: dept_votes.append(lab)

        for abbr, lab in SENIORITY_ABBR.items():
            if abbr in pos: sen_votes.append(lab)
        for abbr, lab in DEPARTMENT_ABBR.items():
            if abbr in pos: dept_votes.append(lab)

        for abbr, long in C_LEVEL_ABBR.items():
            if abbr.lower() in pos or long.lower() in pos:
                sen_votes.append("Management")

        len_vote = length_based_seniority(pos)
        if all(v is None for v in sen_votes):
            sen_votes.append(len_vote)

        final_sen = vote(sen_votes)
        final_dept = vote(dept_votes) or "Other"
        return final_sen, final_dept

rule_labeler = RuleLabeler(df_sen, df_dept)


### 6) Agreement Pseudo-Labeling on not_annotated

In [24]:
def pseudo_label_job_agreement(
    job: Dict[str, Any],
    sen_model: Pipeline,
    dept_model: Pipeline,
    rule_labeler: RuleLabeler,
    cfg: PseudoConfig
) -> Dict[str, Any]:
    out = dict(job)
    pos = str(job.get("position","")).strip()

    sen_pred, sen_conf = baseline_predict_label_conf(sen_model, pos)
    dept_pred, dept_conf = baseline_predict_label_conf(dept_model, pos)

    r_sen, r_dept = rule_labeler.label_position(pos)

    sen_label = None
    dept_label = None

    if sen_conf >= cfg.base_hi:
        sen_label = sen_pred
        sen_src = "base_hi"
    elif (r_sen is not None) and (sen_pred == r_sen) and (sen_conf >= cfg.base_agree_min):
        sen_label = sen_pred
        sen_src = "agree_rule"
    else:
        sen_src = None

    if dept_conf >= cfg.base_hi:
        dept_label = dept_pred
        dept_src = "base_hi"
    elif (r_dept is not None) and (dept_pred == r_dept) and (dept_conf >= cfg.base_agree_min):
        dept_label = dept_pred
        dept_src = "agree_rule"
    else:
        dept_src = None

    out["seniority"] = sen_label
    out["department"] = dept_label
    out["confidence_sen"] = float(sen_conf)
    out["confidence_dept"] = float(dept_conf)
    out["pseudo_source_sen"] = sen_src
    out["pseudo_source_dept"] = dept_src
    return out

def pseudo_label_dataset_agreement(data, sen_model, dept_model, rule_labeler, cfg: PseudoConfig):
    labeled = []
    for person_jobs in data:
        pj = []
        for job in person_jobs:
            if cfg.only_active and job.get("status") != "ACTIVE":
                pj.append(dict(job))
            else:
                pj.append(pseudo_label_job_agreement(job, sen_model, dept_model, rule_labeler, cfg))
        labeled.append(pj)
    return labeled

not_annotated = load_json(paths.not_annotated_json)

pseudo_all = pseudo_label_dataset_agreement(
    data=not_annotated,
    sen_model=sen_baseline,
    dept_model=dept_baseline,
    rule_labeler=rule_labeler,
    cfg=pseudo_cfg
)

(len(pseudo_all), len(pseudo_all[0]) if pseudo_all else None)


(390, 8)

### 7) Person Filtering

In [25]:
def filter_persons_by_confidence(labeled_data, min_confidence, min_keep_ratio, only_active=True):
    filtered = []
    for person_jobs in labeled_data:
        considered = [j for j in person_jobs if (not only_active) or j.get("status") == "ACTIVE"]
        total = len(considered)
        if total == 0:
            continue

        ok = 0
        for j in considered:
            cs = float(j.get("confidence_sen", 0))
            cd = float(j.get("confidence_dept", 0))
            if cs >= min_confidence and cd >= min_confidence and j.get("seniority") and j.get("department"):
                ok += 1

        if (ok / total) >= min_keep_ratio:
            filtered.append(person_jobs)
    return filtered

pseudo_filtered = filter_persons_by_confidence(
    pseudo_all,
    min_confidence=pseudo_cfg.min_conf_for_person,
    min_keep_ratio=pseudo_cfg.min_keep_ratio,
    only_active=pseudo_cfg.only_active
)

print(f"Kept persons: {len(pseudo_filtered)}/{len(pseudo_all)} ({len(pseudo_filtered)/max(1,len(pseudo_all))*100:.1f}%)")


Kept persons: 41/390 (10.5%)


### 8) Features from Job-Historie

In [None]:
def calculate_months_between(start_date: Any, end_date: Any) -> float:
    try:
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        return float((end - start).days / 30)
    except Exception:
        return 0.0

def extract_job_history_features(person_jobs: List[Dict[str, Any]], target_job_idx: int = 0) -> Dict[str, Any]:
    target_job = person_jobs[target_job_idx]

    feats: Dict[str, Any] = {
        "total_jobs": len(person_jobs),
        "job_number": target_job_idx + 1,

        "previous_seniority_junior": 0,
        "previous_seniority_senior": 0,
        "previous_seniority_lead": 0,
        "previous_seniority_management": 0,
        "previous_seniority_director": 0,

        "previous_dept_administrative": 0,
        "previous_dept_business_dev": 0,
        "previous_dept_consulting": 0,
        "previous_dept_customer_support": 0,
        "previous_dept_hr": 0,
        "previous_dept_it": 0,
        "previous_dept_marketing": 0,
        "previous_dept_other": 0,
        "previous_dept_project_mgmt": 0,
        "previous_dept_purchasing": 0,
        "previous_dept_sales": 0,

        "same_department_as_previous": 0,

        "months_in_current_job": calculate_months_between(target_job.get("startDate"), target_job.get("endDate") or datetime.now()),
        "avg_job_duration": 0.0,

        "seniority_increases": 0,
        "department_changes": 0,
    }

    durs = [calculate_months_between(j.get("startDate"), j.get("endDate") or datetime.now()) for j in person_jobs]
    feats["avg_job_duration"] = float(np.mean(durs)) if durs else 0.0

    # previous job = idx+1 (correct when list newest->oldest)
    if len(person_jobs) > target_job_idx + 1:
        prev = person_jobs[target_job_idx + 1]
        ps = prev.get("seniority")
        pdp = prev.get("department")

        if ps == "Junior": feats["previous_seniority_junior"] = 1
        elif ps == "Senior": feats["previous_seniority_senior"] = 1
        elif ps == "Lead": feats["previous_seniority_lead"] = 1
        elif ps == "Management": feats["previous_seniority_management"] = 1
        elif ps == "Director": feats["previous_seniority_director"] = 1

        dept_map = {
            "Administrative": "previous_dept_administrative",
            "Business Development": "previous_dept_business_dev",
            "Consulting": "previous_dept_consulting",
            "Customer Support": "previous_dept_customer_support",
            "Human Resources": "previous_dept_hr",
            "Information Technology": "previous_dept_it",
            "Marketing": "previous_dept_marketing",
            "Other": "previous_dept_other",
            "Project Management": "previous_dept_project_mgmt",
            "Purchasing": "previous_dept_purchasing",
            "Sales": "previous_dept_sales",
        }
        if pdp in dept_map:
            feats[dept_map[pdp]] = 1

        if pdp and target_job.get("department") and pdp == target_job.get("department"):
            feats["same_department_as_previous"] = 1

    order = {"Junior": 1, "Senior": 2, "Lead": 3, "Management": 4, "Director": 5}
    prev_s, prev_d = None, None
    for j in person_jobs:
        s = j.get("seniority")
        d = j.get("department")
        if prev_s and s and order.get(s, 0) > order.get(prev_s, 0):
            feats["seniority_increases"] += 1
        if prev_d and d and d != prev_d:
            feats["department_changes"] += 1
        prev_s, prev_d = s, d

    return feats

FEATURE_COLS = [
    "total_jobs","job_number",
    "previous_seniority_junior","previous_seniority_senior","previous_seniority_lead",
    "previous_seniority_management","previous_seniority_director",
    "previous_dept_administrative","previous_dept_business_dev","previous_dept_consulting",
    "previous_dept_customer_support","previous_dept_hr","previous_dept_it",
    "previous_dept_marketing","previous_dept_other","previous_dept_project_mgmt",
    "previous_dept_purchasing","previous_dept_sales",
    "same_department_as_previous",
    "months_in_current_job","avg_job_duration",
    "seniority_increases","department_changes",
]


### 9) RF Trainingsdaten bauen

In [27]:
def build_rf_training_frames(pseudo_persons, only_active=True):
    rows_s, rows_d = [], []
    for p_idx, person_jobs in enumerate(pseudo_persons):
        for j_idx, job in enumerate(person_jobs):
            if only_active and job.get("status") != "ACTIVE":
                continue
            if job.get("seniority") and job.get("department"):
                feats = extract_job_history_features(person_jobs, j_idx)
                rs = dict(feats); rs["label"] = str(job["seniority"]); rs["person_id"] = p_idx
                rd = dict(feats); rd["label"] = str(job["department"]); rd["person_id"] = p_idx
                rows_s.append(rs); rows_d.append(rd)
    return pd.DataFrame(rows_s), pd.DataFrame(rows_d)

df_rf_sen, df_rf_dept = build_rf_training_frames(pseudo_filtered, only_active=pseudo_cfg.only_active)
print("RF sen rows:", len(df_rf_sen))
print("RF dept rows:", len(df_rf_dept))
df_rf_sen.head()


RF sen rows: 42
RF dept rows: 42


Unnamed: 0,total_jobs,job_number,previous_seniority_junior,previous_seniority_senior,previous_seniority_lead,previous_seniority_management,previous_seniority_director,previous_dept_administrative,previous_dept_business_dev,previous_dept_consulting,...,previous_dept_project_mgmt,previous_dept_purchasing,previous_dept_sales,same_department_as_previous,months_in_current_job,avg_job_duration,seniority_increases,department_changes,label,person_id
0,8,1,0,0,0,0,0,0,0,0,...,0,0,0,0,67.666667,44.991667,0,0,Senior,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,409.566667,409.566667,0,0,Junior,1
2,6,1,0,0,0,0,0,0,0,0,...,0,0,0,0,14.866667,34.605556,0,0,Director,2
3,9,1,0,0,0,0,0,0,0,0,...,0,0,0,0,46.366667,28.807407,0,0,Senior,3
4,5,1,1,0,0,0,0,1,0,0,...,0,0,0,1,45.366667,85.713333,0,0,Junior,4


### 10) Optuna only for RF (on pseudo data, GroupSplit on person_id)

In [28]:
def make_rf_objective_from_df(df_rf: pd.DataFrame, seed=42):
    if len(df_rf) == 0:
        raise RuntimeError("df_rf leer. Check thresholds / filtering.")

    X = df_rf[FEATURE_COLS].fillna(0)
    y = df_rf["label"].astype(str).values
    groups = df_rf["person_id"].values

    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    idxs = np.arange(len(df_rf))
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    tr_idx, va_idx = next(gss.split(idxs, groups=groups))

    X_train, X_val = X.iloc[tr_idx], X.iloc[va_idx]
    y_train, y_val = y_enc[tr_idx], y_enc[va_idx]

    def objective(trial: optuna.Trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 200, 1200, step=100),
            "max_depth": trial.suggest_int("max_depth", 4, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 15),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "random_state": seed,
            "n_jobs": -1,
        }

        rf = RandomForestClassifier(**params)
        rf.fit(X_train, y_train)
        pred = rf.predict(X_val)
        return f1_score(y_val, pred, average="macro")

    return objective

# Seniority tuning
study_sen = optuna.create_study(direction="maximize")
study_sen.optimize(make_rf_objective_from_df(df_rf_sen, seed=train_cfg.random_state), n_trials=50)
print("Best SEN params:", study_sen.best_params)
print("Best SEN macro-F1 (pseudo-val):", study_sen.best_value)

# Department tuning
study_dept = optuna.create_study(direction="maximize")
study_dept.optimize(make_rf_objective_from_df(df_rf_dept, seed=train_cfg.random_state), n_trials=50)
print("Best DEPT params:", study_dept.best_params)
print("Best DEPT macro-F1 (pseudo-val):", study_dept.best_value)


[I 2026-01-21 17:05:22,822] A new study created in memory with name: no-name-eeebbc6e-911a-42a9-a8b3-78c4965dcaa9
[I 2026-01-21 17:05:23,147] Trial 0 finished with value: 0.14285714285714285 and parameters: {'n_estimators': 300, 'max_depth': 26, 'min_samples_split': 17, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.14285714285714285.
[I 2026-01-21 17:05:23,479] Trial 1 finished with value: 0.14285714285714285 and parameters: {'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 12, 'max_features': None, 'bootstrap': False}. Best is trial 0 with value: 0.14285714285714285.
[I 2026-01-21 17:05:24,247] Trial 2 finished with value: 0.14285714285714285 and parameters: {'n_estimators': 500, 'max_depth': 25, 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 0.14285714285714285.
[I 2026-01-21 17:05:25,381] Trial 3 finished with value: 0.142857142

Best SEN params: {'n_estimators': 500, 'max_depth': 26, 'min_samples_split': 12, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': True}
Best SEN macro-F1 (pseudo-val): 0.17424242424242425


[I 2026-01-21 17:06:10,711] Trial 0 finished with value: 0.09722222222222221 and parameters: {'n_estimators': 1100, 'max_depth': 18, 'min_samples_split': 18, 'min_samples_leaf': 3, 'max_features': None, 'bootstrap': False}. Best is trial 0 with value: 0.09722222222222221.
[I 2026-01-21 17:06:11,944] Trial 1 finished with value: 0.09523809523809523 and parameters: {'n_estimators': 800, 'max_depth': 18, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.09722222222222221.
[I 2026-01-21 17:06:12,396] Trial 2 finished with value: 0.09722222222222221 and parameters: {'n_estimators': 400, 'max_depth': 19, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.09722222222222221.
[I 2026-01-21 17:06:13,645] Trial 3 finished with value: 0.09722222222222221 and parameters: {'n_estimators': 1200, 'max_depth': 19, 'min_samples_split': 12, 'min_samples_leaf': 12,

Best DEPT params: {'n_estimators': 1100, 'max_depth': 11, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': True}
Best DEPT macro-F1 (pseudo-val): 0.16666666666666666


### 11) Final RF training (with best params)

In [29]:
def train_rf_with_params(df_rf: pd.DataFrame, best_params: Dict[str, Any], seed=42):
    X = df_rf[FEATURE_COLS].fillna(0)
    y = df_rf["label"].astype(str)

    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    params = dict(best_params)
    params.update({"random_state": seed, "n_jobs": -1})

    rf = RandomForestClassifier(**params)
    rf.fit(X, y_enc)
    return rf, le

rf_sen, le_sen_rf = train_rf_with_params(df_rf_sen, study_sen.best_params, seed=train_cfg.random_state)
rf_dept, le_dept_rf = train_rf_with_params(df_rf_dept, study_dept.best_params, seed=train_cfg.random_state)

print("RFs trained on full pseudo data.")


RFs trained on full pseudo data.


### 12) Hybrid Predictor (Baseline vs RF)

In [30]:
class HybridPredictorBaseRF:
    def __init__(self, baseline_model: Pipeline, rf_model: RandomForestClassifier, le_rf: LabelEncoder,
                 is_department: bool, cfg: HybridConfig):
        self.baseline_model = baseline_model
        self.rf_model = rf_model
        self.le_rf = le_rf
        self.is_department = is_department
        self.cfg = cfg

    def predict(self, person_jobs: List[Dict[str, Any]], target_job_idx: int = 0) -> str:
        job = person_jobs[target_job_idx]
        text = str(job.get("position","")).strip()

        base_label, base_conf = baseline_predict_label_conf(self.baseline_model, text)

        feats = extract_job_history_features(person_jobs, target_job_idx)
        X = pd.DataFrame([feats])[FEATURE_COLS].fillna(0)
        rf_idx = int(self.rf_model.predict(X)[0])
        rf_probs = self.rf_model.predict_proba(X)[0]
        rf_conf = float(np.max(rf_probs))
        rf_label = str(self.le_rf.inverse_transform([rf_idx])[0])

        if base_conf >= self.cfg.base_hi:
            return base_label
        if rf_conf >= self.cfg.rf_hi:
            return rf_label

        if self.is_department and self.cfg.dept_fallback and base_conf < 0.6 and rf_conf < 0.6:
            if len(person_jobs) > target_job_idx + 1:
                prev = person_jobs[target_job_idx + 1]
                prev_dept = prev.get("department")
                if prev_dept:
                    return str(prev_dept)

        return base_label if base_conf >= rf_conf else rf_label

hybrid_sen = HybridPredictorBaseRF(sen_baseline, rf_sen, le_sen_rf, is_department=False, cfg=hybrid_cfg)
hybrid_dept = HybridPredictorBaseRF(dept_baseline, rf_dept, le_dept_rf, is_department=True, cfg=hybrid_cfg)


### 13) Hybrid Evaluation on annotated (Original + Fair)

In [31]:
pred_sen_h, pred_dept_h = [], []
true_sen_h, true_dept_h = [], []

for person_jobs in annotated:
    for idx, job in enumerate(person_jobs):
        if job.get("status") != "ACTIVE":
            continue
        if job.get("seniority") is None or job.get("department") is None:
            continue

        true_sen_h.append(str(job["seniority"]))
        true_dept_h.append(str(job["department"]))

        pred_sen_h.append(hybrid_sen.predict(person_jobs, idx))
        pred_dept_h.append(hybrid_dept.predict(person_jobs, idx))

evaluate_original(true_sen_h, pred_sen_h, true_dept_h, pred_dept_h, title="ANSATZ 6 + 5 (Hybrid) — ORIGINAL")
fair_evaluation(true_sen_h, pred_sen_h, true_dept_h, pred_dept_h, title="ANSATZ 6 + 5 (Hybrid) — FAIR")



ANSATZ 6 + 5 (Hybrid) — ORIGINAL

SENIORITY (original)
Accuracy: 0.42857142857142855
              precision    recall  f1-score   support

    Director       0.50      0.97      0.66        34
      Junior       0.24      0.33      0.28        12
        Lead       0.88      0.55      0.68       125
  Management       0.94      0.62      0.75       192
Professional       0.00      0.00      0.00       216
      Senior       0.12      0.95      0.22        44

    accuracy                           0.43       623
   macro avg       0.45      0.57      0.43       623
weighted avg       0.51      0.43      0.42       623


DEPARTMENT (original)
Accuracy: 0.5698234349919743
                        precision    recall  f1-score   support

        Administrative       0.36      0.36      0.36        14
  Business Development       0.21      0.35      0.26        20
            Consulting       0.40      0.46      0.43        39
      Customer Support       0.43      0.50      0.46         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


(0.7479935794542536, 0.5985663082437276)

# Save all models for streamlit

In [None]:
import joblib

MODEL_DIR = "saved_models"
import os
os.makedirs(MODEL_DIR, exist_ok=True)

# ---------------------------
# Save Baseline Models
# ---------------------------
joblib.dump(sen_baseline, f"{MODEL_DIR}/sen_baseline_tfidf_lr.joblib")
joblib.dump(dept_baseline, f"{MODEL_DIR}/dept_baseline_tfidf_lr.joblib")

# ---------------------------
# Save RF Models + Encoders
# ---------------------------
joblib.dump(rf_sen, f"{MODEL_DIR}/rf_seniority.joblib")
joblib.dump(rf_dept, f"{MODEL_DIR}/rf_department.joblib")

joblib.dump(le_sen_rf, f"{MODEL_DIR}/le_seniority.joblib")
joblib.dump(le_dept_rf, f"{MODEL_DIR}/le_department.joblib")

# ---------------------------
# Save Configs 
# ---------------------------
joblib.dump(hybrid_cfg, f"{MODEL_DIR}/hybrid_cfg.joblib")
joblib.dump(pseudo_cfg, f"{MODEL_DIR}/pseudo_cfg.joblib")

print("✅ Models saved to:", MODEL_DIR)


✅ Models saved to: saved_models
