# LINKEDIN JOB CLASSIFICATION - HYBRID MODEL
## Seniority & Department Prediction mit Ensemble Pseudo-Labeling & Hybrid ML Model

## 1 IMPORTS

In [2]:
import pandas as pd
import numpy as np
import json
import re
import torch
from collections import Counter, defaultdict
from datetime import datetime
from dataclasses import dataclass
from typing import Dict, Any, List, Optional, Tuple

# Sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split

# Transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Optuna
import optuna



## 2 LOAD DATA & TRAIN-TEST SPLIT
Load the CSV Files and performe a 80:20 Train/Test Split

In [3]:
# Load CSV Training Data
df_seniority = pd.read_csv('seniority-v2.csv')
df_department = pd.read_csv('department-v2.csv')

print(f"   Seniority: {len(df_seniority)} Eintr√§ge")
print(f"   Department: {len(df_department)} Eintr√§ge")

# Train-Test Split 
train_sen, test_sen = train_test_split(
    df_seniority,
    test_size=0.2,
    stratify=df_seniority['label'],
    #random_state=config.random_state
    random_state = 42
)

train_dept, test_dept = train_test_split(
    df_department,
    test_size=0.2,
    stratify=df_department['label'],
    random_state=42
)

print(f"\n Train-Test Split:")
print(f"   Seniority - Train: {len(train_sen)}, Test: {len(test_sen)}")
print(f"   Department - Train: {len(train_dept)}, Test: {len(test_dept)}")

   Seniority: 9428 Eintr√§ge
   Department: 10145 Eintr√§ge

 Train-Test Split:
   Seniority - Train: 7542, Test: 1886
   Department - Train: 8116, Test: 2029


## 3 TEXT NORMALIZATION & TF-IDF TRAINING

In [4]:
print("TF-IDF TRAINING\n")

# Text Normalization Function
def normalize(text):
    """Text-Normalization: Umlauts, Gendering, special characters"""
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""
    text = str(text).lower()
    text = text.replace("√§","ae").replace("√∂","oe").replace("√º","ue").replace("√ü","ss")
    text = re.sub(r"(innen|in)\b", "", text)  # Gendering
    text = re.sub(r"[^a-z0-9 ]", " ", text)  # Nur Buchstaben/Zahlen
    text = re.sub(r"\s+", " ", text).strip()
    return text

# TF-IDF Training Function
def train_tfidf_model(df, use_char_ngrams=True):
    """trains TF-IDF + Logistic Regression Model"""
    if use_char_ngrams:
        vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), lowercase=True)
    else:
        vec = TfidfVectorizer(ngram_range=(1,2), lowercase=True)
    
    clf = LogisticRegression(max_iter=3000, class_weight="balanced")
    pipe = Pipeline([("tfidf", vec), ("clf", clf)])
    
    X = df["text"].astype(str).map(normalize)
    y = df["label"].astype(str)
    pipe.fit(X, y)
    return pipe


sen_TFIDF = train_tfidf_model(train_sen, use_char_ngrams=True)
print(f"   Classes: {list(sen_TFIDF.classes_)}")

dept_TFIDF = train_tfidf_model(train_dept, use_char_ngrams=True)
print(f"   Classes: {list(dept_TFIDF.classes_)}")

# Helper Function for Predictions with Confidence
def predict_with_confidence(model, text):
    """TF-IDF Prediction mit Confidence Score"""
    text_norm = normalize(text)
    proba = model.predict_proba([text_norm])[0]
    pred_idx = int(np.argmax(proba))
    pred_label = str(model.classes_[pred_idx])
    confidence = float(proba[pred_idx])
    return pred_label, confidence


TF-IDF TRAINING

   Classes: ['Director', 'Junior', 'Lead', 'Management', 'Senior']
   Classes: ['Administrative', 'Business Development', 'Consulting', 'Customer Support', 'Human Resources', 'Information Technology', 'Marketing', 'Other', 'Project Management', 'Purchasing', 'Sales']


## 4 RULE-BASED SYSTEM SETUP
Implements the a Rule-based labeling system. That derives seniority and department from job titles. This involves combining exact text matches, statistically learned keywords from training data, abbreviations and simple heuristics, and merging them into a label by majority decision.

In [5]:
print("RULE-BASED SYSTEM SETUP\n")

# Configuration
STOPWORDS = set(["and","of","for","und","der","die","das","in","mit","to","de","la","le","des","et","en","as"])

SENIORITY_ABBR = {
    "jr": "Junior", "sr": "Senior", "lead": "Lead", "chief": "Lead",
    "dir": "Director", "vp": "Director", "mgr": "Management"
}

DEPARTMENT_ABBR = {
    "it": "Information Technology", "hr": "Human Resources",
    "bd": "Business Development", "ops": "Operations"
}

C_LEVEL_ABBR = {
    "CEO": "Chief Executive Officer", "CFO": "Chief Financial Officer",
    "COO": "Chief Operating Officer", "CTO": "Chief Technology Officer",
    "CMO": "Chief Marketing Officer", "CIO": "Chief Information Officer",
    "CHRO": "Chief Human Resources Officer", "EVP": "Executive Vice President",
    "SVP": "Senior Vice President", "VP": "Vice President",
    "AVP": "Assistant Vice President",
}

# Helper Functions
def build_keyword_dict(df):
    """Extrahiert Top-Keywords pro Label aus Training Data"""
    label_words = defaultdict(list)
    for _, row in df.iterrows():
        lab = str(row["label"])
        txt = normalize(row["text"])
        for w in txt.split():
            if len(w) >= 3 and w not in STOPWORDS:
                label_words[lab].append(w)
    return {lab: dict(Counter(words).most_common(30)) for lab, words in label_words.items()}

def vote(labels):
    """Majority Voting"""
    labels = [l for l in labels if l is not None]
    return Counter(labels).most_common(1)[0][0] if labels else None

def length_based_seniority(pos_norm):
    """L√§ngen-basierte Heuristik f√ºr Seniority"""
    n = len(pos_norm.split())
    if n <= 3: return "Junior"
    if n >= 6: return "Senior"
    return None

# Build Keyword Dictionaries
print("\nExtrahiere Keywords aus Training Data...")
sen_keywords = build_keyword_dict(train_sen)
dept_keywords = build_keyword_dict(train_dept)
print(f"Keywords extrahiert:")
print(f"   Seniority: {len(sen_keywords)} Labels")
print(f"   Department: {len(dept_keywords)} Labels")

# Build Lookup Dictionaries
print("\nErstelle Lookup-Dictionaries...")
train_sen_norm = train_sen.copy()
train_sen_norm["text_clean"] = train_sen_norm["text"].map(normalize)
sen_lookup = dict(zip(train_sen_norm["text_clean"], train_sen_norm["label"]))

train_dept_norm = train_dept.copy()
train_dept_norm["text_clean"] = train_dept_norm["text"].map(normalize)
dept_lookup = dict(zip(train_dept_norm["text_clean"], train_dept_norm["label"]))

print(f"Lookups erstellt:")
print(f"   Seniority: {len(sen_lookup)} Eintr√§ge")
print(f"   Department: {len(dept_lookup)} Eintr√§ge")

# Rule Labeler Class
class RuleLabeler:
    """Rule-based Labeling System"""
    
    def __init__(self, sen_lookup, dept_lookup, sen_keywords, dept_keywords):
        self.sen_lookup = sen_lookup
        self.dept_lookup = dept_lookup
        self.sen_keywords = sen_keywords
        self.dept_keywords = dept_keywords
    
    def label_position(self, position):
        """Rule-based Labeling f√ºr einen Job Title"""
        pos = normalize(position)
        sen_votes = []
        dept_votes = []
        
        # 1. Exact Lookup
        sen_votes.append(self.sen_lookup.get(pos))
        dept_votes.append(self.dept_lookup.get(pos))
        
        # 2. Keyword Matching
        for lab, kws in self.sen_keywords.items():
            for kw in kws.keys():
                if kw in pos:
                    sen_votes.append(lab)
        
        for lab, kws in self.dept_keywords.items():
            for kw in kws.keys():
                if kw in pos:
                    dept_votes.append(lab)
        
        # 3. Abk√ºrzungen
        for abbr, lab in SENIORITY_ABBR.items():
            if abbr in pos:
                sen_votes.append(lab)
        
        for abbr, lab in DEPARTMENT_ABBR.items():
            if abbr in pos:
                dept_votes.append(lab)
        
        # 4. C-Level Detection
        for abbr, long in C_LEVEL_ABBR.items():
            if abbr.lower() in pos or long.lower() in pos:
                sen_votes.append("Management")
        
        # 5. Length-based Fallback
        if all(v is None for v in sen_votes):
            len_vote = length_based_seniority(pos)
            sen_votes.append(len_vote)
        
        # 6. Voting
        final_sen = vote(sen_votes)
        final_dept = vote(dept_votes) or "Other"
        
        return final_sen, final_dept

# Create Rule Labeler Instance
rule_labeler = RuleLabeler(sen_lookup, dept_lookup, sen_keywords, dept_keywords)
print("\nRule Labeler erstellt")



RULE-BASED SYSTEM SETUP


Extrahiere Keywords aus Training Data...
Keywords extrahiert:
   Seniority: 5 Labels
   Department: 11 Labels

Erstelle Lookup-Dictionaries...
Lookups erstellt:
   Seniority: 7162 Eintr√§ge
   Department: 7648 Eintr√§ge

Rule Labeler erstellt


## 5 LOAD FINE-TUNING MODELS
Load the already trained and saved Fine-Tuning Models

In [6]:
print("FINE-TUNING MODELS LOADING\n")

model_sen_ft = AutoModelForSequenceClassification.from_pretrained('best_sen_model')
tokenizer_sen = AutoTokenizer.from_pretrained('best_sen_model')

model_dept_ft = AutoModelForSequenceClassification.from_pretrained('best_dept_model')
tokenizer_dept = AutoTokenizer.from_pretrained('best_dept_model')

# Load Label Encoders
import pickle
with open('./label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)
    le_sen = label_encoders['sen']
    le_dept = label_encoders['dept']


print(f"Fine-Tuning Modelle geladen")
print(f"   Seniority Classes: {list(le_sen.classes_)}")
print(f"   Department Classes: {list(le_dept.classes_)}")



FINE-TUNING MODELS LOADING



The tokenizer you are loading from 'best_sen_model' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
The tokenizer you are loading from 'best_dept_model' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


Fine-Tuning Modelle geladen
   Seniority Classes: [np.str_('Junior'), np.str_('Senior'), np.str_('Lead'), np.str_('Management'), np.str_('Director')]
   Department Classes: ['Administrative', 'Business Development', 'Consulting', 'Customer Support', 'Human Resources', 'Information Technology', 'Marketing', 'Other', 'Project Management', 'Purchasing', 'Sales']


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## 6 BASELINE COMPARISON 
This section compares the performance of the three "Baseline" Models, which are used for Pseudolabeling and Hybrid prediction later, on the CSV Test-Data.
Models:
- TF-IDF Model
- Fine-Tuning Models
- Rule based labeling

In [7]:
print("BASELINE COMPARISON - CSV TEST-SET\n")

# Evaluate TF-IDF
X_test_sen = test_sen["text"].astype(str).map(normalize)
y_test_sen = test_sen["label"].astype(str)
pred_sen_TFIDF = sen_TFIDF.predict(X_test_sen)
acc_sen_TFIDF = accuracy_score(y_test_sen, pred_sen_TFIDF)

X_test_dept = test_dept["text"].astype(str).map(normalize)
y_test_dept = test_dept["label"].astype(str)
pred_dept_TFIDF = dept_TFIDF.predict(X_test_dept)
acc_dept_TFIDF = accuracy_score(y_test_dept, pred_dept_TFIDF)

# Evaluate Rules
pred_sen_rules = []
true_sen_rules = []
for _, row in test_sen.iterrows():
    pred_sen, _ = rule_labeler.label_position(row["text"])
    if pred_sen is not None:
        pred_sen_rules.append(pred_sen)
        true_sen_rules.append(row["label"])
acc_sen_rules = accuracy_score(true_sen_rules, pred_sen_rules)

pred_dept_rules = []
true_dept_rules = []
for _, row in test_dept.iterrows():
    _, pred_dept = rule_labeler.label_position(row["text"])
    pred_dept_rules.append(pred_dept)
    true_dept_rules.append(row["label"])
acc_dept_rules = accuracy_score(true_dept_rules, pred_dept_rules)

# Evaluate Fine-Tuning
predictions_sen_ft = []
true_sen_ft = []
for idx, row in test_sen.iterrows():
    text = str(row['text']).strip()
    inputs = tokenizer_sen(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    #inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model_sen_ft(**inputs)
    pred_idx = outputs.logits.argmax().item()
    pred_label = le_sen.inverse_transform([pred_idx])[0]
    predictions_sen_ft.append(pred_label)
    true_sen_ft.append(row['label'])
acc_sen_ft = accuracy_score(true_sen_ft, predictions_sen_ft)

predictions_dept_ft = []
true_dept_ft = []
for idx, row in test_dept.iterrows():
    text = str(row['text']).strip()
    inputs = tokenizer_dept(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    #inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model_dept_ft(**inputs)
    pred_idx = outputs.logits.argmax().item()
    pred_label = le_dept.inverse_transform([pred_idx])[0]
    predictions_dept_ft.append(pred_label)
    true_dept_ft.append(row['label'])
acc_dept_ft = accuracy_score(true_dept_ft, predictions_dept_ft)

# Summary Table
baseline_comparison = pd.DataFrame({
    'Model': ['TF-IDF + LogReg', 'Rule-Based', 'Fine-Tuning'],
    'Seniority': [f'{acc_sen_TFIDF:.4f}', f'{acc_sen_rules:.4f}', f'{acc_sen_ft:.4f}'],
    'Department': [f'{acc_dept_TFIDF:.4f}', f'{acc_dept_rules:.4f}', f'{acc_dept_ft:.4f}']
})

print("\n" + baseline_comparison.to_string(index=False))
#print("\nüí° Diese Modelle werden im 3-Way Ensemble kombiniert f√ºr Pseudo-Labeling")


BASELINE COMPARISON - CSV TEST-SET


          Model Seniority Department
TF-IDF + LogReg    0.9735     0.9487
     Rule-Based    0.5911     0.6299
    Fine-Tuning    0.9995     0.9975


## 7 ENSEMBLE PSEUDO-LABELING (3-WAY)

This section creates pseudo labels for the not-annotated JASON data with additional CVs. This file will be used later to extract features.

The Pseudo-Labels are created with the following rules:

1. Predictions of Fine-tuning and TF-IDF agree -> accept Label (confidence = max(FT-Conf, TF-IDF-Conf))
2. Predictions of Fine-tuning and Rule-Label agree and FT-Conf >= 75% -> accept Label (confidence = FT-Conf)
3. Predictions of TF-IDF and Rule-Label agree and TF-IDF-Conf >= 75% -> accept Label (confidence = TF-IDF-Conf)
4. FT-Conf >= 90% -> accept Label (conficence = FT-Conf)
5. TF-IDF-Conf >= 90% -> accept Label (conficence = FT-Conf)

=> only person where >= 80% of the Jobs have Labels with Conf >= 60% were kept for the Pseudo-Label File


In [8]:
print("ENSEMBLE PSEUDO-LABELING - FT + TF-IDF + RULES\n")

# Configuration
@dataclass
class EnsemblePseudoConfig:
    ft_hi: float = 0.90           # Fine-Tuning sehr sicher
    tfidf_hi: float = 0.90        # TF-IDF sehr sicher
    ml_agree_min: float = 0.60    # Beide ML √ºbereinstimmen
    rule_agree_min: float = 0.75  # ML + Rules √ºbereinstimmen
    min_keep_ratio: float = 0.80  # Person Filter
    min_conf_for_person: float = 0.60
    only_active: bool = True

ensemble_cfg = EnsemblePseudoConfig()

print("\nKonfiguration:")
print(f"  FT High Confidence:       {ensemble_cfg.ft_hi}")
print(f"  TF-IDF High Confidence:   {ensemble_cfg.tfidf_hi}")
print(f"  ML Agreement Threshold:   {ensemble_cfg.ml_agree_min}")
print(f"  Rule Agreement Threshold: {ensemble_cfg.rule_agree_min}")

# Ensemble Pseudo-Labeling Function
def ensemble_pseudo_label_job(
    job: Dict[str, Any],
    ft_model_sen, ft_model_dept,
    ft_tokenizer_sen, ft_tokenizer_dept,
    ft_le_sen, ft_le_dept,
    tfidf_model_sen, tfidf_model_dept,
    rule_labeler,
    cfg: EnsemblePseudoConfig
) -> Dict[str, Any]:
    """3-Way Ensemble: Fine-Tuning + TF-IDF + Rules"""
    
    out = dict(job)
    pos = str(job.get("position", "")).strip()
    
    # SENIORITY
    # 1. Fine-Tuning
    inputs_sen = ft_tokenizer_sen(pos, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs_sen = ft_model_sen(**inputs_sen)
    ft_probs_sen = torch.nn.functional.softmax(outputs_sen.logits, dim=-1)
    ft_conf_sen = ft_probs_sen.max().item()
    ft_pred_idx_sen = outputs_sen.logits.argmax().item()
    ft_pred_sen = ft_le_sen.inverse_transform([ft_pred_idx_sen])[0]
    
    # 2. TF-IDF
    tfidf_pred_sen, tfidf_conf_sen = predict_with_confidence(tfidf_model_sen, pos)
    
    # 3. Rules
    rule_pred_sen, _ = rule_labeler.label_position(pos)
    
    # 4. Ensemble Logic
    if ft_pred_sen == tfidf_pred_sen:
        sen_label = ft_pred_sen
        sen_source = "ml_agree"
        sen_conf = max(ft_conf_sen, tfidf_conf_sen)
    elif rule_pred_sen and ft_pred_sen == rule_pred_sen and ft_conf_sen >= cfg.rule_agree_min:
        sen_label = ft_pred_sen
        sen_source = "ft_rule_agree"
        sen_conf = ft_conf_sen
    elif rule_pred_sen and tfidf_pred_sen == rule_pred_sen and tfidf_conf_sen >= cfg.rule_agree_min:
        sen_label = tfidf_pred_sen
        sen_source = "tfidf_rule_agree"
        sen_conf = tfidf_conf_sen
    elif ft_conf_sen >= cfg.ft_hi:
        sen_label = ft_pred_sen
        sen_source = "ft_hi"
        sen_conf = ft_conf_sen
    elif tfidf_conf_sen >= cfg.tfidf_hi:
        sen_label = tfidf_pred_sen
        sen_source = "tfidf_hi"
        sen_conf = tfidf_conf_sen
    else:
        sen_label = None
        sen_source = None
        sen_conf = max(ft_conf_sen, tfidf_conf_sen)
    
    # DEPARTMENT (gleiche Logik)
    inputs_dept = ft_tokenizer_dept(pos, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs_dept = ft_model_dept(**inputs_dept)
    ft_probs_dept = torch.nn.functional.softmax(outputs_dept.logits, dim=-1)
    ft_conf_dept = ft_probs_dept.max().item()
    ft_pred_idx_dept = outputs_dept.logits.argmax().item()
    ft_pred_dept = ft_le_dept.inverse_transform([ft_pred_idx_dept])[0]
    
    tfidf_pred_dept, tfidf_conf_dept = predict_with_confidence(tfidf_model_dept, pos)
    _, rule_pred_dept = rule_labeler.label_position(pos)
    
    if ft_pred_dept == tfidf_pred_dept:
        dept_label = ft_pred_dept
        dept_source = "ml_agree"
        dept_conf = max(ft_conf_dept, tfidf_conf_dept)
    elif rule_pred_dept and ft_pred_dept == rule_pred_dept and ft_conf_dept >= cfg.rule_agree_min:
        dept_label = ft_pred_dept
        dept_source = "ft_rule_agree"
        dept_conf = ft_conf_dept
    elif rule_pred_dept and tfidf_pred_dept == rule_pred_dept and tfidf_conf_dept >= cfg.rule_agree_min:
        dept_label = tfidf_pred_dept
        dept_source = "tfidf_rule_agree"
        dept_conf = tfidf_conf_dept
    elif ft_conf_dept >= cfg.ft_hi:
        dept_label = ft_pred_dept
        dept_source = "ft_hi"
        dept_conf = ft_conf_dept
    elif tfidf_conf_dept >= cfg.tfidf_hi:
        dept_label = tfidf_pred_dept
        dept_source = "tfidf_hi"
        dept_conf = tfidf_conf_dept
    else:
        dept_label = None
        dept_source = None
        dept_conf = max(ft_conf_dept, tfidf_conf_dept)
    
    # Output
    out['seniority'] = sen_label
    out['department'] = dept_label
    out['confidence_sen'] = float(sen_conf)
    out['confidence_dept'] = float(dept_conf)
    out['pseudo_source_sen'] = sen_source
    out['pseudo_source_dept'] = dept_source
    
    return out

# Apply to Dataset
def ensemble_pseudo_label_dataset(data, cfg):
    """Apply ensemble pseudo-labeling to entire dataset"""
    labeled = []
    for person_idx, person_jobs in enumerate(data):
        if person_idx % 100 == 0:
            print(f"Progress: {person_idx}/{len(data)}")
        
        labeled_jobs = []
        for job in person_jobs:
            if cfg.only_active and job.get("status") != "ACTIVE":
                labeled_jobs.append(dict(job))
            else:
                labeled_job = ensemble_pseudo_label_job(
                    job,
                    model_sen_ft, model_dept_ft,
                    tokenizer_sen, tokenizer_dept,
                    le_sen, le_dept,
                    sen_TFIDF, dept_TFIDF,
                    rule_labeler,
                    cfg
                )
                labeled_jobs.append(labeled_job)
        labeled.append(labeled_jobs)
    return labeled

# Load Not-Annotated Data
print("\nNot-Annotated Data")
with open('linkedin-cvs-not-annotated.json', 'r', encoding='utf-8') as f:
    not_annotated = json.load(f)
print(f" {len(not_annotated)} persons geladen")


pseudo_all = ensemble_pseudo_label_dataset(not_annotated, ensemble_cfg)




ENSEMBLE PSEUDO-LABELING - FT + TF-IDF + RULES


Konfiguration:
  FT High Confidence:       0.9
  TF-IDF High Confidence:   0.9
  ML Agreement Threshold:   0.6
  Rule Agreement Threshold: 0.75

Not-Annotated Data
 390 persons geladen
Progress: 0/390
Progress: 100/390
Progress: 200/390
Progress: 300/390


## 8 STATISTICS & PERSON-LEVEL FILTERING

Provides an overview of the resulting pseudo-label file.

In [9]:
print("STATISTICS & PERSON-LEVEL FILTERING\n")

# Statistics Before Filtering
all_jobs = []
for person in pseudo_all:
    all_jobs.extend(person)
df_all = pd.DataFrame(all_jobs)

print("\nSTATISTICS - BEFORE FILTERING")
print(f"  Total jobs: {len(df_all)}")
print(f"  Active jobs: {(df_all['status'] == 'ACTIVE').sum()}")

print("\n  Seniority Label Sources:")
print(df_all['pseudo_source_sen'].value_counts())

print("\n  Department Label Sources:")
print(df_all['pseudo_source_dept'].value_counts())

sen_coverage = (df_all['seniority'].notna()).sum() / len(df_all) * 100
dept_coverage = (df_all['department'].notna()).sum() / len(df_all) * 100
print(f"\n  Label Coverage:")
print(f"    Seniority:  {sen_coverage:.1f}%")
print(f"    Department: {dept_coverage:.1f}%")

# Person-Level Filtering
def filter_persons_by_confidence(labeled_data, min_confidence, min_keep_ratio, only_active=True):
    """Filter persons where too many jobs have uncertain labels"""
    filtered = []
    for person_jobs in labeled_data:
        considered = [j for j in person_jobs if (not only_active) or j.get("status") == "ACTIVE"]
        total = len(considered)
        if total == 0:
            continue
        
        confident = 0
        for job in considered:
            cs = float(job.get('confidence_sen', 0))
            cd = float(job.get('confidence_dept', 0))
            if (cs >= min_confidence and cd >= min_confidence 
                and job.get('seniority') and job.get('department')):
                confident += 1
        
        keep_ratio = confident / total if total > 0 else 0
        if keep_ratio >= min_keep_ratio:
            filtered.append(person_jobs)
    return filtered

print("\nPERSON-LEVEL FILTERING")
print(f"  Minimum Confidence: {ensemble_cfg.min_conf_for_person}")
print(f"  Minimum Keep Ratio: {ensemble_cfg.min_keep_ratio}")

pseudo_filtered = filter_persons_by_confidence(
    pseudo_all,
    min_confidence=ensemble_cfg.min_conf_for_person,
    min_keep_ratio=ensemble_cfg.min_keep_ratio,
    only_active=ensemble_cfg.only_active
)

print(f"\n  Persons kept: {len(pseudo_filtered)}/{len(pseudo_all)} ({len(pseudo_filtered)/len(pseudo_all)*100:.1f}%)")

# Statistics After Filtering
filtered_jobs = []
for person in pseudo_filtered:
    filtered_jobs.extend(person)
df_filtered = pd.DataFrame(filtered_jobs)

print("\nSTATISTICS - AFTER FILTERING")
print(f"  Total jobs: {len(df_filtered)}")
print(f"  Active jobs: {(df_filtered['status'] == 'ACTIVE').sum()}")

print("\n  Seniority Labels:")
print(df_filtered['seniority'].value_counts())

print("\n  Department Labels:")
print(df_filtered['department'].value_counts())



STATISTICS & PERSON-LEVEL FILTERING


STATISTICS - BEFORE FILTERING
  Total jobs: 1886
  Active jobs: 419

  Seniority Label Sources:
pseudo_source_sen
ml_agree            306
ft_rule_agree        45
ft_hi                37
tfidf_rule_agree      4
Name: count, dtype: int64

  Department Label Sources:
pseudo_source_dept
ml_agree            238
ft_hi                39
ft_rule_agree        21
tfidf_rule_agree      1
Name: count, dtype: int64

  Label Coverage:
    Seniority:  20.8%
    Department: 15.9%

PERSON-LEVEL FILTERING
  Minimum Confidence: 0.6
  Minimum Keep Ratio: 0.8

  Persons kept: 183/390 (46.9%)

STATISTICS - AFTER FILTERING
  Total jobs: 1001
  Active jobs: 213

  Seniority Labels:
seniority
Senior        82
Management    63
Lead          31
Junior        22
Director      15
Name: count, dtype: int64

  Department Labels:
department
Information Technology    88
Administrative            27
Consulting                23
Project Management        18
Sales                    

## 9 SAVE PSEUDO-LABELS

In [10]:
print("SAVE PSEUDO-LABELS\n")

output_filename = 'linkedin_pseudo_labeled_ensemble.json'

with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(pseudo_filtered, f, ensure_ascii=False, indent=2)

print(f"\nPseudo-Labels saved: {output_filename}")
print(f"   {len(pseudo_filtered)} persons")
print(f"   {len(df_filtered)} total jobs")

SAVE PSEUDO-LABELS


Pseudo-Labels saved: linkedin_pseudo_labeled_ensemble.json
   183 persons
   1001 total jobs


## 10 FEATURE ENGINEERING & RANDOM FOREST TRAINING
- Person-based grouped split of the Pseudo-Labeled File in Test and Train Data
- Feature extraction and random forest training

In [11]:
print("FEATURE ENGINEERING & RANDOM FOREST\n")

# Feature Extraction Functions
def calculate_months_between(start_date, end_date):
    """Berechnet Monate zwischen zwei Daten"""
    try:
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date) if end_date else pd.to_datetime(datetime.now())
        return float((end - start).days / 30)
    except:
        return 0.0

def extract_job_history_features(person_jobs, target_job_idx=0):
    """Extrahiert 24 Features aus Job-Historie"""
    target_job = person_jobs[target_job_idx]
    
    features = {
        'total_jobs': len(person_jobs),
        'job_number': target_job_idx + 1,
        'previous_seniority_junior': 0,
        'previous_seniority_professional': 0,
        'previous_seniority_senior': 0,
        'previous_seniority_lead': 0,
        'previous_seniority_management': 0,
        'previous_seniority_director': 0,
        'previous_dept_administrative': 0,
        'previous_dept_business_dev': 0,
        'previous_dept_consulting': 0,
        'previous_dept_customer_support': 0,
        'previous_dept_hr': 0,
        'previous_dept_it': 0,
        'previous_dept_marketing': 0,
        'previous_dept_other': 0,
        'previous_dept_project_mgmt': 0,
        'previous_dept_purchasing': 0,
        'previous_dept_sales': 0,
        'same_department_as_previous': 0,
        'months_in_current_job': 0.0,
        'avg_job_duration': 0.0,
        'seniority_increases': 0,
        'department_changes': 0,
    }
    
    # Previous Job
    if len(person_jobs) > target_job_idx + 1:
        prev_job = person_jobs[target_job_idx + 1]
        prev_sen = prev_job.get('seniority')
        
        if prev_sen == 'Junior': features['previous_seniority_junior'] = 1
        elif prev_sen == 'Professional': features['previous_seniority_professional'] = 1
        elif prev_sen == 'Senior': features['previous_seniority_senior'] = 1
        elif prev_sen == 'Lead': features['previous_seniority_lead'] = 1
        elif prev_sen == 'Management': features['previous_seniority_management'] = 1
        elif prev_sen == 'Director': features['previous_seniority_director'] = 1
        
        prev_dept = prev_job.get('department')
        dept_map = {
            'Administrative': 'previous_dept_administrative',
            'Business Development': 'previous_dept_business_dev',
            'Consulting': 'previous_dept_consulting',
            'Customer Support': 'previous_dept_customer_support',
            'Human Resources': 'previous_dept_hr',
            'Information Technology': 'previous_dept_it',
            'Marketing': 'previous_dept_marketing',
            'Other': 'previous_dept_other',
            'Project Management': 'previous_dept_project_mgmt',
            'Purchasing': 'previous_dept_purchasing',
            'Sales': 'previous_dept_sales',
        }
        if prev_dept in dept_map:
            features[dept_map[prev_dept]] = 1
        
        if prev_dept and target_job.get('department') and prev_dept == target_job.get('department'):
            features['same_department_as_previous'] = 1
    
    # Time Features
    features['months_in_current_job'] = calculate_months_between(
        target_job.get('startDate'), target_job.get('endDate')
    )
    
    durations = []
    for job in person_jobs:
        dur = calculate_months_between(job.get('startDate'), job.get('endDate'))
        if dur > 0:
            durations.append(dur)
    if durations:
        features['avg_job_duration'] = float(np.mean(durations))
    
    # Progression Features
    seniority_order = {'Junior': 1, 'Professional': 2, 'Senior': 3, 'Lead': 4, 'Management': 5, 'Director': 6}
    for i in range(len(person_jobs) - 1, 0, -1):
        older_sen = person_jobs[i].get('seniority')
        newer_sen = person_jobs[i-1].get('seniority')
        if older_sen and newer_sen:
            if seniority_order.get(newer_sen, 0) > seniority_order.get(older_sen, 0):
                features['seniority_increases'] += 1
    
    for i in range(len(person_jobs) - 1):
        if person_jobs[i].get('department') != person_jobs[i+1].get('department'):
            features['department_changes'] += 1
    
    return features

# Feature Set Definition
FEATURE_COLS = [
    'total_jobs', 'job_number',
    'previous_seniority_junior', 'previous_seniority_professional',
    'previous_seniority_senior', 'previous_seniority_lead',
    'previous_seniority_management', 'previous_seniority_director',
    'previous_dept_administrative', 'previous_dept_business_dev',
    'previous_dept_consulting', 'previous_dept_customer_support',
    'previous_dept_hr', 'previous_dept_it', 'previous_dept_marketing',
    'previous_dept_other', 'previous_dept_project_mgmt',
    'previous_dept_purchasing', 'previous_dept_sales',
    'same_department_as_previous',
    'months_in_current_job', 'avg_job_duration',
    'seniority_increases', 'department_changes'
]

print(f"\nFeature Set: {len(FEATURE_COLS)} Features")

# Build Training DataFrames mit Person-ID
def build_feature_dataframes_with_person_id(persons_list):
    """Extrahiert Features f√ºr alle aktiven Jobs mit Person-ID"""
    rows_sen = []
    rows_dept = []
    
    for person_id, person_jobs in enumerate(persons_list):
        for idx, job in enumerate(person_jobs):
            if job.get('status') == 'ACTIVE' and job.get('seniority') and job.get('department'):
                features = extract_job_history_features(person_jobs, idx)
                
                row_sen = features.copy()
                row_sen['label'] = job['seniority']
                row_sen['person_id'] = person_id
                rows_sen.append(row_sen)
                
                row_dept = features.copy()
                row_dept['label'] = job['department']
                row_dept['person_id'] = person_id
                rows_dept.append(row_dept)
    
    return pd.DataFrame(rows_sen), pd.DataFrame(rows_dept)

#extract Features
df_rf_sen, df_rf_dept = build_feature_dataframes_with_person_id(pseudo_filtered)

print(f"   Seniority: {len(df_rf_sen)} Jobs von {df_rf_sen['person_id'].nunique()} Personen")
print(f"   Department: {len(df_rf_dept)} Jobs von {df_rf_dept['person_id'].nunique()} Personen")

# Person-based Split for Seniority
print("SENIORITY MODEL\n")

# Person IDs
unique_persons_sen = df_rf_sen['person_id'].unique()
np.random.seed(42)
np.random.shuffle(unique_persons_sen)

# 80/20 Split
split_idx = int(len(unique_persons_sen) * 0.8)
train_persons_sen = unique_persons_sen[:split_idx]
test_persons_sen = unique_persons_sen[split_idx:]

# Train/Test Sets based on Person-IDs
df_sen_train = df_rf_sen[df_rf_sen['person_id'].isin(train_persons_sen)]
df_sen_test = df_rf_sen[df_rf_sen['person_id'].isin(test_persons_sen)]

X_sen_train = df_sen_train[FEATURE_COLS].fillna(0)
y_sen_train = df_sen_train['label']
X_sen_test = df_sen_test[FEATURE_COLS].fillna(0)
y_sen_test = df_sen_test['label']

print(f"\nPerson-based Train-Test-Split:")
print(f"   Train: {len(X_sen_train)} Jobs von {len(train_persons_sen)} Personen")
print(f"   Test:  {len(X_sen_test)} Jobs von {len(test_persons_sen)} Personen")
print(f"\n   Train Labels: {sorted(y_sen_train.unique())}")
print(f"   Test Labels:  {sorted(y_sen_test.unique())}")

# fit LabelEncoder 
le_sen_rf = LabelEncoder()
le_sen_rf.fit(df_rf_sen['label']) 
y_sen_train_enc = le_sen_rf.transform(y_sen_train)
y_sen_test_enc = le_sen_rf.transform(y_sen_test)

#train seniority random Forest
rf_sen = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_sen.fit(X_sen_train, y_sen_train_enc)


# Predictions and Accuracy
y_sen_pred = rf_sen.predict(X_sen_test)
sen_accuracy = accuracy_score(y_sen_test_enc, y_sen_pred)

print(f"\n{'SENIORITY TEST ACCURACY:':<30} {sen_accuracy:.4f} ({sen_accuracy*100:.2f}%)")

# Classification Report 
print("\nClassification Report (Seniority):")
test_labels = sorted(set(y_sen_test_enc))
test_names = [le_sen_rf.classes_[i] for i in test_labels]
print(classification_report(y_sen_test_enc, y_sen_pred, 
                          labels=test_labels,
                          target_names=test_names, 
                          zero_division=0))

# Person-based Split for Department
print("DEPARTMENT MODEL\n")

# Eindeutige Person IDs
unique_persons_dept = df_rf_dept['person_id'].unique()
np.random.seed(42)
np.random.shuffle(unique_persons_dept)

# 80/20 Split
split_idx = int(len(unique_persons_dept) * 0.8)
train_persons_dept = unique_persons_dept[:split_idx]
test_persons_dept = unique_persons_dept[split_idx:]

# Train/Test Sets based on Person-IDs
df_dept_train = df_rf_dept[df_rf_dept['person_id'].isin(train_persons_dept)]
df_dept_test = df_rf_dept[df_rf_dept['person_id'].isin(test_persons_dept)]

X_dept_train = df_dept_train[FEATURE_COLS].fillna(0)
y_dept_train = df_dept_train['label']
X_dept_test = df_dept_test[FEATURE_COLS].fillna(0)
y_dept_test = df_dept_test['label']

print(f"\nPerson-based Train-Test-Split:")
print(f"   Train: {len(X_dept_train)} Jobs von {len(train_persons_dept)} Personen")
print(f"   Test:  {len(X_dept_test)} Jobs von {len(test_persons_dept)} Personen")
print(f"\n   Train Labels: {sorted(y_dept_train.unique())}")
print(f"   Test Labels:  {sorted(y_dept_test.unique())}")

# fit LabelEncoder
le_dept_rf = LabelEncoder()
le_dept_rf.fit(df_rf_dept['label'])  
y_dept_train_enc = le_dept_rf.transform(y_dept_train)
y_dept_test_enc = le_dept_rf.transform(y_dept_test)

# train department Random Forest
rf_dept = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_dept.fit(X_dept_train, y_dept_train_enc)

# Predictions and Accuracy
y_dept_pred = rf_dept.predict(X_dept_test)
dept_accuracy = accuracy_score(y_dept_test_enc, y_dept_pred)

print(f"\n{'DEPARTMENT TEST ACCURACY:':<30} {dept_accuracy:.4f} ({dept_accuracy*100:.2f}%)")

# Classification Report
print("\nClassification Report (Department):")
test_labels = sorted(set(y_dept_test_enc))
test_names = [le_dept_rf.classes_[i] for i in test_labels]
print(classification_report(y_dept_test_enc, y_dept_pred,
                          labels=test_labels,
                          target_names=test_names,
                          zero_division=0))

# Summary
print("\nSummary\n")
print(f"Seniority Test Accuracy:   {sen_accuracy*100:.2f}%")
print(f"Department Test Accuracy:  {dept_accuracy*100:.2f}%")


FEATURE ENGINEERING & RANDOM FOREST


Feature Set: 24 Features
   Seniority: 213 Jobs von 183 Personen
   Department: 213 Jobs von 183 Personen
SENIORITY MODEL


Person-based Train-Test-Split:
   Train: 165 Jobs von 146 Personen
   Test:  48 Jobs von 37 Personen

   Train Labels: [np.str_('Director'), np.str_('Junior'), np.str_('Lead'), np.str_('Management'), np.str_('Senior')]
   Test Labels:  [np.str_('Director'), np.str_('Junior'), np.str_('Lead'), np.str_('Management'), np.str_('Senior')]

SENIORITY TEST ACCURACY:       0.3958 (39.58%)

Classification Report (Seniority):
              precision    recall  f1-score   support

    Director       0.00      0.00      0.00         2
      Junior       0.50      0.12      0.20         8
        Lead       1.00      0.08      0.15        12
  Management       0.43      0.75      0.55        12
      Senior       0.36      0.57      0.44        14

    accuracy                           0.40        48
   macro avg       0.46      0.31     

## 11 OPTUNA HYPERPARAMETER TUNING


In [12]:
print("OPTUNA HYPERPARAMETER TUNING\n")

optuna.logging.set_verbosity(optuna.logging.WARNING)

def create_rf_objective(X_train, y_train, X_val, y_val):
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_categorical('n_estimators', [200, 400, 600, 800]),
            'max_depth': trial.suggest_int('max_depth', 8, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 8),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            'bootstrap': True,
            'random_state': 42,
            'n_jobs': -1
        }
        
        rf = RandomForestClassifier(**params)
        rf.fit(X_train, y_train)
        
        # Prediction on Validation Set
        y_pred = rf.predict(X_val)
        f1 = f1_score(y_val, y_pred, average='macro', zero_division=0)
        
        return f1
    
    return objective

# Seniority Tuning
print("SENIORITY HYPERPARAMETER OPTIMIZATION\n")
print(f"   Train: {len(X_sen_train)} Samples")
print(f"   Test:  {len(X_sen_test)} Samples")

study_sen = optuna.create_study(direction='maximize')
study_sen.optimize(
    create_rf_objective(X_sen_train, y_sen_train_enc, X_sen_test, y_sen_test_enc), 
    n_trials=30, 
    show_progress_bar=True
)

print(f"   Best Params:")
for key, value in study_sen.best_params.items():
    print(f"      {key}: {value}")

# Train finale optimized Seniority Model
best_params_sen = study_sen.best_params.copy()
best_params_sen['random_state'] = 42
best_params_sen['n_jobs'] = -1
best_params_sen['bootstrap'] = True

rf_sen_final = RandomForestClassifier(**best_params_sen)
rf_sen_final.fit(X_sen_train, y_sen_train_enc)

# Test Set Evaluation
y_sen_final_pred = rf_sen_final.predict(X_sen_test)
sen_final_accuracy = accuracy_score(y_sen_test_enc, y_sen_final_pred)
sen_final_f1 = f1_score(y_sen_test_enc, y_sen_final_pred, average='macro', zero_division=0)

print(f"\n{'OPTIMIZED SENIORITY TEST ACCURACY:':<35} {sen_final_accuracy:.4f} ({sen_final_accuracy*100:.2f}%)")
print(f"{'OPTIMIZED SENIORITY TEST F1-MACRO:':<35} {sen_final_f1:.4f}")

print("\nClassification Report (Optimized Seniority):")
test_labels_sen = sorted(set(y_sen_test_enc))
test_names_sen = [le_sen_rf.classes_[i] for i in test_labels_sen]
print(classification_report(y_sen_test_enc, y_sen_final_pred, 
                          labels=test_labels_sen,
                          target_names=test_names_sen, 
                          zero_division=0))

# Department Tuning
print("DEPARTMENT HYPERPARAMETER OPTIMIZATION\n")
print(f"   Train: {len(X_dept_train)} Samples")
print(f"   Test:  {len(X_dept_test)} Samples")

study_dept = optuna.create_study(direction='maximize')
study_dept.optimize(
    create_rf_objective(X_dept_train, y_dept_train_enc, X_dept_test, y_dept_test_enc), 
    n_trials=30, 
    show_progress_bar=True
)


print(f"   Best Params:")
for key, value in study_dept.best_params.items():
    print(f"      {key}: {value}")

# Train finale optimized Department Modell
best_params_dept = study_dept.best_params.copy()
best_params_dept['random_state'] = 42
best_params_dept['n_jobs'] = -1
best_params_dept['bootstrap'] = True

rf_dept_final = RandomForestClassifier(**best_params_dept)
rf_dept_final.fit(X_dept_train, y_dept_train_enc)

# Test Set Evaluation
y_dept_final_pred = rf_dept_final.predict(X_dept_test)
dept_final_accuracy = accuracy_score(y_dept_test_enc, y_dept_final_pred)
dept_final_f1 = f1_score(y_dept_test_enc, y_dept_final_pred, average='macro', zero_division=0)

print(f"\n{'OPTIMIZED DEPARTMENT TEST ACCURACY:':<35} {dept_final_accuracy:.4f} ({dept_final_accuracy*100:.2f}%)")
print(f"{'OPTIMIZED DEPARTMENT TEST F1-MACRO:':<35} {dept_final_f1:.4f}")

print("\nClassification Report (Optimized Department):")
test_labels_dept = sorted(set(y_dept_test_enc))
test_names_dept = [le_dept_rf.classes_[i] for i in test_labels_dept]
print(classification_report(y_dept_test_enc, y_dept_final_pred,
                          labels=test_labels_dept,
                          target_names=test_names_dept,
                          zero_division=0))

# Final Summary

print("\nBASELINE vs. OPTIMIZED")
print(f"{'Modell':<20} {'Baseline Accuracy':<20} {'Optimized Accuracy':<20}")
print(f"{'Seniority':<20} {sen_accuracy*100:>17.2f}%  {sen_final_accuracy*100:>18.2f}%")
print(f"{'Department':<20} {dept_accuracy*100:>17.2f}%  {dept_final_accuracy*100:>18.2f}%")


OPTUNA HYPERPARAMETER TUNING

SENIORITY HYPERPARAMETER OPTIMIZATION

   Train: 165 Samples
   Test:  48 Samples


  0%|          | 0/30 [00:00<?, ?it/s]

   Best Params:
      n_estimators: 200
      max_depth: 17
      min_samples_split: 4
      min_samples_leaf: 1
      max_features: log2

OPTIMIZED SENIORITY TEST ACCURACY:  0.3958 (39.58%)
OPTIMIZED SENIORITY TEST F1-MACRO:  0.2336

Classification Report (Optimized Seniority):
              precision    recall  f1-score   support

    Director       0.00      0.00      0.00         2
      Junior       0.00      0.00      0.00         8
        Lead       1.00      0.08      0.15        12
  Management       0.39      0.75      0.51        12
      Senior       0.41      0.64      0.50        14

    accuracy                           0.40        48
   macro avg       0.36      0.30      0.23        48
weighted avg       0.47      0.40      0.31        48

DEPARTMENT HYPERPARAMETER OPTIMIZATION

   Train: 165 Samples
   Test:  48 Samples


  0%|          | 0/30 [00:00<?, ?it/s]

   Best Params:
      n_estimators: 800
      max_depth: 14
      min_samples_split: 9
      min_samples_leaf: 1
      max_features: log2

OPTIMIZED DEPARTMENT TEST ACCURACY: 0.3333 (33.33%)
OPTIMIZED DEPARTMENT TEST F1-MACRO: 0.0906

Classification Report (Optimized Department):
                        precision    recall  f1-score   support

        Administrative       0.20      0.40      0.27         5
  Business Development       0.00      0.00      0.00         1
            Consulting       0.00      0.00      0.00         9
       Human Resources       0.00      0.00      0.00         3
Information Technology       0.45      0.70      0.55        20
             Marketing       0.00      0.00      0.00         1
    Project Management       0.00      0.00      0.00         4
            Purchasing       0.00      0.00      0.00         1
                 Sales       0.00      0.00      0.00         4

              accuracy                           0.33        48
             

## 12 HYBRID PREDICTOR (TF-IDF + RF)
1. TF-IDF Prediction:  TF-IDF predictet Label + Confidence
2.  Random Forest Prediction: RF predictet Label + Confidence

3. Hybrid Combination
3a) IF TF-IDF-Conf >= 0.85:
      ‚Üí Return TF-IDF-Label
  
3b) IF RF-Conf >= 0.70:
      ‚Üí Return RF-Label
  
3c) ELSE:
    Seniority: ‚Üí Return Label with higher Confidence
    Department: IF both Confidences < 0.6: Department Fallback (department of previous job, if a previous job exists)
                IF one Conficence > 0.6 or no previous job -> Return label with higher confidence
      

In [13]:
print("HYBRID PREDICTOR - TF-IDF + RANDOM FOREST\n")

# Hybrid Configuration
@dataclass
class HybridConfig:
    base_hi: float = 0.85
    rf_hi: float = 0.70
    dept_fallback: bool = True

hybrid_cfg = HybridConfig()

print("\nHybrid Configuration")
print(f"  TF-IDF High Confidence:  {hybrid_cfg.base_hi}")
print(f"  RF High Confidence:      {hybrid_cfg.rf_hi}")
print(f"  Department Fallback:     {hybrid_cfg.dept_fallback}")

# Hybrid Predictor Class
class HybridPredictorTFIDF:
    """
    Hybrid Model: TF-IDF + Random Forest
    TF-IDF f√ºr Text-Robustheit, RF f√ºr Job-Historie Kontext
    """
    def __init__(self, tfidf_model, rf_model, le_rf, is_department, cfg):
        self.tfidf_model = tfidf_model
        self.rf_model = rf_model
        self.le_rf = le_rf
        self.is_department = is_department
        self.cfg = cfg
        self.rf_expected_features = list(self.rf_model.feature_names_in_)
    
    def predict(self, person_jobs, target_job_idx=0):
        job = person_jobs[target_job_idx]
        text = str(job.get("position", "")).strip()
        
        # 1. TF-IDF Prediction
        tfidf_pred, tfidf_conf = predict_with_confidence(self.tfidf_model, text)
        
        # 2. Random Forest Prediction
        features = extract_job_history_features(person_jobs, target_job_idx)
        feature_df = pd.DataFrame([features])
        
        for feat in self.rf_expected_features:
            if feat not in feature_df.columns:
                feature_df[feat] = 0
        
        feature_vector = feature_df[self.rf_expected_features].fillna(0)
        
        rf_pred_idx = self.rf_model.predict(feature_vector)[0]
        rf_probs = self.rf_model.predict_proba(feature_vector)[0]
        rf_conf = rf_probs.max()
        rf_pred = self.le_rf.inverse_transform([rf_pred_idx])[0]
        
        # 3. Combination Logic
        if tfidf_conf >= self.cfg.base_hi:
            return tfidf_pred
        if rf_conf >= self.cfg.rf_hi:
            return rf_pred
        
        # 4. Department Fallback (previous Department)
        if self.is_department and self.cfg.dept_fallback:
            if tfidf_conf < 0.6 and rf_conf < 0.6:
                if len(person_jobs) > target_job_idx + 1:
                    prev_dept = person_jobs[target_job_idx + 1].get("department")
                    if prev_dept:
                        return prev_dept
        
        # 5. Else: Higher Confidence
        return tfidf_pred if tfidf_conf >= rf_conf else rf_pred

# Create Hybrid Predictors
hybrid_sen = HybridPredictorTFIDF(
    tfidf_model=sen_TFIDF,
    rf_model=rf_sen_final,
    le_rf=le_sen_rf,
    is_department=False,
    cfg=hybrid_cfg
)

hybrid_dept = HybridPredictorTFIDF(
    tfidf_model=dept_TFIDF,
    rf_model=rf_dept_final,
    le_rf=le_dept_rf,
    is_department=True,
    cfg=hybrid_cfg
)




HYBRID PREDICTOR - TF-IDF + RANDOM FOREST


Hybrid Configuration
  TF-IDF High Confidence:  0.85
  RF High Confidence:      0.7
  Department Fallback:     True


## 13 LOAD ANNOTATED DATA & FINAL EVALUATION

In [14]:
print("FINALE EVALUATION - LINKEDIN ANNOTATED DATA\n")

# Load Annotated Data
with open('linkedin-cvs-annotated.json', 'r', encoding='utf-8') as f:
    annotated = json.load(f)


total_jobs = sum(len(person) for person in annotated)
active_jobs = sum(1 for person in annotated for job in person if job.get('status') == 'ACTIVE')
print(f"   Total jobs: {total_jobs}")
print(f"   Active jobs: {active_jobs}")

# Generate Predictions
print("\nGeneriere Predictions auf Annotated Data...")

predictions_sen = []
predictions_dept = []
true_sen = []
true_dept = []

for person_idx, person_jobs in enumerate(annotated):
    if person_idx % 100 == 0 and person_idx > 0:
        print(f"Progress: {person_idx}/{len(annotated)}")
    
    for job_idx, job in enumerate(person_jobs):
        if job.get('status') == 'ACTIVE':
            if job.get('seniority'):
                pred_sen = hybrid_sen.predict(person_jobs, job_idx)
                predictions_sen.append(pred_sen)
                true_sen.append(job['seniority'])
            
            if job.get('department'):
                pred_dept = hybrid_dept.predict(person_jobs, job_idx)
                predictions_dept.append(pred_dept)
                true_dept.append(job['department'])


print(f"   Seniority: {len(predictions_sen)} jobs")
print(f"   Department: {len(predictions_dept)} jobs")



FINALE EVALUATION - LINKEDIN ANNOTATED DATA

   Total jobs: 2638
   Active jobs: 623

Generiere Predictions auf Annotated Data...
Progress: 100/609
Progress: 200/609
Progress: 300/609
Progress: 400/609
Progress: 500/609
Progress: 600/609
   Seniority: 623 jobs
   Department: 623 jobs


## 14 RESULTS 

In [15]:
print("EVALUATION RESULTS\n")

# Seniority Evaluation
print("\nSENIORITY - EVALUATION")
acc_sen = accuracy_score(true_sen, predictions_sen)
print(f"  Accuracy: {acc_sen:.4f}")
print("\n" + classification_report(true_sen, predictions_sen))


# Department Evaluation
print("\nDEPARTMENT - EVALUATION")
acc_dept = accuracy_score(true_dept, predictions_dept)
print(f"  Accuracy: {acc_dept:.4f}")
print("\n" + classification_report(true_dept, predictions_dept))


EVALUATION RESULTS


SENIORITY - EVALUATION
  Accuracy: 0.4671

              precision    recall  f1-score   support

    Director       0.58      0.91      0.71        34
      Junior       0.18      0.25      0.21        12
        Lead       0.77      0.52      0.62       125
  Management       0.51      0.84      0.63       192
Professional       0.00      0.00      0.00       216
      Senior       0.20      0.68      0.31        44

    accuracy                           0.47       623
   macro avg       0.37      0.53      0.41       623
weighted avg       0.36      0.47      0.38       623


DEPARTMENT - EVALUATION
  Accuracy: 0.5714

                        precision    recall  f1-score   support

        Administrative       0.21      0.43      0.29        14
  Business Development       0.27      0.35      0.30        20
            Consulting       0.40      0.44      0.41        39
      Customer Support       0.38      0.50      0.43         6
       Human Resources     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 15 Save Model

In [16]:
import pickle
import joblib
import os

# SAVE HYBRID MODEL FOR DEPLOYMENT

print("SAVE HYBRID MODEL")

# Create models directory
os.makedirs('hybrid_model', exist_ok=True)

# 1. SAVE TF-IDF MODELS
joblib.dump(sen_TFIDF, 'hybrid_model/tfidf_seniority.pkl')
joblib.dump(dept_TFIDF, 'hybrid_model/tfidf_department.pkl')
print(" TF-IDF Modelle gespeichert")

# 2. SAVE RANDOM FOREST MODELS + LABEL ENCODERS
joblib.dump(rf_sen_final, 'hybrid_model/rf_seniority.pkl')
joblib.dump(le_sen_rf, 'hybrid_model/le_seniority_rf.pkl')

joblib.dump(rf_dept_final, 'hybrid_model/rf_department.pkl')
joblib.dump(le_dept_rf, 'hybrid_model/le_department_rf.pkl')
print(" Random Forest Modelle und Label Encoder gespeichert")

# 3. SAVE CONFIGURATION
model_config = {
    'hybrid_config': {
        'base_hi': hybrid_cfg.base_hi,
        'rf_hi': hybrid_cfg.rf_hi,
        'dept_fallback': hybrid_cfg.dept_fallback
    },
    'feature_cols': FEATURE_COLS,
    'model_info': {
        'seniority_classes': list(le_sen_rf.classes_),
        'department_classes': list(le_dept_rf.classes_),
        'rf_sen_params': rf_sen_final.get_params(),
        'rf_dept_params': rf_dept_final.get_params()
    }
}

with open('hybrid_model/config.pkl', 'wb') as f:
    pickle.dump(model_config, f)
print(" Konfiguration gespeichert")

# 4. SAVE HELPER FUNCTIONS AS PYTHON MODULE
helper_code = '''"""
Helper Functions f√ºr Hybrid Model Prediction
Generiert automatisch beim Training
"""

import pandas as pd
import numpy as np
from datetime import datetime

def normalize(text):
    """Text-Normalisierung: Umlaute, Gendering, Sonderzeichen"""
    import re
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""
    text = str(text).lower()
    text = text.replace("√§","ae").replace("√∂","oe").replace("√º","ue").replace("√ü","ss")
    text = re.sub(r"(innen|in)\\b", "", text)
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    text = re.sub(r"\\s+", " ", text).strip()
    return text

def predict_with_confidence(model, text):
    """TF-IDF Prediction mit Confidence Score"""
    text_norm = normalize(text)
    proba = model.predict_proba([text_norm])[0]
    pred_idx = int(np.argmax(proba))
    pred_label = str(model.classes_[pred_idx])
    confidence = float(proba[pred_idx])
    return pred_label, confidence

def calculate_months_between(start_date, end_date):
    """Berechnet Monate zwischen zwei Daten"""
    try:
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date) if end_date else pd.to_datetime(datetime.now())
        return float((end - start).days / 30)
    except:
        return 0.0

def extract_job_history_features(person_jobs, target_job_idx=0):
    """Extrahiert 24 Features aus Job-Historie"""
    target_job = person_jobs[target_job_idx]
    
    features = {
        'total_jobs': len(person_jobs),
        'job_number': target_job_idx + 1,
        'previous_seniority_junior': 0,
        'previous_seniority_professional': 0,
        'previous_seniority_senior': 0,
        'previous_seniority_lead': 0,
        'previous_seniority_management': 0,
        'previous_seniority_director': 0,
        'previous_dept_administrative': 0,
        'previous_dept_business_dev': 0,
        'previous_dept_consulting': 0,
        'previous_dept_customer_support': 0,
        'previous_dept_hr': 0,
        'previous_dept_it': 0,
        'previous_dept_marketing': 0,
        'previous_dept_other': 0,
        'previous_dept_project_mgmt': 0,
        'previous_dept_purchasing': 0,
        'previous_dept_sales': 0,
        'same_department_as_previous': 0,
        'months_in_current_job': 0.0,
        'avg_job_duration': 0.0,
        'seniority_increases': 0,
        'department_changes': 0,
    }
    
    # Previous Job Features
    if len(person_jobs) > target_job_idx + 1:
        prev_job = person_jobs[target_job_idx + 1]
        prev_sen = prev_job.get('seniority')
        
        if prev_sen == 'Junior': features['previous_seniority_junior'] = 1
        elif prev_sen == 'Professional': features['previous_seniority_professional'] = 1
        elif prev_sen == 'Senior': features['previous_seniority_senior'] = 1
        elif prev_sen == 'Lead': features['previous_seniority_lead'] = 1
        elif prev_sen == 'Management': features['previous_seniority_management'] = 1
        elif prev_sen == 'Director': features['previous_seniority_director'] = 1
        
        prev_dept = prev_job.get('department')
        dept_map = {
            'Administrative': 'previous_dept_administrative',
            'Business Development': 'previous_dept_business_dev',
            'Consulting': 'previous_dept_consulting',
            'Customer Support': 'previous_dept_customer_support',
            'Human Resources': 'previous_dept_hr',
            'Information Technology': 'previous_dept_it',
            'Marketing': 'previous_dept_marketing',
            'Other': 'previous_dept_other',
            'Project Management': 'previous_dept_project_mgmt',
            'Purchasing': 'previous_dept_purchasing',
            'Sales': 'previous_dept_sales',
        }
        if prev_dept in dept_map:
            features[dept_map[prev_dept]] = 1
        
        if prev_dept and target_job.get('department') and prev_dept == target_job.get('department'):
            features['same_department_as_previous'] = 1
    
    # Time Features
    features['months_in_current_job'] = calculate_months_between(
        target_job.get('startDate'), target_job.get('endDate')
    )
    
    durations = []
    for job in person_jobs:
        dur = calculate_months_between(job.get('startDate'), job.get('endDate'))
        if dur > 0:
            durations.append(dur)
    if durations:
        features['avg_job_duration'] = float(np.mean(durations))
    
    # Progression Features
    seniority_order = {
        'Junior': 1, 'Professional': 2, 'Senior': 3, 
        'Lead': 4, 'Management': 5, 'Director': 6
    }
    for i in range(len(person_jobs) - 1, 0, -1):
        older_sen = person_jobs[i].get('seniority')
        newer_sen = person_jobs[i-1].get('seniority')
        if older_sen and newer_sen:
            if seniority_order.get(newer_sen, 0) > seniority_order.get(older_sen, 0):
                features['seniority_increases'] += 1
    
    for i in range(len(person_jobs) - 1):
        if person_jobs[i].get('department') != person_jobs[i+1].get('department'):
            features['department_changes'] += 1
    
    return features
'''

with open('hybrid_model/model_helpers.py', 'w', encoding='utf-8') as f:
    f.write(helper_code)
print(" Helper Functions gespeichert")

# 5. CREATE HYBRID PREDICTOR CLASS FILE
predictor_code = '''"""
Hybrid Predictor Class f√ºr Streamlit Deployment
"""

import pandas as pd
import numpy as np
import pickle
import joblib
import sys
import os
from typing import Dict, List, Any

# Fix imports - add current directory to path
sys.path.insert(0, os.path.dirname(__file__))

# Now import helpers
from model_helpers import normalize, predict_with_confidence, extract_job_history_features

class HybridPredictor:
    """
    Hybrid Model: TF-IDF + Random Forest
    """
    
    def __init__(self, model_dir='hybrid_model'):
        """L√§dt alle Modell-Komponenten"""
        
        # Make paths absolute
        if not os.path.isabs(model_dir):
            model_dir = os.path.join(os.getcwd(), model_dir)
        
        print(f"Lade Modelle aus: {model_dir}")
        
        # Load TF-IDF Models
        self.tfidf_sen = joblib.load(f'{model_dir}/tfidf_seniority.pkl')
        self.tfidf_dept = joblib.load(f'{model_dir}/tfidf_department.pkl')
        
        # Load Random Forest Models
        self.rf_sen = joblib.load(f'{model_dir}/rf_seniority.pkl')
        self.rf_dept = joblib.load(f'{model_dir}/rf_department.pkl')
        
        # Load Label Encoders
        self.le_sen_rf = joblib.load(f'{model_dir}/le_seniority_rf.pkl')
        self.le_dept_rf = joblib.load(f'{model_dir}/le_department_rf.pkl')
        
        # Load Config
        with open(f'{model_dir}/config.pkl', 'rb') as f:
            config = pickle.load(f)
            self.hybrid_cfg = config['hybrid_config']
            self.feature_cols = config['feature_cols']
        
        # Expected Features
        self.rf_sen_features = list(self.rf_sen.feature_names_in_)
        self.rf_dept_features = list(self.rf_dept.feature_names_in_)
        
        print(f" Modelle geladen")
        print(f"   Seniority Classes: {list(self.le_sen_rf.classes_)}")
        print(f"   Department Classes: {list(self.le_dept_rf.classes_)}")
    
    def predict_seniority(self, person_jobs: List[Dict], target_job_idx: int = 0) -> Dict[str, Any]:
        """Predict Seniority f√ºr einen Job"""
        job = person_jobs[target_job_idx]
        text = str(job.get("position", "")).strip()
        
        # TF-IDF Prediction
        tfidf_pred, tfidf_conf = predict_with_confidence(self.tfidf_sen, text)
        
        # Random Forest Prediction
        features = extract_job_history_features(person_jobs, target_job_idx)
        feature_df = pd.DataFrame([features])
        
        # Ensure all required features exist
        for feat in self.rf_sen_features:
            if feat not in feature_df.columns:
                feature_df[feat] = 0
        
        feature_vector = feature_df[self.rf_sen_features].fillna(0)
        rf_pred_idx = self.rf_sen.predict(feature_vector)[0]
        rf_probs = self.rf_sen.predict_proba(feature_vector)[0]
        rf_conf = rf_probs.max()
        rf_pred = self.le_sen_rf.inverse_transform([rf_pred_idx])[0]
        
        # Combination Logic
        if tfidf_conf >= self.hybrid_cfg['base_hi']:
            return {'label': tfidf_pred, 'confidence': tfidf_conf, 'source': 'tfidf'}
        if rf_conf >= self.hybrid_cfg['rf_hi']:
            return {'label': rf_pred, 'confidence': rf_conf, 'source': 'rf'}
        
        # Else: Higher confidence wins
        if tfidf_conf >= rf_conf:
            return {'label': tfidf_pred, 'confidence': tfidf_conf, 'source': 'tfidf_fallback'}
        else:
            return {'label': rf_pred, 'confidence': rf_conf, 'source': 'rf_fallback'}
    
    def predict_department(self, person_jobs: List[Dict], target_job_idx: int = 0) -> Dict[str, Any]:
        """Predict Department f√ºr einen Job"""
        job = person_jobs[target_job_idx]
        text = str(job.get("position", "")).strip()
        
        # TF-IDF Prediction
        tfidf_pred, tfidf_conf = predict_with_confidence(self.tfidf_dept, text)
        
        # Random Forest Prediction
        features = extract_job_history_features(person_jobs, target_job_idx)
        feature_df = pd.DataFrame([features])
        
        # Ensure all required features exist
        for feat in self.rf_dept_features:
            if feat not in feature_df.columns:
                feature_df[feat] = 0
        
        feature_vector = feature_df[self.rf_dept_features].fillna(0)
        rf_pred_idx = self.rf_dept.predict(feature_vector)[0]
        rf_probs = self.rf_dept.predict_proba(feature_vector)[0]
        rf_conf = rf_probs.max()
        rf_pred = self.le_dept_rf.inverse_transform([rf_pred_idx])[0]
        
        # Combination Logic
        if tfidf_conf >= self.hybrid_cfg['base_hi']:
            return {'label': tfidf_pred, 'confidence': tfidf_conf, 'source': 'tfidf'}
        if rf_conf >= self.hybrid_cfg['rf_hi']:
            return {'label': rf_pred, 'confidence': rf_conf, 'source': 'rf'}
        
        # Department Fallback: Use previous job if both confidences are low
        if self.hybrid_cfg['dept_fallback'] and tfidf_conf < 0.6 and rf_conf < 0.6:
            if len(person_jobs) > target_job_idx + 1:
                prev_dept = person_jobs[target_job_idx + 1].get("department")
                if prev_dept:
                    return {'label': prev_dept, 'confidence': 0.5, 'source': 'previous_job'}
        
        # Else: Higher confidence wins
        if tfidf_conf >= rf_conf:
            return {'label': tfidf_pred, 'confidence': tfidf_conf, 'source': 'tfidf_fallback'}
        else:
            return {'label': rf_pred, 'confidence': rf_conf, 'source': 'rf_fallback'}
    
    def predict(self, person_jobs: List[Dict], target_job_idx: int = 0) -> Dict[str, Any]:
        """Predict beide Tasks f√ºr einen Job"""
        sen_result = self.predict_seniority(person_jobs, target_job_idx)
        dept_result = self.predict_department(person_jobs, target_job_idx)
        
        return {
            'seniority': sen_result,
            'department': dept_result
        }
'''

with open('hybrid_model/hybrid_predictor.py', 'w', encoding='utf-8') as f:
    f.write(predictor_code)
print(" Hybrid Predictor Class gespeichert")

# 6. CREATE __init__.py (MAKES IT A PROPER PACKAGE)
init_code = '''"""
Hybrid Model Package f√ºr LinkedIn Job Classification
"""

from .hybrid_predictor import HybridPredictor

__all__ = ['HybridPredictor']
__version__ = '1.0.0'
'''

with open('hybrid_model/__init__.py', 'w', encoding='utf-8') as f:
    f.write(init_code)
print(" __init__.py erstellt (Package)")


print("ALLE MODELL-KOMPONENTEN GESPEICHERT")

print(f"\nGespeicherte Dateien in 'hybrid_model/':")
print("   - tfidf_seniority.pkl")
print("   - tfidf_department.pkl")
print("   - rf_seniority.pkl")
print("   - rf_department.pkl")
print("   - le_seniority_rf.pkl")
print("   - le_department_rf.pkl")
print("   - config.pkl")
print("   - model_helpers.py")
print("   - hybrid_predictor.py")
print("   - __init__.py")

SAVE HYBRID MODEL
 TF-IDF Modelle gespeichert
 Random Forest Modelle und Label Encoder gespeichert
 Konfiguration gespeichert
 Helper Functions gespeichert
 Hybrid Predictor Class gespeichert
 __init__.py erstellt (Package)
ALLE MODELL-KOMPONENTEN GESPEICHERT

Gespeicherte Dateien in 'hybrid_model/':
   - tfidf_seniority.pkl
   - tfidf_department.pkl
   - rf_seniority.pkl
   - rf_department.pkl
   - le_seniority_rf.pkl
   - le_department_rf.pkl
   - config.pkl
   - model_helpers.py
   - hybrid_predictor.py
   - __init__.py
