# Approach 4 (Programmatic Labeling)

#### Load Data

In [201]:
import json
import pandas as pd
import re
from collections import Counter, defaultdict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
import Levenshtein
from sklearn.metrics import classification_report, accuracy_score

# Load JSON
with open("linkedin-cvs-annotated.json", "r", encoding="utf-8") as f:
    annotated_cvs = json.load(f)

with open("linkedin-cvs-not-annotated.json", "r", encoding="utf-8") as f:
    not_annotated_cvs = json.load(f)

# Load CVs
df_dept = pd.read_csv("department-v2.csv")
df_sen  = pd.read_csv("seniority-v2.csv")


#### Text Normalization

In [202]:
def normalize(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()

    # Normalize umlauts
    text = (
        text.replace("ä","ae")
            .replace("ö","oe")
            .replace("ü","ue")
            .replace("ß","ss")
    )

    # Remove gendered forms: managerin -> manager
    text = re.sub(r"(in|innen|in$)", "", text)

    # Remove special characters
    text = re.sub(r"[^a-z0-9 ]", " ", text)

    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


#### DB-Lookup

In [203]:
# Apply text normalization to job titles in the lookup tables
df_dept["text_clean"] = df_dept["text"].apply(normalize)
df_sen["text_clean"]  = df_sen["text"].apply(normalize)

# Build lookup dictionaries
dept_lookup = dict(zip(df_dept["text_clean"], df_dept["label"]))
sen_lookup  = dict(zip(df_sen["text_clean"], df_sen["label"]))



#### Seniority and Department Keywords

In [204]:
# Define a set of stopwords to exclude non-informative words
stopwords = set(["and","of","for","und","der","die","das","in","mit","to","de","la","le","des","et","en","as"])

def build_keyword_dict(df):
    # Collect words per label
    label_words = defaultdict(list)

    for _, row in df.iterrows():
        label = row["label"]
        text = normalize(row["text"])
        words = text.split()

        for w in words:
            # Keep only meaningful words (min length 3, not a stopword)
            if len(w) >= 3 and w not in stopwords:
                label_words[label].append(w)

    keyword_dict = {}
    for label, words in label_words.items():
        # Count word frequencies and keep the most common ones per label
        counts = Counter(words)
        keyword_dict[label] = dict(counts.most_common(30))

    return keyword_dict

# Build keyword dictionaries for seniority and department
sen_keywords  = build_keyword_dict(df_sen)
dept_keywords = build_keyword_dict(df_dept)


#### Abbreviations for Seniority and Department

In [205]:

seniority_abbr = {
    "jr": "Junior",
    "jnr": "Junior",
    "lead": "Lead",
    "ld": "Lead",
    "dir": "Director",
    "dr": "Director",
    "mgr": "Management",
    "mgmt": "Management",
    "manag": "Management",
    "head": "Management",
    "sr": "Senior",
    "snr": "Senior",
    "chief": "Lead",
    "vp": "Director"
}


department_abbr = {
    "mkt": "Marketing",
    "mktg": "Marketing",
    "sales": "Sales",
    "sls": "Sales",
    "it": "Information Technology",
    "bd": "Business Development",
    "pm": "Project Management",
    "cons": "Consulting",
    "cnsltg": "Consulting",
    "admin": "Administrative",
    "adm": "Administrative",
    "other": "Other",
    "purch": "Purchasing",
    "prch": "Purchasing",
    "cs": "Customer Support",
    "cust sup": "Customer Support",
    "cust support": "Customer Support",
    "hr": "Human Resources",
    "fin": "Finance",
    "ops": "Operations",
    "r&d": "Research & Development"
}


#### Length Heuristic

In [206]:
sen_stats = df_sen.copy()
# Calculate the word count for each job title
sen_stats["words"] = sen_stats["text"].apply(lambda x: len(str(x).split()))
# Compute the average word count per seniority label
sen_threshold = sen_stats.groupby("label")["words"].mean()

def length_based_seniority(position):
    words = len(position.split())
    if words <= 3:
        return "Junior"
    elif words >= 6:
        return "Senior"
    return None


#### C-Level Abbreviations

In [207]:
c_level_abbr = {
    "CEO": "Chief Executive Officer",
    "CFO": "Chief Financial Officer",
    "COO": "Chief Operating Officer",
    "CTO": "Chief Technology Officer",
    "CMO": "Chief Marketing Officer",
    "CIO": "Chief Information Officer",
    "CHRO": "Chief Human Resources Officer",
    "CDO": "Chief Digital Officer / Chief Data Officer",
    "CRO": "Chief Revenue Officer",
    "CSO": "Chief Sales Officer / Chief Strategy Officer",
    "CAO": "Chief Administrative Officer",
    "CCO": "Chief Commercial Officer / Chief Compliance Officer",
    "CLO": "Chief Legal Officer / Chief Learning Officer",
    "CPO": "Chief Product Officer / Chief People Officer",
    "EVP": "Executive Vice President",
    "SVP": "Senior Vice President",
    "VP": "Vice President",
    "AVP": "Assistant / Associate Vice President"
}


#### Levenshtein Distance (for Typographical Errors)

In [208]:
def edit_distance_label(pos, keywords_dict, max_distance=2):
    pos_norm = normalize(pos)
    words = pos_norm.split()  

    for word in words: 
        for label, kws in keywords_dict.items():
            for kw in kws:
                if Levenshtein.distance(word, kw) <= max_distance:
                    return label  

    return None  

#### Labeling Function with Majority Voting

In [209]:
def vote(labels):
    labels = [l for l in labels if l is not None]
    if len(labels) == 0:
        return None
    return Counter(labels).most_common(1)[0][0]


def label_position(position):
    # Normalize the job title
    pos = normalize(position)

    # Lookup (hard priority)
    sen_lookup_match  = sen_lookup.get(pos)
    dept_lookup_match = dept_lookup.get(pos)

  
    # Case A: both lookups match
  
    if sen_lookup_match is not None and dept_lookup_match is not None:
        return sen_lookup_match, dept_lookup_match

   
    # Case B: only seniority lookup matches
 
    elif sen_lookup_match is not None and dept_lookup_match is None:
        final_sen = sen_lookup_match

        dept_votes = []

        # Keyword matching for department
        for label, kws in dept_keywords.items():
            for kw in kws:
                if kw in pos:
                    dept_votes.append(label)

        # Fuzzy matching for department
        lev_dept = edit_distance_label(pos, dept_keywords, max_distance=2)
        if lev_dept:
            dept_votes.append(lev_dept)

        # Abbreviation matching for department
        for abbr, lab in department_abbr.items():
            if abbr.lower() in pos.lower():
                dept_votes.append(lab)

        # Majority voting for department
        final_dept = vote(dept_votes)
        if final_dept is None:
            final_dept = "Other"

        return final_sen, final_dept

 
    # Case C: only department lookup matches
   
    elif sen_lookup_match is None and dept_lookup_match is not None:
        final_dept = dept_lookup_match

        sen_votes = []

        # Keyword matching for seniority
        for label, kws in sen_keywords.items():
            for kw in kws:
                if kw in pos:
                    sen_votes.append(label)

        # Fuzzy matching for seniority
        lev_sen = edit_distance_label(pos, sen_keywords, max_distance=1)
        if lev_sen:
            sen_votes.append(lev_sen)

        # Abbreviation matching for seniority
        for abbr, lab in seniority_abbr.items():
            if abbr.lower() in pos.lower():
                sen_votes.append(lab)

        # C-level title detection (forces Management)
        for abbr, lab in c_level_abbr.items():
            if abbr.lower() in pos.lower() or lab.lower() in pos.lower():
                sen_votes.append("Management")

        # Length-based fallback for seniority
        if len(sen_votes) == 0:
            sen_votes.append(length_based_seniority(pos))

        # Majority voting for seniority
        final_sen = vote(sen_votes)

        return final_sen, final_dept


    # Case D: no lookup matches
  
    else:
        sen_votes  = []
        dept_votes = []

        # Keyword matching
        for label, kws in sen_keywords.items():
            for kw in kws:
                if kw in pos:
                    sen_votes.append(label)

        for label, kws in dept_keywords.items():
            for kw in kws:
                if kw in pos:
                    dept_votes.append(label)

        # Fuzzy matching
        lev_sen = edit_distance_label(pos, sen_keywords, max_distance=1)
        if lev_sen:
            sen_votes.append(lev_sen)

        lev_dept = edit_distance_label(pos, dept_keywords, max_distance=2)
        if lev_dept:
            dept_votes.append(lev_dept)

        # Abbreviation matching
        for abbr, lab in seniority_abbr.items():
            if abbr.lower() in pos.lower():
                sen_votes.append(lab)

        for abbr, lab in department_abbr.items():
            if abbr.lower() in pos.lower():
                dept_votes.append(lab)

        # C-level title detection (forces Management)
        for abbr, lab in c_level_abbr.items():
            if abbr.lower() in pos.lower() or lab.lower() in pos.lower():
                sen_votes.append("Management")

        # Length-based fallback for seniority
        if len(sen_votes) == 0:
            sen_votes.append(length_based_seniority(pos))

        # Majority voting
        final_sen  = vote(sen_votes)
        final_dept = vote(dept_votes)
        if final_dept is None:
            final_dept = "Other"

        return final_sen, final_dept


#### Automatically Label JSON (not-annotated)

In [210]:
# Prepare training data for the model
X = [] # Features / input texts (job titles)
y_sen = [] #Target variable: Seniority
y_dept = [] # Target variable: Department

# Iterate through each LinkedIn CV in the non-annotated file
# Only current positions are considered, since we only want to predict current seniority and department
for cv in not_annotated_cvs:
    for job in cv:
        if job["status"] == "ACTIVE":
            sen_label, dept_label = label_position(job["position"])
            job["pred_seniority"] = sen_label
            job["pred_department"] = dept_label

            X.append(normalize(job["position"]))
            y_sen.append(sen_label)
            y_dept.append(dept_label)

print("Training Examples:", len(X))


Training Examples: 419


#### Train ML models

In [211]:
# Clean data: only complete labels
X_clean = []
y_sen_clean = []
y_dept_clean = []

# Remove training examples where Seniority or Department is missing
for x, y_sen_item, y_dept_item in zip(X, y_sen, y_dept):
    if y_sen_item is not None and y_dept_item is not None:
        X_clean.append(x)
        y_sen_clean.append(y_sen_item)
        y_dept_clean.append(y_dept_item)

print(f"Filtered training examples: {len(X_clean)}")
if len(X_clean) == 0:
    raise ValueError("No training data left after filtering None labels.")


# 1) Seniority Classification ---------------------------------------------------------------------------------------------------------------------------------------------------------


# Train test split
X_train_sen, X_test_sen, y_train_sen, y_test_sen = train_test_split(
    X_clean, y_sen_clean, test_size=0.3, random_state=42
)

# Convert job titles to numerical features
vec_sen = TfidfVectorizer(ngram_range=(1, 2))
X_train_vec_sen = vec_sen.fit_transform(X_train_sen)
X_test_vec_sen  = vec_sen.transform(X_test_sen)

# Train logistic regression classifier
clf_sen = LogisticRegression(max_iter=1000)
clf_sen.fit(X_train_vec_sen, y_train_sen)

# In-sample predictions
train_preds_sen = clf_sen.predict(X_train_vec_sen)
train_acc_sen = accuracy_score(y_train_sen, train_preds_sen)
print("Seniority In-sample Accuracy (Training):", train_acc_sen)
print("Seniority In-sample Classification Report:\n", classification_report(y_train_sen, train_preds_sen))

# Out-of-sample predictions
test_preds_sen = clf_sen.predict(X_test_vec_sen)
test_acc_sen = accuracy_score(y_test_sen, test_preds_sen)
print("Seniority Out-of-sample Accuracy (Test):", test_acc_sen)
print("Seniority Out-of-sample Classification Report:\n", classification_report(y_test_sen, test_preds_sen))



# 2) Department Classification ------------------------------------------------------------------------------------------------------------------------------------------------------------

# Train test split
X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(
    X_clean, y_dept_clean, test_size=0.3, random_state=42
)

# Convert job titles to numerical features
vec_dept = TfidfVectorizer(ngram_range=(1, 2))
X_train_vec_dept = vec_dept.fit_transform(X_train_dept)
X_test_vec_dept  = vec_dept.transform(X_test_dept)

# Train logistic regression classfier
clf_dept = LogisticRegression(max_iter=1000)
clf_dept.fit(X_train_vec_dept, y_train_dept)

# In-sample predictions
train_preds_dept = clf_dept.predict(X_train_vec_dept)
train_acc_dept = accuracy_score(y_train_dept, train_preds_dept)
print("\n Department In-sample Accuracy (Training):", train_acc_dept)
print("Department In-sample Classification Report:\n", classification_report(y_train_dept, train_preds_dept))

# Out-of-sample predictions
test_preds_dept = clf_dept.predict(X_test_vec_dept)
test_acc_dept = accuracy_score(y_test_dept, test_preds_dept)
print("Department Out-of-sample Accuracy (Test):", test_acc_dept)
print("Department Out-of-sample Classification Report:\n", classification_report(y_test_dept, test_preds_dept))


Filtered training examples: 410
Seniority In-sample Accuracy (Training): 0.9024390243902439
Seniority In-sample Classification Report:
               precision    recall  f1-score   support

    Director       1.00      0.60      0.75        25
      Junior       0.79      1.00      0.88        87
        Lead       1.00      0.26      0.41        23
  Management       0.99      1.00      0.99        83
      Senior       0.94      0.99      0.96        69

    accuracy                           0.90       287
   macro avg       0.94      0.77      0.80       287
weighted avg       0.92      0.90      0.89       287

Seniority Out-of-sample Accuracy (Test): 0.7804878048780488
Seniority Out-of-sample Classification Report:
               precision    recall  f1-score   support

    Director       0.67      0.67      0.67         9
      Junior       0.67      0.86      0.76        43
        Lead       1.00      0.25      0.40        16
  Management       1.00      0.83      0.91       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


#### Final Evaluation with Annotated JSON File

In [212]:
# Extract all active jobs from the annotated JSON
annotated_jobs = []
for cv in annotated_cvs:
    for job in cv:
        if job["status"] == "ACTIVE":
            annotated_jobs.append(job)

df_val = pd.DataFrame(annotated_jobs)

# Normalize texts
df_val["position_clean"] = df_val["position"].apply(normalize)


# The TF-IDF vectors are reused to map test/validation data to the same feature space, so that the already trained model can make predictions
X_val_vec_sen  = vec_sen.transform(df_val["position_clean"])
X_val_vec_dept = vec_dept.transform(df_val["position_clean"])

# Generate predictions
val_preds_sen  = clf_sen.predict(X_val_vec_sen)
val_preds_dept = clf_dept.predict(X_val_vec_dept)

# True labels from the annotated JSON
y_val_true_sen  = df_val["seniority"]
y_val_true_dept = df_val["department"]

# Calculate accuracy
acc_sen  = accuracy_score(y_val_true_sen, val_preds_sen)
acc_dept = accuracy_score(y_val_true_dept, val_preds_dept)


print("SnapAddy Validation Accuracy")
print(f"Seniority Accuracy: {acc_sen:.2%}")
print(f"Department Accuracy: {acc_dept:.2%}\n")

print("Seniority Classification Report:\n", classification_report(y_val_true_sen, val_preds_sen))
print("Department Classification Report:\n", classification_report(y_val_true_dept, val_preds_dept))


SnapAddy Validation Accuracy
Seniority Accuracy: 31.78%
Department Accuracy: 38.68%

Seniority Classification Report:
               precision    recall  f1-score   support

    Director       0.53      0.68      0.60        34
      Junior       0.04      1.00      0.08        12
        Lead       1.00      0.05      0.09       125
  Management       0.98      0.62      0.76       192
Professional       0.00      0.00      0.00       216
      Senior       0.23      0.86      0.37        44

    accuracy                           0.32       623
   macro avg       0.46      0.53      0.32       623
weighted avg       0.55      0.32      0.31       623

Department Classification Report:
                         precision    recall  f1-score   support

        Administrative       0.00      0.00      0.00        14
  Business Development       0.41      0.45      0.43        20
            Consulting       0.00      0.00      0.00        39
      Customer Support       0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


#### Error Analysis

In [213]:
def analyze_errors(true_labels, pred_labels, positions, title="Error Analysis"):
    errors = defaultdict(list)
    for i, (true, pred) in enumerate(zip(true_labels, pred_labels)):
        if true != pred:
            errors[true].append((positions[i], pred))

    print(f"\n{title}")
    for label, instances in errors.items():
        print(f"\nLabel '{label}' was incorrectly predicted {len(instances)} times:")
        for pos, pred in instances:
            print(f"  Position: '{pos}' → Predicted: '{pred}'")

# Error analysis for Seniority
analyze_errors(y_val_true_sen, val_preds_sen, df_val["position_clean"], title="Seniority Error Analysis")

# Error analysis for Department
analyze_errors(y_val_true_dept, val_preds_dept, df_val["position_clean"], title="Department Error Analysis")



Seniority Error Analysis

Label 'Management' was incorrectly predicted 73 times:
  Position: 'prokurist' → Predicted: 'Junior'
  Position: 'cfo' → Predicted: 'Junior'
  Position: 'prokurist' → Predicted: 'Junior'
  Position: 'cfo' → Predicted: 'Junior'
  Position: 'managg director' → Predicted: 'Director'
  Position: 'managg director' → Predicted: 'Director'
  Position: 'member of the advisory board' → Predicted: 'Junior'
  Position: 'member of the advisory board' → Predicted: 'Junior'
  Position: 'managg director' → Predicted: 'Director'
  Position: 'kurucu ortak' → Predicted: 'Junior'
  Position: 'eigenaar' → Predicted: 'Junior'
  Position: 'managg director' → Predicted: 'Director'
  Position: 'managg director' → Predicted: 'Director'
  Position: 'board member cfo asset management' → Predicted: 'Junior'
  Position: 'arbeitgeber' → Predicted: 'Junior'
  Position: 'kanzler cfo' → Predicted: 'Junior'
  Position: 'member of the board frankfurt school stiftung' → Predicted: 'Junior'
  Po