# Embedding-based labeling

## 1 Setup and load data

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


# CSV-Dateien laden
df_seniority = pd.read_csv('seniority-v2.csv')
df_department = pd.read_csv('department-v2.csv')

df_department.head(40)

Unnamed: 0,text,label
0,Adjoint directeur communication,Marketing
1,Advisor Strategy and Projects,Project Management
2,Beratung & Projekte,Project Management
3,Beratung & Projektmanagement,Project Management
4,Beratung und Projektmanagement kommunale Partner,Project Management
5,Cadre marketing digital,Marketing
6,Chargé de communication,Marketing
7,Chargé de communication digitale,Marketing
8,Chargé de communication et marketing,Marketing
9,Chargé de Webmarketing SEO/SEA,Marketing


## 2 Prepare Data  
- Remove leading and trailing spaces  
- extract labels  
- Train-Test-Split of CSVs


In [2]:
# Seniority
df_seniority['text_clean'] = df_seniority['text'].str.strip() 

# Unique Labels
seniority_labels = df_seniority['label'].unique().tolist()
print(f"Seniority Labels: {seniority_labels}")

# Department
df_department['text_clean'] = df_department['text'].str.strip() 

# Unique Labels 
department_labels = df_department['label'].unique().tolist()
print(f"Department Labels: {department_labels}")


# Train-Test-Split of CSVs

# Seniority
train_sen, test_sen = train_test_split(
    df_seniority,
    test_size=0.2,
    stratify=df_seniority['label'],  # sorgt dafür, dass alle Labels proportional im Test sind
    random_state=42
)

# Department
train_dept, test_dept = train_test_split(
    df_department,
    test_size=0.2,
    stratify=df_department['label'],  # sorgt dafür, dass alle Labels proportional im Test sind
    random_state=42
)

print(f"Seniority - Train: {len(train_sen)}, Test: {len(test_sen)}")
print(f"Department - Train: {len(train_dept)}, Test: {len(test_dept)}")


Seniority Labels: ['Junior', 'Senior', 'Lead', 'Management', 'Director']
Department Labels: ['Marketing', 'Project Management', 'Administrative', 'Business Development', 'Consulting', 'Human Resources', 'Information Technology', 'Other', 'Purchasing', 'Sales', 'Customer Support']
Seniority - Train: 7542, Test: 1886
Department - Train: 8116, Test: 2029


## 3 Comparison of different Sentence transformers

In [3]:
# models to compare
models_to_compare = {
    'MiniLM-L12': 'paraphrase-multilingual-MiniLM-L12-v2',
    'MPNet': 'paraphrase-multilingual-mpnet-base-v2',
    'DistilUSE': 'distiluse-base-multilingual-cased-v2',
}

# prepare labels
seniority_labels = sorted(train_sen['label'].unique())
department_labels = sorted(train_dept['label'].unique())

# save results
results = []


print("COMPARISON SENTENCE TRANSFORMER MODELS")

for model_name, model_path in models_to_compare.items():

    print(f"\nmodel: {model_name}")

    # Modell laden
    model = SentenceTransformer(model_path)
    
    # Label-Embeddings 
    seniority_label_embeddings = model.encode(seniority_labels)
    department_label_embeddings = model.encode(department_labels)
    
    # Test-Embeddings 
    test_sen_embeddings = model.encode(test_sen['text'].tolist())
    test_dept_embeddings = model.encode(test_dept['text'].tolist())
    
    # SENIORITY PREDICTIONS
    seniority_similarities = cosine_similarity(test_sen_embeddings, seniority_label_embeddings)
    seniority_pred_idx = seniority_similarities.argmax(axis=1)
    predictions_sen = [seniority_labels[i] for i in seniority_pred_idx]
    
    # Accuracy
    acc_sen = accuracy_score(test_sen['label'], predictions_sen)
    
    print(f"\nSENIORITY:")
    print(f"  Accuracy: {acc_sen:.4f}")
    print("\n" + classification_report(test_sen['label'], predictions_sen))
    
    # Similarity-Beispiele (erste 3)
    if model_name == 'MiniLM-L12':  # Details only for the first Model
        print("\n  Similarity-Examples (first 3):")
        similarity_df_sen = pd.DataFrame(seniority_similarities[:3], columns=seniority_labels)
        similarity_df_sen['pred'] = [seniority_labels[i] for i in seniority_pred_idx[:3]]
        print(similarity_df_sen.to_string(index=False))
    
   
    # DEPARTMENT PREDICTIONS
        
    department_similarities = cosine_similarity(test_dept_embeddings, department_label_embeddings)
    department_pred_idx = department_similarities.argmax(axis=1)
    predictions_dept = [department_labels[i] for i in department_pred_idx]
    
    # Accuracy
    acc_dept = accuracy_score(test_dept['label'], predictions_dept)
    
    print(f"\nDEPARTMENT:")
    print(f"  Accuracy: {acc_dept:.4f}")
    print("\n" + classification_report(test_dept['label'], predictions_dept))
    
    # Similarity-Beispiele (erste 3)
    if model_name == 'MiniLM-L12':  # Details only for the first one
        print("\n  Similarity-Examples (first 3):")
        similarity_df_dept = pd.DataFrame(department_similarities[:3], columns=department_labels)
        similarity_df_dept['pred'] = [department_labels[i] for i in department_pred_idx[:3]]
        print(similarity_df_dept.to_string(index=False))
    
    # save results
    results.append({
        'Model': model_name,
        'Seniority Accuracy': acc_sen,
        'Department Accuracy': acc_dept,
        'Average': (acc_sen + acc_dept) / 2
    })


# Comparison Table

print("\nSUMMARY - MODEL COMPARISON")

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Average', ascending=False)
print("\n", results_df.to_string(index=False))

# Best Model
best_model = results_df.iloc[0]
print(f"\n Beste model: {best_model['Model']}")
print(f"   Seniority: {best_model['Seniority Accuracy']:.4f}")
print(f"   Department: {best_model['Department Accuracy']:.4f}")
print(f"   Average: {best_model['Average']:.4f}")



COMPARISON SENTENCE TRANSFORMER MODELS

model: MiniLM-L12

SENIORITY:
  Accuracy: 0.1713

              precision    recall  f1-score   support

    Director       0.26      0.97      0.41       197
      Junior       0.82      0.22      0.35        82
        Lead       0.50      0.01      0.02       709
  Management       0.05      0.34      0.08       151
      Senior       0.98      0.07      0.14       747

    accuracy                           0.17      1886
   macro avg       0.52      0.32      0.20      1886
weighted avg       0.64      0.17      0.13      1886


  Similarity-Examples (first 3):
 Director   Junior     Lead  Management   Senior       pred
 0.752080 0.186454 0.330512    0.742902 0.283225   Director
 0.546716 0.201323 0.306462    0.574979 0.250160 Management
 0.348518 0.101277 0.213715    0.388671 0.038635 Management

DEPARTMENT:
  Accuracy: 0.6304

                        precision    recall  f1-score   support

        Administrative       0.04      0.47      

## 4 Improvement of the Model

### Use best model for seniority and department separately

In [4]:
print("TASK-SPECIFIC MODELS")

# Modelle laden

model_seniority = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

model_department = SentenceTransformer('distiluse-base-multilingual-cased-v2')


# SENIORITY

print("\nSENIORITY PREDICTION")

# prepare labels
seniority_labels = sorted(train_sen['label'].unique())

# Label-Embeddings
seniority_label_embeddings = model_seniority.encode(seniority_labels)

# Test-Embeddings
test_sen_embeddings = model_seniority.encode(test_sen['text'].tolist())

# Predictions
seniority_similarities = cosine_similarity(test_sen_embeddings, seniority_label_embeddings)
seniority_pred_idx = seniority_similarities.argmax(axis=1)
test_sen['pred'] = [seniority_labels[i] for i in seniority_pred_idx]


# Evaluation
print(f"\nAccuracy: {accuracy_score(test_sen['label'], test_sen['pred']):.4f}")
print("\n" + classification_report(test_sen['label'], test_sen['pred']))


# DEPARTMENT

print("\nDEPARTMENT PREDICTION")

# prepare lables
department_labels = sorted(train_dept['label'].unique())

# Label-Embeddings
department_label_embeddings = model_department.encode(department_labels)

# Test-Embeddings
test_dept_embeddings = model_department.encode(test_dept['text'].tolist())

# Predictions
department_similarities = cosine_similarity(test_dept_embeddings, department_label_embeddings)
department_pred_idx = department_similarities.argmax(axis=1)
test_dept['pred'] = [department_labels[i] for i in department_pred_idx]

# Evaluation
print(f"\nAccuracy: {accuracy_score(test_dept['label'], test_dept['pred']):.4f}")
print("\n" + classification_report(test_dept['label'], test_dept['pred']))


# Summary
print("\nSUMMARY")

summary = pd.DataFrame({
    'Task': ['Seniority', 'Department'],
    'Model': ['MiniLM-L12', 'DistilUSE'],
    'Accuracy': [
        accuracy_score(test_sen['label'], test_sen['pred']),
        accuracy_score(test_dept['label'], test_dept['pred'])
    ]
})

print("\n", summary.to_string(index=False))


TASK-SPECIFIC MODELS

SENIORITY PREDICTION

Accuracy: 0.1713

              precision    recall  f1-score   support

    Director       0.26      0.97      0.41       197
      Junior       0.82      0.22      0.35        82
        Lead       0.50      0.01      0.02       709
  Management       0.05      0.34      0.08       151
      Senior       0.98      0.07      0.14       747

    accuracy                           0.17      1886
   macro avg       0.52      0.32      0.20      1886
weighted avg       0.64      0.17      0.13      1886


DEPARTMENT PREDICTION

Accuracy: 0.7201

                        precision    recall  f1-score   support

        Administrative       0.01      0.18      0.03        17
  Business Development       0.58      0.71      0.64       124
            Consulting       0.33      0.94      0.49        33
      Customer Support       0.05      0.43      0.09         7
       Human Resources       0.33      0.17      0.22         6
Information Technology

### Label Augmentaion

In [5]:
print("TASK-SPECIFIC MODELS WITH WEIGHTED LABEL-AUGMENTATION")

# Modelle laden
model_seniority = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
model_department = SentenceTransformer('distiluse-base-multilingual-cased-v2')

print("\nSENIORITY PREDICTION")

# Original Labels
seniority_labels_original = sorted(train_sen['label'].unique())

# Augmented Labels (Kontext)
seniority_labels_augmented = {
    'Director': 'Director A strategic leader who oversees multiple teams or departments. They align technical roadmaps with long-term business objectives, manage high-level stakeholders, and define the organizational culture and operational standards at scale.',
    'Junior': 'Junior An entry-level professional focused on learning core technologies and executing well-defined tasks. They contribute to smaller features under the guidance of mentors, emphasizing code quality and following established best practices.',
    'Lead': 'Lead A technical authority who steers the direction of a team or a specific project. They balance hands-on development with technical decision-making, coordinate cross-functional efforts, and bridge the gap between business requirements and technical implementation.',
    'Management': 'Managment A people-focused leader responsible for team growth, performance, and well-being. They manage resource allocation, facilitate career development, and ensure that the team meets its delivery goals by removing blockers and optimizing workflows.',
    'Senior': 'Senior A highly skilled individual contributor who designs and implements complex systems independently. They possess deep domain expertise, mentor junior peers, and take ownership of end-to-end delivery while ensuring system scalability and performance.'
}

# Label-Embeddings (weighted avg.: 40% Original, 60% Augmented)
seniority_label_embeddings = []

for label in seniority_labels_original:
    # Original Embedding
    original_emb = model_seniority.encode(label)
    
    # Augmented Embedding
    augmented_text = seniority_labels_augmented.get(label, label)
    augmented_emb = model_seniority.encode(augmented_text)
    
    # Gewichteter Durchschnitt
    combined_emb = 0.4 * original_emb + 0.6 * augmented_emb
    seniority_label_embeddings.append(combined_emb)

seniority_label_embeddings = np.array(seniority_label_embeddings)

# Test-Embeddings
test_sen_embeddings = model_seniority.encode(test_sen['text'].tolist())

# Predictions
seniority_similarities = cosine_similarity(test_sen_embeddings, seniority_label_embeddings)
seniority_pred_idx = seniority_similarities.argmax(axis=1)
test_sen['pred'] = [seniority_labels_original[i] for i in seniority_pred_idx]

# Evaluation
print(f"\nAccuracy: {accuracy_score(test_sen['label'], test_sen['pred']):.4f}")
print("\n" + classification_report(test_sen['label'], test_sen['pred']))


# DEPARTMENT WITH WEIGHTED AUGMENTATION

print("\nDEPARTMENT PREDICTION")

# Original Labels
department_labels_original = sorted(train_dept['label'].unique())

# Augmented Labels (Kontext)
department_labels_augmented = {
    'Administrative': 'Focuses on organizational support, office management, and clerical tasks. Responsible for maintaining internal workflows, scheduling, and ensuring the smooth day-to-day operation of the business infrastructure.',
    'Business Development': 'Dedicated to long-term growth by identifying new market opportunities, forming strategic partnerships, and expanding the companys reach. Focuses on the "top of the funnel" and strategic positioning.',
    'Consulting': 'Provides expert advice and professional services to external or internal clients. Focuses on problem-solving, delivering specialized knowledge, and managing client-specific projects to improve performance.',
    'Customer Support': 'Manages the direct relationship with existing customers. Responsible for troubleshooting issues, answering inquiries, and ensuring high levels of customer satisfaction and retention.',
    'Human Resources': 'Oversees the entire employee lifecycle, including recruitment, payroll, talent development, and organizational culture. Ensures compliance with labor laws and focuses on people-related strategies.',
    'Information Technology': 'Manages the technical infrastructure, software development, cybersecurity, and hardware systems. Ensures that the organizations digital tools are functional, secure, and scalable.',
    'Marketing': 'Responsible for brand awareness, lead generation, and market communication. Manages advertising, social media, content creation, and analyzing consumer behavior to drive demand.',
    'Other': 'A general category for specialized or niche roles that do not fit into traditional departments. Includes highly unique functions or cross-disciplinary positions that fall outside standard structures.',
    'Project Management': 'Focuses on the planning, execution, and closing of specific initiatives. Responsible for managing timelines, budgets, and resources to ensure projects are delivered on scope and on time.',
    'Purchasing': 'Manages the procurement of goods and services required for the company. Focuses on supplier negotiations, cost reduction, supply chain efficiency, and inventory management.',
    'Sales': 'Directly responsible for revenue generation by closing deals with customers. Involves prospecting, pitching products or services, and meeting specific sales targets and quotas.'
}

# Label-Embeddings (weighted: 50% Original, 50% Augmented)
department_label_embeddings = []

for label in department_labels_original:
    # Original Embedding
    original_emb = model_department.encode(label)
    
    # Augmented Embedding
    augmented_text = department_labels_augmented.get(label, label)
    augmented_emb = model_department.encode(augmented_text)
    
    # Weighted avg.
    combined_emb = 0.5 * original_emb + 0.5 * augmented_emb
    department_label_embeddings.append(combined_emb)

department_label_embeddings = np.array(department_label_embeddings)

# Test-Embeddings
test_dept_embeddings = model_department.encode(test_dept['text'].tolist())

# Predictions
department_similarities = cosine_similarity(test_dept_embeddings, department_label_embeddings)
department_pred_idx = department_similarities.argmax(axis=1)
test_dept['pred'] = [department_labels_original[i] for i in department_pred_idx]

# Evaluation
print(f"\nAccuracy: {accuracy_score(test_dept['label'], test_dept['pred']):.4f}")
print("\n" + classification_report(test_dept['label'], test_dept['pred']))


TASK-SPECIFIC MODELS WITH WEIGHTED LABEL-AUGMENTATION

SENIORITY PREDICTION

Accuracy: 0.1835

              precision    recall  f1-score   support

    Director       0.20      0.99      0.33       197
      Junior       0.86      0.23      0.37        82
        Lead       0.51      0.03      0.06       709
  Management       0.05      0.25      0.08       151
      Senior       0.89      0.10      0.17       747

    accuracy                           0.18      1886
   macro avg       0.50      0.32      0.20      1886
weighted avg       0.61      0.18      0.15      1886


DEPARTMENT PREDICTION

Accuracy: 0.7462

                        precision    recall  f1-score   support

        Administrative       0.04      0.41      0.07        17
  Business Development       0.60      0.66      0.63       124
            Consulting       0.33      0.97      0.49        33
      Customer Support       0.02      0.14      0.03         7
       Human Resources       0.26      0.83      0.40

## 5 Evaluation on SnapAddy labeled dataset

In [6]:
print("FINAL EVALUATION")

# Load test data
with open('linkedin-cvs-annotated.json', 'r', encoding='utf-8') as f:
    linkedin_data = json.load(f)

active_jobs = []
for person_jobs in linkedin_data:
    for job in person_jobs:
        if job.get('status') == 'ACTIVE':
            active_jobs.append(job)

df_linkedin_evaluation = pd.DataFrame(active_jobs)
df_linkedin_evaluation['position'] = df_linkedin_evaluation['position'].fillna("").astype(str).str.strip()



# SENIORITY PREDICTIONS

print("\nSENIORITY - FINAL EVALUATION")

# LinkedIn-Embeddings
linkedin_sen_embeddings = model_seniority.encode(df_linkedin_evaluation['position'].tolist())

# Predictions
linkedin_sen_similarities = cosine_similarity(linkedin_sen_embeddings, seniority_label_embeddings)
linkedin_sen_pred_idx = linkedin_sen_similarities.argmax(axis=1)
df_linkedin_evaluation['seniority_pred'] = [seniority_labels_original[i] for i in linkedin_sen_pred_idx]

# Evaluation
acc_sen = accuracy_score(df_linkedin_evaluation['seniority'], df_linkedin_evaluation['seniority_pred'])
print(f"Accuracy: {acc_sen:.4f}")
print("\n" + classification_report(df_linkedin_evaluation['seniority'], df_linkedin_evaluation['seniority_pred']))


# DEPARTMENT PREDICTIONS


print("\nDEPARTMENT - FINAL EVALUATION")

# LinkedIn-Embeddings
linkedin_dept_embeddings = model_department.encode(df_linkedin_evaluation['position'].tolist())

# Predictions
linkedin_dept_similarities = cosine_similarity(linkedin_dept_embeddings, department_label_embeddings)
linkedin_dept_pred_idx = linkedin_dept_similarities.argmax(axis=1)
df_linkedin_evaluation['department_pred'] = [department_labels_original[i] for i in linkedin_dept_pred_idx]

# Evaluation
acc_dept = accuracy_score(df_linkedin_evaluation['department'], df_linkedin_evaluation['department_pred'])
print(f"Accuracy: {acc_dept:.4f}")
print("\n" + classification_report(df_linkedin_evaluation['department'], df_linkedin_evaluation['department_pred']))


FINAL EVALUATION

SENIORITY - FINAL EVALUATION
Accuracy: 0.2006

              precision    recall  f1-score   support

    Director       0.12      1.00      0.22        34
      Junior       0.17      0.25      0.20        12
        Lead       0.33      0.10      0.16       125
  Management       0.20      0.23      0.22       192
Professional       0.00      0.00      0.00       216
      Senior       0.48      0.68      0.57        44

    accuracy                           0.20       623
   macro avg       0.22      0.38      0.23       623
weighted avg       0.17      0.20      0.15       623


DEPARTMENT - FINAL EVALUATION


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Accuracy: 0.3082

                        precision    recall  f1-score   support

        Administrative       0.06      0.79      0.10        14
  Business Development       0.29      0.80      0.42        20
            Consulting       0.31      0.74      0.44        39
      Customer Support       0.11      0.50      0.18         6
       Human Resources       0.17      0.44      0.25        16
Information Technology       0.69      0.65      0.67        62
             Marketing       0.37      0.50      0.42        22
                 Other       0.95      0.06      0.11       344
    Project Management       0.60      0.72      0.65        39
            Purchasing       0.43      0.60      0.50        15
                 Sales       0.55      0.39      0.46        46

              accuracy                           0.31       623
             macro avg       0.41      0.56      0.38       623
          weighted avg       0.73      0.31      0.28       623

