# Rule-based matching (baseline)

In [None]:
import json
import pandas as pd
import re


# Load CVs (annotated JSON)
with open('linkedin-cvs-annotated.json', 'r', encoding='utf-8') as f:
    all_cvs = json.load(f)

# Load lookup tables (CSV)
df_dept = pd.read_csv('department-v2.csv')   # Columns: text, label
df_sen = pd.read_csv('seniority-v2.csv')     # Columns: text, label

# Convert all jobs into a DataFrame
jobs = []
for cv in all_cvs:
    for job in cv:
        jobs.append(job)
df = pd.DataFrame(jobs)

# Filter only current jobs ("ACTIVE")
active_jobs = df[df['status'] == 'ACTIVE'].copy()

# Text cleaning
def clean_text(s):
    if pd.isnull(s):
        return ""
    s = str(s).lower().strip()
    # Replace umlauts
    s = s.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss")
    # Optional: remove special characters
    s = re.sub(r"[^a-z0-9 ]", "", s)
    # Remove gender suffixes
    s = re.sub(r"(in|innen|in$)", "", s)
    
    return s

active_jobs['position_clean'] = active_jobs['position'].apply(clean_text)
df_dept['text_clean'] = df_dept['text'].apply(clean_text)
df_sen['text_clean'] = df_sen['text'].apply(clean_text)

# Rule-based assignment: Department & Seniority
# Assign Department
dept_dict = dict(zip(df_dept['text_clean'], df_dept['label']))
active_jobs['pred_department'] = active_jobs['position_clean'].map(dept_dict)

# Assign Seniority
sen_dict = dict(zip(df_sen['text_clean'], df_sen['label']))
active_jobs['pred_seniority'] = active_jobs['position_clean'].map(sen_dict)

# Check results
print("Recognized Departments:", active_jobs['pred_department'].notnull().sum())
print("Recognized Seniorities:", active_jobs['pred_seniority'].notnull().sum())
print("Job titles not assigned (examples):")
print(active_jobs[active_jobs['pred_department'].isnull()]['position'].unique()[:10])

# Compare with the true labels (Evaluation)
department_accuracy = (active_jobs['pred_department'] == active_jobs['department']).mean()
seniority_accuracy = (active_jobs['pred_seniority'] == active_jobs['seniority']).mean()
print(f"Department Accuracy: {department_accuracy:.2%}")
print(f"Seniority Accuracy: {seniority_accuracy:.2%}")


Recognized Departments: 42
Recognized Seniorities: 150
Job titles not assigned (examples):
['Prokurist' 'CFO' 'Betriebswirtin' 'Prokuristin' 'Solutions Architect'
 'Medizintechnik Beratung' 'Director expansión de negocio.'
 'Gerente comercial' 'Administrador Unico' 'APL-ansvarig, samordning']
Department Accuracy: 6.42%
Seniority Accuracy: 16.53%


#### Error Analysis

In [2]:
# Show incorrectly assigned Departments
wron_dept = active_jobs[active_jobs['pred_department'] != active_jobs['department']]
print("Examples of incorrect Department assignments:")
print(wron_dept[['position', 'department', 'pred_department']].head(10))

Examples of incorrect Department assignments:
                          position              department pred_department
0                        Prokurist                   Other             NaN
1                              CFO                   Other             NaN
2                   Betriebswirtin                   Other             NaN
3                      Prokuristin                   Other             NaN
4                              CFO                   Other             NaN
6              Solutions Architect  Information Technology             NaN
14         Medizintechnik Beratung              Consulting             NaN
17  Director expansión de negocio.    Business Development             NaN
18               Gerente comercial                   Sales             NaN
19             Administrador Unico          Administrative             NaN


In [3]:
# Show incorrectly assigned Seniorities
wron_sen = active_jobs[active_jobs['pred_seniority'] != active_jobs['seniority']]
print("Examples of incorrect Seniority assignments:")
print(wron_sen[['position', 'seniority', 'pred_seniority']].head(10))


Examples of incorrect Seniority assignments:
                          position     seniority pred_seniority
0                        Prokurist    Management            NaN
1                              CFO    Management            NaN
2                   Betriebswirtin  Professional            NaN
3                      Prokuristin    Management            NaN
4                              CFO    Management            NaN
6              Solutions Architect  Professional            NaN
14         Medizintechnik Beratung  Professional            NaN
17  Director expansión de negocio.      Director            NaN
18               Gerente comercial          Lead            NaN
19             Administrador Unico  Professional            NaN
