### Cleaning XpressJobs data along with getting some descriptions useful for training.

In [None]:
import pandas as pd
from pathlib import Path
import re
import unicodedata


RAW_DIR = Path("../data/raw/jobs")
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)


source_file = RAW_DIR / "xpressjobs_ALL_CATEGORIES_CLEAN_20260201_195011.csv"
print(f"Loading source: {source_file}")

try:
    df = pd.read_csv(source_file)
    print(f"Loaded {len(df)} rows.")
except FileNotFoundError:
    print(" File not found. Please ensure data is in data/raw/jobs/")

df.head()
df.describe()
df.info()
print(f"Missing values: {df.isnull().sum()}")
   

Loading source: ..\data\raw\jobs\xpressjobs_ALL_CATEGORIES_CLEAN_20260201_195011.csv
Loaded 1433 rows.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1433 entries, 0 to 1432
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         1433 non-null   object
 1   company       1427 non-null   object
 2   location      1269 non-null   object
 3   job_type      1426 non-null   object
 4   days_left     1433 non-null   object
 5   level         1433 non-null   object
 6   description   1433 non-null   object
 7   job_url       1433 non-null   object
 8   category      1433 non-null   object
 9   search_term   1433 non-null   object
 10  scraped_date  1433 non-null   object
dtypes: object(11)
memory usage: 123.3+ KB
Missing values: title             0
company           6
location        164
job_type          7
days_left         0
level             0
description       0
job_url           0
category          0
searc

### Clean XpressJobs Data

In [None]:
def clean_html(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'&[a-zA-Z]+;', ' ', text)
    text = re.sub(r'&#\d+;', ' ', text)
    
    return text

def normalize_text(text):
    """ Clean text """
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = re.sub(r'[^\w\s.,;:()\-+/&]', '', text)
    
    return text

In [None]:
# Testing 
sampleDes = df['description'].iloc[0]
print(sampleDes[:500])
print("\n\nCleaned:")
print(normalize_text(clean_html(sampleDes))[:500])

Original (first 500 chars):
SAVE JOB We are seeking a seasoned Senior Software Engineer with a strong background in full stack development, particularly in Java and React. The ideal candidate will have extensive experience in building scalable web applications and services. While your primary focus will be on Java and React, experience with Ruby is a significant advantage and will allow you to contribute to our diverse technology stack. Key Responsibilities: Design, develop, and maintain complex, scalable web applications 


Cleaned:
SAVE JOB We are seeking a seasoned Senior Software Engineer with a strong background in full stack development, particularly in Java and React. The ideal candidate will have extensive experience in building scalable web applications and services. While your primary focus will be on Java and React, experience with Ruby is a significant advantage and will allow you to contribute to our diverse technology stack. Key Responsibilities: Design, develop, and main

In [None]:
# Experience years
def extract_years_experience(text):
    """Extract years of experience."""
    if pd.isna(text) or not isinstance(text, str):
        return None
    
    patterns = [
        r'(\d+)\+?\s*(?:to|-)\s*(\d+)\s*years?',
        r'(\d+)\+\s*years?',
        r'(\d+)\s*years?',
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text.lower())
        if match:
            return int(match.group(1))
    
    return None
# Extract education level
def extract_education_level(text):
    """Extract education requirements."""
    if pd.isna(text) or not isinstance(text, str):
        return None
    
    text_lower = text.lower()
    
    if any(term in text_lower for term in ['phd', 'ph.d', 'doctorate']):
        return 'PhD'
    elif any(term in text_lower for term in ['master', 'msc', 'mba']):
        return 'Masters'
    elif any(term in text_lower for term in ['bachelor', 'degree', 'bsc']):
        return 'Bachelors'
    elif any(term in text_lower for term in ['diploma', 'hnd']):
        return 'Diploma'
    elif any(term in text_lower for term in ['professional certificate', 'certification']):
        return 'Certification' # Similar to CIMA
    
    return None
    
    


In [112]:
# Clean descriptions
df['description_cleaned'] = df['description'].apply(lambda x: normalize_text(clean_html(x)))
df['description_length'] = df['description_cleaned'].str.len()

print(f"Description length stats:")
df['description_length'].describe()

Description length stats:


count     1433.000000
mean      1516.145848
std       1199.763668
min          8.000000
25%        823.000000
50%       1301.000000
75%       1891.000000
max      12788.000000
Name: description_length, dtype: float64

### ESCO to get Skills extraction

In [113]:
# Load ESCO skills data
def load_esco_skills():
    """Load ESCO skills taxonomy."""
    esco_skills_path = Path("../data/raw/esco/skills_en.csv")
    
    if not esco_skills_path.exists():
        print("ESCO skills file not found. Using basic extraction.")
        return None
    
    esco_skills = pd.read_csv(esco_skills_path)
    # Get skill labels (preferred terms)
    skill_labels = esco_skills['preferredLabel'].str.lower().tolist()
    
    print(f"Loaded {len(skill_labels)} ESCO skills")
    return skill_labels

# Extract skills using ESCO taxonomy
def extract_skills_esco(text, esco_skills):
    """Extract skills from text using ESCO taxonomy."""
    if pd.isna(text) or not isinstance(text, str):
        return []
    
    if esco_skills is None:
        return []
    
    text_lower = text.lower()
    
    # Find ESCO skills mentioned in the text
    found_skills = []
    for skill in esco_skills:
        if skill in text_lower:
            found_skills.append(skill)
    
    return found_skills

# Load ESCO skills once
esco_skills = load_esco_skills()

# Apply extraction
df['extracted_skills'] = df['description_cleaned'].apply(
    lambda x: extract_skills_esco(x, esco_skills)
)
df['skills_count'] = df['extracted_skills'].apply(len)

print(f" Skills extracted from {len(df)} jobs")
print(f"Average skills per job: {df['skills_count'].mean():.1f}")


Loaded 13939 ESCO skills
 Skills extracted from 1433 jobs
Average skills per job: 5.0


In [None]:
# Load ESCO data
print("Loading ESCO occupation data...")
esco_occ = pd.read_csv("../data/raw/esco/occupations_en.csv")
esco_relations = pd.read_csv("../data/raw/esco/occupationSkillRelations_en.csv")
esco_skills_df = pd.read_csv("../data/raw/esco/skills_en.csv")

print(f"Loaded {len(esco_occ)} occupations and {len(esco_relations)} skill relations")

# Map jobs to ESCO occupations 
print("\nMapping jobs to ESCO occupations...")
df['esco_occupation_uri'] = df['title'].apply(
    lambda x: map_job_to_esco_occupation(x, esco_occ)
)

# Get required skills for each occupation
print("Extracting required skills from ESCO...")
df['required_skills_esco'] = df['esco_occupation_uri'].apply(
    lambda x: get_skills_for_occupation(x, esco_relations, esco_skills_df)
)

df['required_skills_count'] = df['required_skills_esco'].apply(len)

# Show mapping stats
print(f"\ Jobs mapped to ESCO: {df['esco_occupation_uri'].notna().sum()} ({df['esco_occupation_uri'].notna().sum()/len(df)*100:.1f}%)")
print(f" Average required skills per job: {df[df['esco_occupation_uri'].notna()]['required_skills_count'].mean():.1f}")

Loading ESCO occupation data...
Loaded 3039 occupations and 129004 skill relations

Mapping jobs to ESCO occupations...
Extracting required skills from ESCO...

✓ Jobs mapped to ESCO: 977 (68.2%)
✓ Average required skills per job: 53.3


In [None]:
def map_job_to_esco_occupation(job_title, esco_occupations):
    """Map job title to ESCO occupation with improved fuzzy matching."""
    if pd.isna(job_title):
        return None
    
    job_lower = job_title.lower()
    
    # Remove common modifiers that don't change core occupation
    modifiers = ['senior', 'junior', 'lead', 'principal', 'assistant', 'associate', 
                 'chief', 'head', 'mid-level', 'entry-level', 'grade ii', 'grade i',
                 'probationary', 'cum', 'male', 'female', '/', '-', '(', ')']
    
    job_core = job_lower
    for mod in modifiers:
        job_core = job_core.replace(mod, ' ')
    
    # Clean up extra spaces
    job_core = ' '.join(job_core.split())
    
    # Define key occupation terms to look for 
    occupation_patterns = {
        'software engineer': ['software engineer', 'software developer', 'developer', 'engineer', 'app lead', 'tech lead', 'full stack', 'backend', 'frontend', 'front end'],
         'mobile application developer': ['android', 'ios', 'mobile app', 'mobile developer'],
        'lecturer': ['lecturer', 'instructor', 'teacher', 'professor'],
        ' business analyst': ['business analyst', 'analyst'],
        'architect': ['architect', 'solution architect'],
        'system administrator': ['system administrator', 'sysadmin', 'system support', 'workspace administrator', 'appsheet specialist','specialist'],
         'data scientist': ['data scientist', 'data analyst', 'ml engineer', 'machine learning'],
        'artificial intelligence engineer': ['ai engineer', 'ai app builder', 'ai developer','head of ai'],
         'project manager': ['project manager', 'programme manager', 'program manager'],
        'sales representative': ['sales executive', 'sales representative', 'sales officer'],
        'marketing specialist': ['marketing executive', 'marketing specialist', 'marketing officer'],
        'database administrator': ['database administrator', 'dba', 'oracle db'],
        ' web developer': ['web developer', 'web designer', 'frontend developer', 'front-end', 'ui/ux designer'],
    }
    
    # pattern matching first
    for esco_pattern, variants in occupation_patterns.items():
        for variant in variants:
            if variant in job_core:
                # Find matching ESCO occupation
                for idx, occ in esco_occupations.iterrows():
                    occ_label = occ['preferredLabel'].lower()
                    if esco_pattern in occ_label or any(v in occ_label for v in variants):
                        return occ['conceptUri']
    
    
    for idx, occ in esco_occupations.iterrows():
        occ_label = occ['preferredLabel'].lower()
        
        # Check both directions
        if occ_label in job_core or job_core in occ_label:
            return occ['conceptUri']
        
        # Word overlap matching (at least 2 significant words)
        job_words = set(job_core.split()) - {'and', 'or', 'of', 'the', 'in', 'a', 'an', 'for', 'to'}
        occ_words = set(occ_label.split()) - {'and', 'or', 'of', 'the', 'in', 'a', 'an', 'for', 'to'}
        
        if len(job_words & occ_words) >= 2:
            return occ['conceptUri']
    
    return None

In [131]:
# Extract structured information
# Note: Skills are already extracted in the previous cell using ESCO
df['years_experience'] = df['description_cleaned'].apply(extract_years_experience)
df['education_required'] = df['description_cleaned'].apply(extract_education_level)

# Display extraction stats
print(f"Records with experience: {df['years_experience'].notna().sum()} ({df['years_experience'].notna().sum()/len(df)*100:.1f}%)")
print(f"Records with education: {df['education_required'].notna().sum()} ({df['education_required'].notna().sum()/len(df)*100:.1f}%)")
print(f"Average skills per job: {df['skills_count'].mean():.1f}")


Records with experience: 910 (64.6%)
Records with education: 635 (45.1%)
Average skills per job: 5.1


In [136]:
# Check mapping success rate
print("="*80)
print("ESCO MAPPING RESULTS")
print("="*80)
print(f"Jobs mapped to ESCO: {df['esco_occupation_uri'].notna().sum()} / {len(df)} ({df['esco_occupation_uri'].notna().sum()/len(df)*100:.1f}%)")
print(f"Average required skills per mapped job: {df[df['esco_occupation_uri'].notna()]['required_skills_count'].mean():.1f}")

# Show examples of successful mappings
print("\n Sample Successful Mappings:")
mapped_jobs = df[df['esco_occupation_uri'].notna()][['title', 'required_skills_count']].head(10)
for idx, row in mapped_jobs.iterrows():
    print(f"  • {row['title']} → {row['required_skills_count']} skills")

# Show examples of unmapped jobs
print("\nSample Unmapped Jobs (need better matching):")
unmapped_jobs = df[df['esco_occupation_uri'].isna()]['title'].head(10)
for title in unmapped_jobs:
    print(f"  • {title}")

ESCO MAPPING RESULTS
Jobs mapped to ESCO: 960 / 1408 (68.2%)
Average required skills per mapped job: 53.2

 Sample Successful Mappings:
  • Senior Software Engineer : Ruby on Rails → 26 skills
  • Senior Lecturer (Grade II) / Lecturer - Department of Electronics & Telecommunications → 72 skills
  • Senior Lecturer - Grade II / Lecturer / Lecturer (Probationary) - Software Engineering → 26 skills
  • Lecturer cum Programme Manager (IT & Academics) → 72 skills
  • Lecturer - Business & IT → 72 skills
  • Assistant Lecturer - IT → 72 skills
  • Software Engineer / ABAP Developer → 26 skills
  • Mid-Level Software Engineer (React) — POS & Retail Systems → 26 skills
  • Software Engineer (Male) → 26 skills
  • Assistant Lecturer - IT → 72 skills

Sample Unmapped Jobs (need better matching):
  • UI/UX Designer
  • Technical Lead (NodeJS & ReactJS)
  • Head of Key Accounts - Fashion
  • Specialist - Network Analytics and Automation
  • Head of AI
  • Urgent Technical Assistant (POS Hardware &

In [137]:
df['text_for_embedding'] = (
    df['title'].fillna('') + ' ' + 
    df['category'].fillna('') + ' ' + 
    df['description_cleaned'].fillna('')
).str.strip()
print(f" Created embedding text (avg length: {df['text_for_embedding'].str.len().mean():.0f} chars)")

 Created embedding text (avg length: 1568 chars)


In [138]:
initial_count = len(df)
df = df.drop_duplicates(subset=['title', 'company'], keep='first')
df = df[df['description_cleaned'].str.len() > 100]
print(f"Filtered: {initial_count} → {len(df)} jobs")

Filtered: 1408 → 1408 jobs


In [139]:
# Save cleaned data
output_file = PROCESSED_DIR / "xpressjobs_cleaned_with_descriptions.csv"
df.to_csv(output_file, index=False)


print(" Data cleaned")

print(f"Output: {output_file}")
print(f"Total jobs: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"\nExtraction Stats:")
print(f"  - Experience: {df['years_experience'].notna().sum()} ({df['years_experience'].notna().sum()/len(df)*100:.1f}%)")
print(f"  - Education: {df['education_required'].notna().sum()} ({df['education_required'].notna().sum()/len(df)*100:.1f}%)")
print(f"  - ESCO mapped: {df['esco_occupation_uri'].notna().sum()} ({df['esco_occupation_uri'].notna().sum()/len(df)*100:.1f}%)")
print(f"  - Avg skills extracted: {df['skills_count'].mean():.1f}")
print(f"  - Avg ESCO skills: {df[df['esco_occupation_uri'].notna()]['required_skills_count'].mean():.1f}")
print(f"  - Avg text length: {df['text_for_embedding'].str.len().mean():.0f} chars")

 Data cleaned
Output: ..\data\processed\xpressjobs_cleaned_with_descriptions.csv
Total jobs: 1408
Columns: ['title', 'company', 'location', 'job_type', 'days_left', 'level', 'description', 'job_url', 'category', 'search_term', 'scraped_date', 'description_cleaned', 'description_length', 'extracted_skills', 'skills_count', 'esco_occupation_uri', 'required_skills_esco', 'required_skills_count', 'years_experience', 'education_required', 'text_for_embedding']

Extraction Stats:
  - Experience: 910 (64.6%)
  - Education: 635 (45.1%)
  - ESCO mapped: 960 (68.2%)
  - Avg skills extracted: 5.1
  - Avg ESCO skills: 53.2
  - Avg text length: 1568 chars
