# PIPELINE 1: JOB DATA CLEANING (SBERT-READY)




In [None]:
import pandas as pd
from pathlib import Path
import re
import unicodedata

# 1. Setup Paths
# Raw CSVs are read from RAW_DIR; cleaned output goes under PROCESSED_DIR,
# which is created up front (including parents) if it does not exist yet.
RAW_DIR = Path("../data/raw/jobs")
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# 2. Load Data
# This dataset is preferred because it carries the description / tech-stack
# text that SBERT needs.
source_file = RAW_DIR / "sri_lanka_it_jobs_dataset.csv"
print(f"Loading source: {source_file}")

try:
    jobs_raw = pd.read_csv(source_file)
except FileNotFoundError:
    print(" File not found. Please ensure data is in data/raw/jobs/")
    # Demonstration fallback: a one-row placeholder frame so the following
    # cells can still run when the CSV is missing.
    jobs_raw = pd.DataFrame({'Role': ['Demo Job'], 'Tech Stack': ['Python']})
else:
    print(f"Loaded {len(jobs_raw)} rows.")

## STEP 2: STANDARDIZE COLUMNS


In [None]:
def standardize_schema(df):
    """Map a raw jobs DataFrame onto the master schema used by later steps.

    Column headers are stripped of surrounding whitespace and title-cased so
    that headers differing only in case or padding match the same rename
    rule. Known dataset-specific columns are then renamed, and the required
    ``job_title`` / ``description`` columns are created as empty strings
    (with a printed warning) when they are absent.

    Args:
        df: Raw jobs DataFrame. It is left unmodified.

    Returns:
        A new DataFrame with ``job_title`` and ``description`` first,
        followed by all remaining columns in their existing order.
    """
    # Work on a copy: assigning to ``df.columns`` on the argument itself
    # would silently rename the caller's frame.
    df = df.copy()

    # Normalize headers to stripped Title Case, e.g. " tech stack " -> "Tech Stack".
    df.columns = df.columns.str.strip().str.title()

    # Map dataset-specific columns to our "Master Schema".
    # Keys must be written in Title Case because they are matched AFTER the
    # normalization above.
    rename_map = {
        'Role': 'job_title',
        'Tech Stack': 'description',  # Crucial for SBERT!
        'Experience (Years)': 'experience_level',
        # str.title() turns "LKR" into "Lkr". Presumably the raw header is
        # "Salary Range (LKR/Month)" -- TODO confirm against the CSV.
        'Salary Range (Lkr/Month)': 'salary_range',
        # Original spelling kept so any header that really reads "LLR" still maps.
        'Salary Range (Llr/Month)': 'salary_range',
    }

    df = df.rename(columns=rename_map)

    # Ensure required columns exist so downstream cells can index them safely.
    required = ['job_title', 'description']
    for col in required:
        if col not in df.columns:
            print(f"Warning: Missing '{col}'. Creating empty placeholder.")
            df[col] = ""

    # Required columns first, everything else after in existing order.
    return df[required + [c for c in df.columns if c not in required]]

# Bring the raw frame onto the master schema and report the resulting headers.
jobs_df = standardize_schema(jobs_raw)
print("Schema Standardized. Columns:", jobs_df.columns.tolist())

## STEP 3: "GENTLE" TEXT CLEANING
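SBERT is sensitive to special characters, so this step applies only light cleaning: Unicode normalization and trimming of surrounding whitespace.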

In [None]:
def gentle_clean_text(text):
    """Lightly normalize a text cell without altering its wording.

    Args:
        text: Cell value. Anything that is not a ``str`` (e.g. NaN, None,
            numbers) is treated as missing.

    Returns:
        The NFKC-normalized string with leading/trailing whitespace removed,
        or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # NFKC folds compatibility characters (ligatures, full-width forms,
    # non-breaking spaces) like NFKD does, but recomposes accents afterwards.
    # NFKD would leave "é" as "e" + a combining mark, which inflates string
    # length and makes visually identical strings compare unequal.
    text = unicodedata.normalize("NFKC", text)

    return text.strip()

# Clean the two text columns; gentle_clean_text turns non-string cells into "".
jobs_df['job_title'] = jobs_df['job_title'].apply(gentle_clean_text)
jobs_df['description'] = jobs_df['description'].apply(gentle_clean_text)

# Keep only rows whose cleaned title is longer than 2 characters.
jobs_df = jobs_df[jobs_df['job_title'].str.len() > 2].copy()

print(f" Text Cleaned. Valid Jobs: {len(jobs_df)}")
# Guard the preview: .iloc[0] raises IndexError when the filter removed every row.
if jobs_df.empty:
    print("Sample Description: (none - no valid jobs after cleaning)")
else:
    print("Sample Description:", jobs_df['description'].iloc[0][:100])

In [None]:

# Persist the cleaned jobs table (without the index column) to the processed directory.
filename = PROCESSED_DIR / "jobs_cleaned_sbert_ready.csv"
jobs_df.to_csv(filename, index=False)

# Completion banner: output location plus a pointer to the next pipeline stage.
print(
    "="*50,
    f" SUCCESS! Saved to: {filename}",
    "Now run the SBERT Pipeline to generate embeddings.",
    "="*50,
    sep="\n",
)