In [2]:
import pandas as pd

# Load the job-postings dataset and record its row count for later sampling.
df = pd.read_csv('computing_job_postings.csv')
totalRows = len(df.index)

In [9]:
import re
import random
from collections import defaultdict

# Entity label -> regex that pulls matching terms out of a job description.
#
# The patterns are meant to be used with re.IGNORECASE. In this notebook they
# are applied in dictionary order and every match is deleted from the text
# before the next pattern runs, so when two patterns overlap the earlier
# category wins.
patterns = {
    # "c++" and "c#" end in a non-word character, so they must not be followed
    # by a \b assertion (it would require a word character right after them and
    # "c++ " / "c# " would never match). "sql" is skipped when it is the first
    # half of "sql server" so the Database pattern can claim that term.
    "Programming Language (PL)": r"\b(?:(?:python\d*|java(?:script)?|sql(?!\s*server)|ruby|go|swift|typescript|r|kotlin)\b|c\+\+|c#)",

    # ".net" starts with a non-word character, so it is kept outside the leading
    # \b (which would only let it match when glued to a preceding word such as
    # "asp.net"). "react" accepts the "react.js", "react js", "react-js" and
    # "reactjs" spellings.
    "Framework (FW)": r"(?:\b(?:django|flask|spring|react(?:[.\s-]?js)?|angular|vue|express|fastapi|laravel)|\.net)\b",

    # "database" also swallows the single word that follows it.
    "Database (DB)": r"\b(?:sql\s*server|mysql|postgresql|mongodb|oracle|sqlite|firebase|cassandra|database(?:\s+\w+){0,1})\b",

    # "aws" also swallows up to two following words (e.g. a service name).
    "Cloud Platform (CP)": r"\b(?:aws(?:\s+\w+){0,2}|amazon\s*web\s*services|azure|google\s*cloud|gcp|ibm\s*cloud|digitalocean|heroku)\b",

    "DevOps (DO)": r"\b(?:docker(?:ized)?|kubernetes|jenkins|terraform|ansible|ci/cd|travis\s*ci|circleci)\b",

    "Network & Security": r"\b(?:firewall|vpn|ssl/tls|penetration\s*testing|ids|ips|tcp/ip|zero\s*trust)\b",

    "Data Analysis & Science": r"\b(?:pandas|numpy|scikit-learn|tensorflow|power\s*bi|excel|tableau|matplotlib|data\s*visualisation|data\s*visualization|visualizing\s*data|AI|artificial\s*intelligence|machine\s*learning|ML|NLP|natural\s*language\s*processing|text\s*analytics|language\s*model(?:s|ing)?|transaction\s*management|data\s*transaction(?:s)?|text\s*mining|data\s*mining|statistical\s*modelling|statistical\s*modeling|analytics|data\s*analytics|business\s*analytics|predictive\s*analytics)\b",

    "Software Engineering (SWE)": r"\b(?:software\s*development|design\s*patterns|unit\s*testing|full\s*stack|fullstack|full-stack|software\s*engineer(?:ing)?|junior\s*(?:software\s*)?engineer|senior\s*(?:software\s*)?engineer|staff\s*engineer|principal\s*engineer|software\s*engineer\s*intern(?:ship)?|code\s*optimi(?:s|z)ation|performance\s*tuning|code\s*refactoring|refactor(?:ing)?|microservice(?:s)?|MSA|microservice\s*architecture|containerization|docker|kubernetes|k8s|code\s*review(?:s)?|peer\s*review(?:s)?|eclipse|eclipse\s*ide|hibernate|hibernate\s*orm|jquery|rest(?:ful)?\s*api(?:s)?|rest\s*api(?:s)?|restful\s*web\s*service(?:s)?|api\s*development|web\s*service(?:s)?|object\s*oriented\s*programming|oop|object\s*oriented\s*design|jsp|java\s*server\s*pages|rpc|remote\s*procedure\s*call|j2ee|java\s*ee|jvm|java\s*virtual\s*machine|jax|jax-rs|jax-ws|apache|apache\s*(?:kafka|tomcat|maven|ant|struts|camel|spark|hadoop|flink)|bootstrap|front-end\s*framework|rabbitmq|message\s*(?:queue|broker)|front[\s-]end|back[\s-]end|web\s*design|ui|user\s*interface|ux|user\s*experience|ui\/ux|front[\s-]end\s*development|back[\s-]end\s*development|web\s*development)\b",

    # Plain "scrum" only; "certified scrum master" belongs to the
    # certification pattern below.
    "Project Management (PM)": r"\b(?:agile|jira|trello|asana|kanban|prince2|stakeholder\s*management|(?<!certified\s)scrum(?!\smaster))\b",

    # NOTE(review): under re.IGNORECASE the degree abbreviations (B.E., M.E.,
    # M.S., ...) also match the ordinary lower-case words "be", "me", "ms" --
    # confirm the input has those words removed before this pattern is applied.
    "Education Certification (EC)": r"\bcertified\s+scrum\s+master\b|\bcsm\b|\bpmp\b|\baws\s+certified\b|\bazure\s+certified\b|\bgcp\s+certified\b|\bcissp\b|\bccna\b|\bceh\b|\bcomptia\b|\bcisa\b|\bcism\b|\b(?:bachelor(?:\s+of\s+(?:science|engineering|computer\s+science|information\s+technology|information\s+systems|cybersecurity|data\s+science|software\s+engineering))?(?:\s+in\s+(?:computer\s+science|computer\s+engineering|information\s+technology|information\s+systems|cybersecurity|data\s+science|software\s+engineering|artificial\s+intelligence|machine\s+learning))?|B\.?S\.?|B\.?E\.?|B\.?C\.?S\.?|B\.?Tech\.?|(?<!certified\s)master(?:\s+of\s+(?:science|engineering|computer\s+science|information\s+technology|information\s+systems|cybersecurity|data\s+science|software\s+engineering))?(?:\s+in\s+(?:computer\s+science|computer\s+engineering|information\s+technology|information\s+systems|cybersecurity|data\s+science|software\s+engineering|artificial\s+intelligence|machine\s+learning))?|M\.?S\.?|M\.?E\.?|M\.?C\.?S\.?|M\.?Tech\.?|phd|ph\.?d\.?|doctorate|doctor\s+of\s+philosophy(?:\s+in\s+(?:computer\s+science|computer\s+engineering|information\s+technology|data\s+science))?|specialisation\s+in\s+(?:software|data|cloud|security|networking|ai|machine\s+learning)|specialization\s+in\s+(?:software|data|cloud|security|networking|ai|machine\s+learning)|minor\s+in\s+(?:computer\s+science|information\s+technology|data\s+science|software\s+engineering)|major\s+in\s+(?:computer\s+science|information\s+technology|data\s+science|software\s+engineering))\b",

    "Soft Skills (SS)": r"\b(?:communication|leadership|teamwork|problem\s*solving|critical\s*thinking|adaptability)\b"
}

# Pick one posting uniformly at random. randrange(n) yields 0..n-1, which is
# exactly the valid positional range for iloc; randint(1, n) is inclusive on
# both ends, so it could never select row 0 and raised IndexError whenever it
# drew n.
test_text = df.iloc[random.randrange(totalRows)]['cleaned_description']

# label -> set of lower-cased matched strings. A label only gets an entry once
# one of its matches is found, so labels without hits are not printed.
extracted_entities = defaultdict(set)
modified_text = test_text

# Apply the patterns one after another, in dictionary order. Each pass removes
# its matches from the text, so later patterns only see what earlier ones left.
for label, pattern in patterns.items():
    def replacement_func(match_obj, label=label):
        """Record the matched text under ``label`` and delete it from the text."""
        # group(0) is always the whole match as a single string.
        extracted_entities[label].add(match_obj.group(0).lower())
        return ""

    modified_text = re.sub(pattern, replacement_func, modified_text, flags=re.IGNORECASE)

# Print the extracted entities, one line per label, matches sorted alphabetically
print("=== EXTRACTED ENTITIES ===\n")
for label, entities in extracted_entities.items():
    entities_str = ", ".join(sorted(entities))
    print(f"{label}: {entities_str}")

# Print whatever text no pattern claimed
print("\n=== LEFTOVER TEXT ===\n")
print(modified_text)

=== EXTRACTED ENTITIES ===

Programming Language (PL): typescript
Cloud Platform (CP): aws  reactbachelor degree
Software Engineering (SWE): code review, senior software engineer, software development
Soft Skills (SS): communication

=== LEFTOVER TEXT ===

veho veho experience company unlocks potential everyday consumer brand fully participate building entirely new logistics infrastructure powered latest technology designed modern era shopping veho reinventing shipping part experience know removing pain delivery return veho creating powerful opportunity brand engage build deep loyalty customer like never role  responsible building tooling power driver partner experience veho platform work project inform driver partner claim route get paid veho fairly price effort delivery build apis application integrate data science system empower decision work various team drive architectural solution development including inception design execution delivery following good design coding practice resp