In [1]:
import re
import os
import csv
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

In [2]:
# Skill vocabulary, grouped by category. Keys are category names; values are
# lists of lowercase skill strings (single words or multi-word phrases).
# A few skills are listed under more than one category ("sql",
# "conflict resolution", "empathy", "active listening", "risk management");
# find_category() returns the first category, in dict order, that lists a skill.
SKILL_PATTERNS = {
    "programming": [
        "python",
        "java",
        "c++",
        "javascript",
        "ruby",
        "php",
        "swift",
        "kotlin",
        "scala",
        "typescript",
        "html",
        "css",
        "perl",
        "objectivec",
        "c#",
        "rust",
        "go",
        "r",
        "bash",
    ],
    "design": [
        "photoshop",
        "illustrator",
        "sketch",
        "indesign",
        "adobe xd",
        "ui/ux design",
        "user experience",
        "wireframing",
        "prototyping",
        "graphic design",
        "web design",
    ],
    "data analysis": [
        "sql",
        "excel",
        "tableau",
        "power bi",
        "matplotlib",
        "pandas",
        "numpy",
        "data visualization",
        "data mining",
        "statistics",
        "data cleaning",
        "data wrangling",
        "data modeling",
        "machine learning",
    ],
    "communication": [
        "communication",
        "interpersonal",
        "teamwork",
        "collaboration",
        "leadership",
        "presentation skills",
        "public speaking",
        "written communication",
        "conflict resolution",
        "empathy",
        "active listening",
    ],
    "marketing": [
        "seo",
        "social media",
        "content marketing",
        "google analytics",
        "email marketing",
        "digital marketing",
        "online advertising",
        "branding",
        "market research",
        "copywriting",
        "search engine marketing",
        "inbound marketing",
    ],
    "finance": [
        "financial analysis",
        "accounting",
        "financial modeling",
        "budgeting",
        "risk management",
        "investment analysis",
        "financial reporting",
        "forecasting",
        "valuation",
        "financial planning",
        "taxation",
    ],
    "customer service": [
        "customer support",
        "customer satisfaction",
        "problem solving",
        "conflict resolution",
        "empathy",
        "patience",
        "active listening",
        "relationship management",
        "customer retention",
        "complaint handling",
    ],
    "project management": [
        "project management",
        "agile",
        "scrum",
        "kanban",
        "project planning",
        "risk management",
        "time management",
        "resource management",
        "stakeholder management",
        "budget management",
        "quality management",
    ],
    "human resources": [
        "recruitment",
        "employee relations",
        "training and development",
        "performance management",
        "talent management",
        "human resource management",
        "workforce planning",
        "employee engagement",
        "onboarding",
        "succession planning",
    ],
    "sales": [
        "sales",
        "negotiation",
        "relationship building",
        "cold calling",
        "client acquisition",
        "prospecting",
        "sales presentation",
        "closing deals",
        "account management",
        "sales forecasting",
        "sales strategy",
    ],
    "version control": [
        "git",
        "svn",
        "mercurial",
        "bitbucket",
        "github",
        "gitlab",
        "version control",
    ],
    "cloud": [
        "cloud computing",
        "aws",
        "azure",
        "google cloud",
        "cloud infrastructure",
        "cloud deployment",
        "cloud storage",
    ],
    "devops": [
        "devops",
        "continuous integration",
        "continuous deployment",
        "jenkins",
        "ansible",
        "docker",
        "kubernetes",
        "terraform",
        "monitoring",
        "logging",
    ],
    "iot": [
        "internet of things",
        "iot",
        "mqtt",
        "raspberry pi",
        "arduino",
        "embedded systems",
        "sensor networks",
        "home automation",
    ],
    "frontend": [
        "react",
        "angular",
        "vue.js",
        "ember.js",
        "backbone.js",
        "svelte",
        "next.js",
        "gatsby",
    ],
    "backend": [
        "node.js",
        "express",
        "django",
        "flask",
        "spring",
        "laravel",
        "ruby on rails",
        "serverless",
        "graphql",
        "restful api",
    ],
    "databases": [
        "sql",
        "mysql",
        "mongodb",
        "postgresql",
        "oracle",
        "sqlite",
        "nosql",
        "redis",
        "cassandra",
        "firebase",
    ],
    "ides": [
        "visual studio code",
        "sublime text",
        "atom",
        "vim",
        "emacs",
        "pycharm",
        "intellij idea",
        "eclipse",
        "netbeans",
    ],
}

# English stop words from the NLTK corpus, held as a set for fast membership tests.
STOP_WORDS = set(stopwords.words("english"))
# Directory that is listed for per-job input ".csv" files.
DATA_DIRECTORY = "../data/jobs/"
# Path of the aggregated keywords CSV written by the processing cell.
OUTPUT_FILE = "../data/keywords.csv"

In [3]:
def preprocess_text(text):
    """Lowercase and tokenize ``text``, keeping only meaningful word tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token is
    kept only if it is purely alphanumeric and is not in ``STOP_WORDS``; any
    token containing a non-alphanumeric character is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    kept = []
    for word in word_tokenize(text.lower()):
        # Skip stop words and anything that is not purely letters/digits.
        if word in STOP_WORDS or not word.isalnum():
            continue
        kept.append(word)
    return kept


def _build_skill_regex(skill_patterns):
    """Compile one regex that matches any listed skill as a whole term.

    Returns ``None`` when ``skill_patterns`` contains no usable skill.
    """
    # Collapse all categories into one de-duplicated vocabulary so a skill that
    # is listed under two categories is only counted once per occurrence.
    skills = set()
    for category_skills in skill_patterns.values():
        for skill in category_skills:
            normalized = " ".join(skill.lower().split())
            if normalized:
                skills.add(normalized)
    if not skills:
        return None

    # Longest first: regex alternation takes the first alternative that
    # matches, so "sales forecasting" must be tried before "sales".
    ordered = sorted(skills, key=lambda name: (-len(name), name))

    # Escape every word (names such as "c++" or "vue.js" contain regex
    # metacharacters) and allow any whitespace run between the words of a phrase.
    alternatives = [
        r"\s+".join(re.escape(word) for word in name.split()) for name in ordered
    ]

    # The lookarounds require a non-word character (or the string edge) on both
    # sides, so "r" is not found inside "developer" nor "java" in "javascript".
    return re.compile(r"(?<!\w)(?:" + "|".join(alternatives) + r")(?!\w)")


def extract_skills_from_file(file_path, skill_patterns=None):
    """Count skill mentions in the job descriptions of one CSV file.

    The file is read with pandas and must provide ``title`` and
    ``description`` columns. Each non-null description is lowercased and
    searched for every skill in ``skill_patterns``. Skills are matched
    literally and only as whole terms, against the original text, so names
    with punctuation ("c++", "node.js") and phrases containing stop words
    ("training and development") can be found.

    Args:
        file_path: Path of the CSV file to read.
        skill_patterns: Optional mapping of category name to a list of skill
            strings. Defaults to the module-level ``SKILL_PATTERNS``.

    Returns:
        tuple: ``(job_title, counter)`` where ``job_title`` is the ``title``
        value of the first row and ``counter`` is a ``collections.Counter``
        keyed by skill name. For a file without data rows the result is
        ``(None, Counter())``.

    Raises:
        KeyError: If a non-empty file lacks the ``title`` or ``description``
            column.
    """
    if skill_patterns is None:
        skill_patterns = SKILL_PATTERNS

    df = pd.read_csv(file_path)
    all_skills_counter = Counter()
    if df.empty:
        # Header-only file: there is no title to read and nothing to count.
        return None, all_skills_counter

    # Positional access, so the lookup does not depend on the index labels.
    job_title = df["title"].iloc[0]

    skill_regex = _build_skill_regex(skill_patterns)
    if skill_regex is None:
        return job_title, all_skills_counter

    for description in df["description"].dropna():
        text = str(description).lower()
        # Collapse whitespace inside a match ("machine\nlearning") so the key
        # is the canonical skill name.
        all_skills_counter.update(
            " ".join(found.split()) for found in skill_regex.findall(text)
        )
    return job_title, all_skills_counter


def find_category(skill):
    """Look up the category that lists ``skill``.

    The comparison uses the lowercased skill. Categories are checked in the
    iteration order of ``SKILL_PATTERNS`` and the first one that contains the
    skill is returned.

    Args:
        skill: Skill name to look up.

    Returns:
        str: The matching category name, or ``"Other"`` when none lists it.
    """
    needle = skill.lower()
    return next(
        (name for name, members in SKILL_PATTERNS.items() if needle in members),
        "Other",
    )


def write_data_to_csv(data, output_file):
    """Write skill rows to ``output_file`` as a UTF-8 encoded CSV.

    A header row ``Job Title, Skill, Category, Count`` is written first,
    followed by every row in ``data``. An existing file is overwritten.

    Args:
        data: Iterable of rows; each row is an iterable of four values in the
            header's column order.
        output_file: Destination path.
    """
    # Explicit UTF-8 keeps the output independent of the platform's locale
    # encoding, so non-ASCII job titles cannot fail to encode.
    # newline="" lets the csv module control line endings itself.
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Job Title", "Skill", "Category", "Count"])
        csv_writer.writerows(data)

In [4]:
from tqdm import tqdm

# How many of the most frequent skills to keep for each job file.
TOP_SKILLS_PER_JOB = 15

csv_data = []
dirs = os.listdir(DATA_DIRECTORY)

# os.listdir returns entries in arbitrary order, so sort to make the output
# row order reproducible. Filtering before the loop also makes the progress
# bar total reflect only the files that are actually processed.
csv_files = sorted(name for name in dirs if name.endswith(".csv"))

for filename in tqdm(csv_files, desc="Processing CSV files", unit="file"):
    file_path = os.path.join(DATA_DIRECTORY, filename)
    job_title, skills_counter = extract_skills_from_file(file_path)
    top_skills = skills_counter.most_common(TOP_SKILLS_PER_JOB)
    for skill, count in top_skills:
        skill_category = find_category(skill)
        csv_data.append([job_title, skill, skill_category, count])

# Build the frame once from the collected rows, then persist it.
df = pd.DataFrame(csv_data, columns=["Job Title", "Skill", "Category", "Count"])
df.to_csv(OUTPUT_FILE, index=False)
print(f"Data saved to {OUTPUT_FILE}")

Processing CSV files: 100%|████████████████████████████████████████████████████████| 142/142 [07:13<00:00,  3.05s/file]


Data saved to ../data/keywords.csv
