## Import Required Packages

In [1]:
import csv
import os
import re
from collections import Counter
from typing import Dict, Optional

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## Define Skill Patterns

In [2]:
# Skill vocabulary, keyed by category name. Each value is a list of lowercase
# skill names (single words or multi-word phrases) to look for in job
# descriptions.
#
# Notes for maintainers:
# - Entries are literal names, not regular expressions. Several contain regex
#   metacharacters (e.g. "c++", "vue.js", "node.js"), so they must be escaped
#   before being placed in a pattern.
# - Some skills are listed under more than one category: "sql" (data analysis,
#   databases), "risk management" (finance, project management), and
#   "conflict resolution", "empathy", "active listening" (communication,
#   customer service). find_category returns the first category, in dict
#   order, whose list contains the skill.
SKILL_PATTERNS = {
    "programming": [
        "python",
        "java",
        "c++",
        "javascript",
        "ruby",
        "php",
        "swift",
        "kotlin",
        "scala",
        "typescript",
        "html",
        "css",
        "perl",
        "objectivec",
        "c#",
        "rust",
        "go",
        "r",
        "bash",
    ],
    "design": [
        "photoshop",
        "illustrator",
        "sketch",
        "indesign",
        "adobe xd",
        "ui/ux design",
        "user experience",
        "wireframing",
        "prototyping",
        "graphic design",
        "web design",
    ],
    "data analysis": [
        "sql",
        "excel",
        "tableau",
        "power bi",
        "matplotlib",
        "pandas",
        "numpy",
        "data visualization",
        "data mining",
        "statistics",
        "data cleaning",
        "data wrangling",
        "data modeling",
        "machine learning",
    ],
    "communication": [
        "communication",
        "interpersonal",
        "teamwork",
        "collaboration",
        "leadership",
        "presentation skills",
        "public speaking",
        "written communication",
        "conflict resolution",
        "empathy",
        "active listening",
    ],
    "marketing": [
        "seo",
        "social media",
        "content marketing",
        "google analytics",
        "email marketing",
        "digital marketing",
        "online advertising",
        "branding",
        "market research",
        "copywriting",
        "search engine marketing",
        "inbound marketing",
    ],
    "finance": [
        "financial analysis",
        "accounting",
        "financial modeling",
        "budgeting",
        "risk management",
        "investment analysis",
        "financial reporting",
        "forecasting",
        "valuation",
        "financial planning",
        "taxation",
    ],
    "customer service": [
        "customer support",
        "customer satisfaction",
        "problem solving",
        "conflict resolution",
        "empathy",
        "patience",
        "active listening",
        "relationship management",
        "customer retention",
        "complaint handling",
    ],
    "project management": [
        "project management",
        "agile",
        "scrum",
        "kanban",
        "project planning",
        "risk management",
        "time management",
        "resource management",
        "stakeholder management",
        "budget management",
        "quality management",
    ],
    "human resources": [
        "recruitment",
        "employee relations",
        "training and development",
        "performance management",
        "talent management",
        "human resource management",
        "workforce planning",
        "employee engagement",
        "onboarding",
        "succession planning",
    ],
    "sales": [
        "sales",
        "negotiation",
        "relationship building",
        "cold calling",
        "client acquisition",
        "prospecting",
        "sales presentation",
        "closing deals",
        "account management",
        "sales forecasting",
        "sales strategy",
    ],
    "version control": [
        "git",
        "svn",
        "mercurial",
        "bitbucket",
        "github",
        "gitlab",
        "version control",
    ],
    "cloud": [
        "cloud computing",
        "aws",
        "azure",
        "google cloud",
        "cloud infrastructure",
        "cloud deployment",
        "cloud storage",
    ],
    "devops": [
        "devops",
        "continuous integration",
        "continuous deployment",
        "jenkins",
        "ansible",
        "docker",
        "kubernetes",
        "terraform",
        "monitoring",
        "logging",
    ],
    "iot": [
        "internet of things",
        "iot",
        "mqtt",
        "raspberry pi",
        "arduino",
        "embedded systems",
        "sensor networks",
        "home automation",
    ],
    "frontend": [
        "react",
        "angular",
        "vue.js",
        "ember.js",
        "backbone.js",
        "svelte",
        "next.js",
        "gatsby",
    ],
    "backend": [
        "node.js",
        "express",
        "django",
        "flask",
        "spring",
        "laravel",
        "ruby on rails",
        "serverless",
        "graphql",
        "restful api",
    ],
    "databases": [
        "sql",
        "mysql",
        "mongodb",
        "postgresql",
        "oracle",
        "sqlite",
        "nosql",
        "redis",
        "cassandra",
        "firebase",
    ],
    "ides": [
        "visual studio code",
        "sublime text",
        "atom",
        "vim",
        "emacs",
        "pycharm",
        "intellij idea",
        "eclipse",
        "netbeans",
    ],
}

# English stop words from NLTK, stored as a set so that the membership test in
# preprocess_text is a constant-time lookup.
# NOTE: stopwords.words() needs the NLTK "stopwords" corpus to be available
# locally (e.g. installed beforehand with nltk.download("stopwords")).
STOP_WORDS = set(stopwords.words("english"))

# Directory scanned for job data CSV files (relative to the working directory)
DATA_DIRECTORY = "../data/jobs/"

# Destination CSV file for the extracted skill counts
OUTPUT_FILE = "../data/keywords.csv"

## Skill Patterns Extraction

In [3]:
from typing import List, Tuple


def preprocess_text(text: str) -> List[str]:
    """
    Turns raw text into a list of lowercase word tokens.

    The text is lowercased and split with NLTK's word_tokenize. A token is
    kept only if it is purely alphanumeric (str.isalnum) and is not in
    STOP_WORDS, so punctuation and tokens containing symbols are dropped.

    Args:
    text (str): The input text to preprocess.

    Returns:
    List[str]: The surviving tokens, in their original order.
    """
    kept: List[str] = []
    for word in word_tokenize(text.lower()):
        # Discard punctuation and anything containing non-alphanumeric characters
        if not word.isalnum():
            continue
        # Discard common English filler words
        if word in STOP_WORDS:
            continue
        kept.append(word)
    return kept


def _compile_skill_regex(skill_patterns: Dict[str, List[str]]) -> "re.Pattern":
    """
    Builds one compiled regex that matches any known skill as a whole term.

    Args:
    skill_patterns (Dict[str, List[str]]): Mapping of category name to skill names.

    Returns:
    re.Pattern: Pattern whose matches are exactly the (lowercase) skill names.
    """
    # A skill listed under several categories must be counted once per mention
    unique_skills = {
        skill.lower()
        for skills in skill_patterns.values()
        for skill in skills
        if skill
    }
    if not unique_skills:
        # "(?!)" can never match, so an empty vocabulary yields no skills
        return re.compile(r"(?!)")

    # Longest first so the most specific phrase wins ("sales forecasting" over
    # "sales"); the alphabetical tie-break keeps the pattern deterministic.
    ordered = sorted(unique_skills, key=lambda name: (-len(name), name))

    # re.escape keeps names such as "c++" and "node.js" literal. The
    # lookarounds reject hits inside larger words ("r" in "required",
    # "excel" in "excellent") while still working for names that end in a
    # symbol, where a plain \b would not.
    alternation = "|".join(re.escape(name) for name in ordered)
    return re.compile(rf"(?<!\w)(?:{alternation})(?!\w)")


def extract_skills_from_file(
    file_path: str, skill_patterns: Optional[Dict[str, List[str]]] = None
) -> Tuple[str, Counter]:
    """
    Extracts skills from a CSV file containing job descriptions.

    Each non-null value of the "description" column is lowercased, its
    whitespace is collapsed to single spaces, and every whole-term occurrence
    of a known skill is counted. Matching runs on the raw text rather than on
    stop-word-filtered tokens, so names containing symbols or stop words
    ("c++", "node.js", "training and development") can be found. Matches do
    not overlap: a span of text counts towards the longest skill covering it.

    Args:
    file_path (str): Path to the CSV file. It must contain "title" and
    "description" columns.
    skill_patterns (Optional[Dict[str, List[str]]]): Mapping of category name
    to skill names. Defaults to the module-level SKILL_PATTERNS.

    Returns:
    Tuple[str, Counter]: A tuple containing the job title and a Counter
    object with skills and their frequencies. The job title is the first
    non-null value of the "title" column; if there is none (e.g. a file with
    only a header row) the file name without its extension is used.

    Raises:
    KeyError: If the "title" or "description" column is missing.
    """
    if skill_patterns is None:
        skill_patterns = SKILL_PATTERNS
    # Compile once per file instead of once per description and category
    skill_regex = _compile_skill_regex(skill_patterns)

    df = pd.read_csv(file_path)

    # Positional access (.iloc) instead of the label lookup df["title"][0],
    # which raises KeyError on a frame without a row labelled 0
    titles = df["title"].dropna()
    if titles.empty:
        job_title = os.path.splitext(os.path.basename(file_path))[0]
    else:
        job_title = str(titles.iloc[0])

    all_skills_counter = Counter()
    for description in df["description"].dropna():
        # str() guards against numeric cells; split/join collapses newlines
        # and runs of spaces so multi-word skills still match
        text = " ".join(str(description).lower().split())
        all_skills_counter.update(skill_regex.findall(text))
    return job_title, all_skills_counter


def find_category(skill: str) -> str:
    """
    Looks up which skill category a skill belongs to.

    The skill is lowercased and compared against each category's list in
    SKILL_PATTERNS, in dictionary order; the first category whose list
    contains it is returned.

    Args:
    skill (str): The skill to find the category for.

    Returns:
    str: The category of the skill, or "Other" if no category lists it.
    """
    needle = skill.lower()
    # Lazy generator: stops at the first category that lists the skill
    matching_categories = (
        name for name, members in SKILL_PATTERNS.items() if needle in members
    )
    return next(matching_categories, "Other")


def write_data_to_csv(data: List[Tuple[str, str, str, int]], output_file: str) -> None:
    """
    Writes extracted data to a CSV file.

    The file is created (or overwritten) with a fixed header row followed by
    one row per item in data. It is always encoded as UTF-8, so the result
    does not depend on the platform's default encoding and non-ASCII job
    titles cannot trigger a UnicodeEncodeError.

    Args:
    data (List[Tuple[str, str, str, int]]): List of tuples containing job title,
    skill, category, and count.
    output_file (str): Path to the output CSV file.
    """
    # newline="" lets the csv module control line endings, as its docs require
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Job Title", "Skill", "Category", "Count"])
        csv_writer.writerows(data)

## Extract Skills from Each Job File and Save the Results

In [4]:
from tqdm import tqdm

# Number of most frequent skills to keep per job file
TOP_N_SKILLS = 15

# Initialize an empty list to store CSV data
csv_data = []

# List all files in the specified directory. os.listdir returns entries in
# arbitrary order, so sort them to make the output row order reproducible.
dirs = sorted(os.listdir(DATA_DIRECTORY))

# Select the CSV files up front so the progress bar total counts only files
# that are actually processed
csv_filenames = [filename for filename in dirs if filename.endswith(".csv")]

# Iterate over each CSV file in the directory
for filename in tqdm(csv_filenames, desc="Processing CSV files", unit="file"):
    # Construct the full file path
    file_path = os.path.join(DATA_DIRECTORY, filename)

    # Extract job title and skills from the CSV file
    job_title, skills_counter = extract_skills_from_file(file_path)

    # Keep the most common skills, each with its category and count
    for skill, count in skills_counter.most_common(TOP_N_SKILLS):
        csv_data.append([job_title, skill, find_category(skill), count])

# Create a DataFrame from the CSV data
df = pd.DataFrame(csv_data, columns=["Job Title", "Skill", "Category", "Count"])

# Write the DataFrame to a CSV file
df.to_csv(OUTPUT_FILE, index=False)

# Print a message indicating where the data was saved
print(f"Data saved to {OUTPUT_FILE}")

Processing CSV files: 100%|████████████████████████████████████████████████████████| 142/142 [06:16<00:00,  2.65s/file]

Data saved to ../data/keywords.csv



